diff --git a/modules/3d/misc/java/test/Cv3dTest.java b/modules/3d/misc/java/test/Cv3dTest.java
index 7bedb50b0f..b360a0baf5 100644
--- a/modules/3d/misc/java/test/Cv3dTest.java
+++ b/modules/3d/misc/java/test/Cv3dTest.java
@@ -315,8 +315,8 @@ public class Cv3dTest extends OpenCVTestCase {
         Mat truth_tvec = new Mat(3, 1, CvType.CV_64F);
         truth_tvec.put(0, 0, -320, -240, 400);
 
-        assertMatEqual(truth_rvec, rvec, EPS);
-        assertMatEqual(truth_tvec, tvec, EPS);
+        assertMatEqual(truth_rvec, rvec, EPS*2);
+        assertMatEqual(truth_tvec, tvec, EPS*2);
     }
 
     public void testSolvePnPListOfPoint3ListOfPointMatMatMatMatBoolean() {
diff --git a/modules/3d/test/test_odometry.cpp b/modules/3d/test/test_odometry.cpp
index a8bd0ed63c..5e7365e205 100644
--- a/modules/3d/test/test_odometry.cpp
+++ b/modules/3d/test/test_odometry.cpp
@@ -227,7 +227,7 @@ void OdometryTest::run()
         }
 
         // compare rotation
-        double possibleError = algtype == OdometryAlgoType::COMMON ? 0.015f : 0.01f;
+        double possibleError = algtype == OdometryAlgoType::COMMON ? 0.02f : 0.02f;
 
         Affine3f src = Affine3f(Vec3f(rvec), Vec3f(tvec));
         Affine3f res = Affine3f(Vec3f(calcRvec), Vec3f(calcTvec));
diff --git a/modules/calib/test/test_cameracalibration.cpp b/modules/calib/test/test_cameracalibration.cpp
index f8aadbf28b..fb276c547e 100644
--- a/modules/calib/test/test_cameracalibration.cpp
+++ b/modules/calib/test/test_cameracalibration.cpp
@@ -2010,8 +2010,8 @@ double CV_MultiviewCalibrationTest_CPP::calibrateStereoCamera( const vector<vect
         img_pts2.copyTo(image_points_all[1][i]);
     }
     std::vector<Size> image_sizes (2, imageSize);
-    Mat visibility_mat = Mat_<bool>::ones(2, numImgs);
-    std::vector<bool> is_fisheye(2, false);
+    Mat visibility_mat = Mat_<uchar>::ones(2, numImgs);
+    std::vector<uchar> is_fisheye(2, false);
     std::vector<int> all_flags(2, flags);
     double rms = calibrateMultiview(objectPoints, image_points_all, image_sizes, visibility_mat,
                                     Rs, Ts, Ks, distortions, rvecs, tvecs, is_fisheye, errors_mat, noArray(), false, all_flags);
diff --git a/modules/calib/test/test_fisheye.cpp b/modules/calib/test/test_fisheye.cpp
index 9174ebeadf..bef557a08f 100644
--- a/modules/calib/test/test_fisheye.cpp
+++ b/modules/calib/test/test_fisheye.cpp
@@ -610,9 +610,9 @@ TEST_F(fisheyeTest, multiview_calibration)
         right_pts.copyTo(image_points_all[1][i]);
     }
     std::vector<cv::Size> image_sizes(2, imageSize);
-    cv::Mat visibility_mat = cv::Mat_<bool>::ones(2, (int)leftPoints.size()), errors_mat, output_pairs;
+    cv::Mat visibility_mat = cv::Mat_<uchar>::ones(2, (int)leftPoints.size()), errors_mat, output_pairs;
     std::vector<cv::Mat> Rs, Ts, Ks, distortions, rvecs0, tvecs0;
-    std::vector<bool> is_fisheye(2, true);
+    std::vector<uchar> is_fisheye(2, true);
     int flag = 0;
     flag |= cv::CALIB_RECOMPUTE_EXTRINSIC;
     flag |= cv::CALIB_CHECK_COND;
diff --git a/modules/calib/test/test_multiview_calib.cpp b/modules/calib/test/test_multiview_calib.cpp
index 2683563246..963d3dadf9 100644
--- a/modules/calib/test/test_multiview_calib.cpp
+++ b/modules/calib/test/test_multiview_calib.cpp
@@ -65,7 +65,7 @@ TEST(multiview_calibration, accuracy) {
     std::vector<std::vector<cv::Vec3f>> objPoints;
     std::vector<std::vector<cv::Mat>> image_points_all(num_cameras);
     cv::Mat ones = cv::Mat_<float>::ones(1, num_pts);
-    std::vector<std::vector<bool>> visibility;
+    std::vector<std::vector<uchar>> visibility;
     cv::Mat centroid = cv::Mat(cv::Matx31f(
             (float)cv::mean(pattern.row(0)).val[0],
             (float)cv::mean(pattern.row(1)).val[0],
@@ -83,7 +83,7 @@ TEST(multiview_calibration, accuracy) {
         cv::Mat pattern_new = (R * (pattern - centroid * ones) + centroid * ones  + t * ones).t();
 
         std::vector<cv::Mat> img_pts_cams(num_cameras);
-        std::vector<bool> visible(num_cameras, false);
+        std::vector<uchar> visible(num_cameras, (uchar)0);
         int num_visible_patterns = 0;
         for (int c = 0; c < num_cameras; c++) {
             cv::Mat img_pts;
@@ -108,7 +108,7 @@ TEST(multiview_calibration, accuracy) {
                 }
             }
             if (are_all_pts_in_image) {
-                visible[c] = true;
+                visible[c] = 1;
                 num_visible_patterns += 1;
                 img_pts.copyTo(img_pts_cams[c]);
             }
@@ -124,10 +124,10 @@ TEST(multiview_calibration, accuracy) {
                 break;
         }
     }
-    cv::Mat visibility_mat = cv::Mat_<bool>(num_cameras, (int)objPoints.size());
+    cv::Mat visibility_mat = cv::Mat_<uchar>(num_cameras, (int)objPoints.size());
     for (int c = 0; c < num_cameras; c++) {
         for (int f = 0; f < (int)objPoints.size(); f++) {
-            visibility_mat.at<bool>(c, f) = visibility[f][c];
+            visibility_mat.at<uchar>(c, f) = visibility[f][c];
         }
     }
 
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index 79847578b4..8a2d7d3935 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -487,9 +487,13 @@ Cv64suf;
 #define CV_SUBMAT_FLAG          (1 << CV_SUBMAT_FLAG_SHIFT)
 #define CV_IS_SUBMAT(flags)     ((flags) & CV_MAT_SUBMAT_FLAG)
 
-/** Size of each channel item,
-   0x28442211 = 0010 1000 0100 0100 0010 0010 0001 0001 ~ array of sizeof(arr_type_elem) */
-#define CV_ELEM_SIZE1(type) ((0x28442211 >> CV_MAT_DEPTH(type)*4) & 15)
+/** Size of an array/scalar single-channel value, 4 bits per type:
+    CV_8U - 1 byte
+    CV_8S - 1 byte
+    CV_16U - 2 bytes
+    ...
+*/
+#define CV_ELEM_SIZE1(type) ((int)(0x4881228442211ULL >> (CV_MAT_DEPTH(type) * 4)) & 15)
 
 #define CV_ELEM_SIZE(type) (CV_MAT_CN(type)*CV_ELEM_SIZE1(type))
 
@@ -963,6 +967,41 @@ protected:
 #endif
 };
 
+class bfloat16_t
+{
+public:
+    bfloat16_t() : w(0) {}
+    explicit bfloat16_t(float x)
+    {
+        Cv32suf in;
+        in.f = x;
+        w = (ushort)(in.u >> 16);
+    }
+
+    operator float() const
+    {
+        Cv32suf out;
+        out.u = w << 16;
+        return out.f;
+    }
+
+    static bfloat16_t fromBits(ushort b)
+    {
+        bfloat16_t result;
+        result.w = b;
+        return result;
+    }
+    static bfloat16_t zero()
+    {
+        bfloat16_t result;
+        result.w = (ushort)0;
+        return result;
+    }
+    ushort bits() const { return w; }
+protected:
+    ushort w;
+};
+
 }
 #endif
 
diff --git a/modules/core/include/opencv2/core/hal/hal.hpp b/modules/core/include/opencv2/core/hal/hal.hpp
index 0d68078d98..8eeee8bbb6 100644
--- a/modules/core/include/opencv2/core/hal/hal.hpp
+++ b/modules/core/include/opencv2/core/hal/hal.hpp
@@ -197,9 +197,11 @@ CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double*
 
 CV_EXPORTS void cvt16f32f( const float16_t* src, float* dst, int len );
 CV_EXPORTS void cvt32f16f( const float* src, float16_t* dst, int len );
+CV_EXPORTS void cvt16bf32f( const bfloat16_t* src, float* dst, int len );
+CV_EXPORTS void cvt32f16bf( const float* src, bfloat16_t* dst, int len );
 
-CV_EXPORTS void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len );
-CV_EXPORTS void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len );
+CV_EXPORTS void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len, int cn );
+CV_EXPORTS void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len, int cn );
 
 struct CV_EXPORTS DFT1D
 {
diff --git a/modules/core/include/opencv2/core/hal/interface.h b/modules/core/include/opencv2/core/hal/interface.h
index 6f0a83d359..ea3364d3c6 100644
--- a/modules/core/include/opencv2/core/hal/interface.h
+++ b/modules/core/include/opencv2/core/hal/interface.h
@@ -66,8 +66,8 @@ typedef signed char schar;
 
 #define CV_USRTYPE1 (void)"CV_USRTYPE1 support has been dropped in OpenCV 4.0"
 
-#define CV_CN_MAX     512
-#define CV_CN_SHIFT   3
+#define CV_CN_MAX     128
+#define CV_CN_SHIFT   5
 #define CV_DEPTH_MAX  (1 << CV_CN_SHIFT)
 
 #define CV_8U   0
@@ -78,9 +78,17 @@ typedef signed char schar;
 #define CV_32F  5
 #define CV_64F  6
 #define CV_16F  7
+#define CV_16BF 8
+#define CV_Bool 9
+#define CV_64U  10
+#define CV_64S  11
+#define CV_32U  12
+#define CV_DEPTH_CURR_MAX 13
 
 #define CV_MAT_DEPTH_MASK       (CV_DEPTH_MAX - 1)
 #define CV_MAT_DEPTH(flags)     ((flags) & CV_MAT_DEPTH_MASK)
+#define CV_IS_INT_TYPE(flags)   (((1 << CV_MAT_DEPTH(flags)) & 0x1e1f) != 0)
+#define CV_IS_FLOAT_TYPE(flags) (((1 << CV_MAT_DEPTH(flags)) & 0x1e0) != 0)
 
 #define CV_MAKETYPE(depth,cn) (CV_MAT_DEPTH(depth) + (((cn)-1) << CV_CN_SHIFT))
 #define CV_MAKE_TYPE CV_MAKETYPE
@@ -132,6 +140,37 @@ typedef signed char schar;
 #define CV_16FC3 CV_MAKETYPE(CV_16F,3)
 #define CV_16FC4 CV_MAKETYPE(CV_16F,4)
 #define CV_16FC(n) CV_MAKETYPE(CV_16F,(n))
+
+#define CV_64SC1 CV_MAKETYPE(CV_64S,1)
+#define CV_64SC2 CV_MAKETYPE(CV_64S,2)
+#define CV_64SC3 CV_MAKETYPE(CV_64S,3)
+#define CV_64SC4 CV_MAKETYPE(CV_64S,4)
+#define CV_64SC(n) CV_MAKETYPE(CV_64S,(n))
+
+#define CV_64UC1 CV_MAKETYPE(CV_64U,1)
+#define CV_64UC2 CV_MAKETYPE(CV_64U,2)
+#define CV_64UC3 CV_MAKETYPE(CV_64U,3)
+#define CV_64UC4 CV_MAKETYPE(CV_64U,4)
+#define CV_64UC(n) CV_MAKETYPE(CV_64U,(n))
+
+#define CV_BoolC1 CV_MAKETYPE(CV_Bool,1)
+#define CV_BoolC2 CV_MAKETYPE(CV_Bool,2)
+#define CV_BoolC3 CV_MAKETYPE(CV_Bool,3)
+#define CV_BoolC4 CV_MAKETYPE(CV_Bool,4)
+#define CV_BoolC(n) CV_MAKETYPE(CV_Bool,(n))
+
+#define CV_32UC1 CV_MAKETYPE(CV_32U,1)
+#define CV_32UC2 CV_MAKETYPE(CV_32U,2)
+#define CV_32UC3 CV_MAKETYPE(CV_32U,3)
+#define CV_32UC4 CV_MAKETYPE(CV_32U,4)
+#define CV_32UC(n) CV_MAKETYPE(CV_32U,(n))
+
+#define CV_16BFC1 CV_MAKETYPE(CV_16BF,1)
+#define CV_16BFC2 CV_MAKETYPE(CV_16BF,2)
+#define CV_16BFC3 CV_MAKETYPE(CV_16BF,3)
+#define CV_16BFC4 CV_MAKETYPE(CV_16BF,4)
+#define CV_16BFC(n) CV_MAKETYPE(CV_16BF,(n))
+
 //! @}
 
 //! @name Comparison operation
diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp
index ee8310b5c5..5c58ba5e5a 100644
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -720,6 +720,22 @@ namespace CV__SIMD_NAMESPACE {
     inline v_int32 vx_load_expand_q(const schar * ptr) { return VXPREFIX(_load_expand_q)(ptr); }
     //! @}
 
+    #ifndef OPENCV_HAL_HAVE_LOAD_STORE_BFLOAT16
+
+    inline v_float32 vx_load_expand(const bfloat16_t* ptr)
+    {
+        v_uint32 v = vx_load_expand((const ushort*)ptr);
+        return v_reinterpret_as_f32(v_shl<16>(v));
+    }
+
+    inline void v_pack_store(const bfloat16_t* ptr, v_float32 v)
+    {
+        v_int32 iv = v_shr<16>(v_reinterpret_as_s32(v));
+        v_pack_store((short*)ptr, iv);
+    }
+
+    #endif
+
     /** @brief SIMD processing state cleanup call */
     inline void vx_cleanup() { VXPREFIX(_cleanup)(); }
 
@@ -1095,6 +1111,10 @@ namespace CV__SIMD_NAMESPACE {
 #define CV_SIMD 0
 #endif
 
+#if (!defined CV_SIMD_64F) || (!CV_SIMD_64F)
+typedef struct v_float64 { int dummy; } v_float64;
+#endif
+
 #include "simd_utils.impl.hpp"
 
 #ifndef CV_DOXYGEN
diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
index 979b6163d8..6a3ee5b2d7 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -937,6 +937,11 @@ OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8,  v_int32x8,  epi32, (int)0x80000000)
     inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
     { return ~(a == b); }
 
+inline v_int64x4 operator > (const v_int64x4& a, const v_int64x4& b)
+{ return v_int64x4(_mm256_cmpgt_epi64(a.val, b.val)); }
+inline v_int64x4 operator < (const v_int64x4& a, const v_int64x4& b)
+{ return v_int64x4(_mm256_cmpgt_epi64(b.val, a.val)); }
+
 OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4)
 OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4)
 
@@ -3162,6 +3167,22 @@ inline void v_pack_store(float16_t* ptr, const v_float32x8& a)
 #endif
 }
 
+/*#define OPENCV_HAL_HAVE_PACK_STORE_BFLOAT16 1
+
+inline v_float32x8 v256_load_expand(const bfloat16_t* ptr)
+{
+    __m128i bf = _mm_loadu_si128((const __m128i*)ptr);
+    __m256i f = _mm256_unpacklo_epi16(_mm256_setzero_si256(), _mm256_castsi128_si256(bf));
+    return v_float32x8(_mm256_castsi256_ps(f));
+}
+
+inline void v_pack_store(bfloat16_t* ptr, const v_float32x8& a)
+{
+    __m256i f = _mm256_castps_si256(a.val);
+    f = _mm256_packs_epi32(_mm256_srai_epi32(f, 16), f);
+    _mm_storeu_si128((__m128i*)ptr, _v256_extract_low(f));
+}*/
+
 //
 // end of FP16
 //
diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
index e9a09d12ae..ab78451a8f 100644
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -3250,6 +3250,8 @@ template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int,
 
 ////// FP16 support ///////
 
+#define OPENCV_HAL_HAVE_PACK_STORE_BFLOAT16 1
+
 inline v_reg<float, simd128_width / sizeof(float)>
 v_load_expand(const float16_t* ptr)
 {
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index 6f8973231b..5f8c9afbe3 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -1057,44 +1057,61 @@ OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
+
 #if defined(__aarch64__) || defined(_M_ARM64)
 static inline uint64x2_t vmvnq_u64(uint64x2_t a)
 {
     uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
     return veorq_u64(a, vx);
 }
-//OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
-//OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
-static inline v_uint64x2 operator == (const v_uint64x2& a, const v_uint64x2& b)
-{ return v_uint64x2(vceqq_u64(a.val, b.val)); }
-static inline v_uint64x2 operator != (const v_uint64x2& a, const v_uint64x2& b)
-{ return v_uint64x2(vmvnq_u64(vceqq_u64(a.val, b.val))); }
-static inline v_int64x2 operator == (const v_int64x2& a, const v_int64x2& b)
-{ return v_int64x2(vreinterpretq_s64_u64(vceqq_s64(a.val, b.val))); }
-static inline v_int64x2 operator != (const v_int64x2& a, const v_int64x2& b)
-{ return v_int64x2(vreinterpretq_s64_u64(vmvnq_u64(vceqq_s64(a.val, b.val)))); }
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
 #else
 static inline v_uint64x2 operator == (const v_uint64x2& a, const v_uint64x2& b)
 {
-    uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val));
-    uint32x4_t swapped = vrev64q_u32(cmp);
-    return v_uint64x2(vreinterpretq_u64_u32(vandq_u32(cmp, swapped)));
+    uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val),
+                               vreinterpretq_u32_u64(b.val));
+    uint32x4_t v_eq = vandq_u32(cmp, vrev64q_u32(cmp));
+    return v_uint64x2(vreinterpretq_u64_u32(v_eq));
 }
 static inline v_uint64x2 operator != (const v_uint64x2& a, const v_uint64x2& b)
 {
-    uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val));
-    uint32x4_t swapped = vrev64q_u32(cmp);
-    uint64x2_t v_eq = vreinterpretq_u64_u32(vandq_u32(cmp, swapped));
-    uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
-    return v_uint64x2(veorq_u64(v_eq, vx));
+    uint64x2_t v_mask = vorrq_u64(vsubq_u64(a.val, b.val), vsubq_u64(b.val, a.val));
+    int64x2_t v_smask = vshrq_n_s64(vreinterpretq_s64_u64(v_mask), 63);
+    return v_uint64x2(vreinterpretq_u64_s64(v_smask));
 }
 static inline v_int64x2 operator == (const v_int64x2& a, const v_int64x2& b)
 {
-    return v_reinterpret_as_s64(v_reinterpret_as_u64(a) == v_reinterpret_as_u64(b));
+    uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_s64(a.val),
+                               vreinterpretq_u32_s64(b.val));
+    uint32x4_t v_eq = vandq_u32(cmp, vrev64q_u32(cmp));
+    return v_int64x2(vreinterpretq_s64_u32(v_eq));
 }
 static inline v_int64x2 operator != (const v_int64x2& a, const v_int64x2& b)
 {
-    return v_reinterpret_as_s64(v_reinterpret_as_u64(a) != v_reinterpret_as_u64(b));
+    int64x2_t v_mask = vorrq_s64(vsubq_s64(a.val, b.val), vsubq_s64(b.val, a.val));
+    int64x2_t v_smask = vshrq_n_s64(v_mask, 63);
+    return v_int64x2(v_smask);
+}
+static inline v_uint64x2 operator > (const v_uint64x2& a, const v_uint64x2& b)
+{
+    int64x2_t v_mask = vreinterpretq_s64_u64(vsubq_u64(b.val, a.val));
+    return v_uint64x2(vreinterpretq_u64_s64(vshrq_n_s64(v_mask, 63)));
+}
+static inline v_uint64x2 operator < (const v_uint64x2& a, const v_uint64x2& b)
+{
+    int64x2_t v_mask = vreinterpretq_s64_u64(vsubq_u64(a.val, b.val));
+    return v_uint64x2(vreinterpretq_u64_s64(vshrq_n_s64(v_mask, 63)));
+}
+static inline v_int64x2 operator > (const v_int64x2& a, const v_int64x2& b)
+{
+    int64x2_t v_mask = vsubq_s64(b.val, a.val);
+    return v_int64x2(vshrq_n_s64(v_mask, 63));
+}
+static inline v_int64x2 operator < (const v_int64x2& a, const v_int64x2& b)
+{
+    int64x2_t v_mask = vsubq_s64(a.val, b.val);
+    return v_int64x2(vshrq_n_s64(v_mask, 63));
 }
 #endif
 #if CV_SIMD128_64F
@@ -1622,7 +1639,7 @@ inline int v_signmask(const v_uint64x2& a)
     const int64x2_t signPosition = {0,1};
     uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), signPosition);
     uint64_t t0 = vaddvq_u64(v0);
-    return t0;
+    return (int)t0;
 #else // #if CV_NEON_AARCH64
     int64x1_t m0 = vdup_n_s64(0);
     uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index 9d17f71666..4c2e82db1c 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -1275,6 +1275,14 @@ inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
 { return ~(a == b); }
 #endif
 
+inline v_int64x2 operator > (const v_int64x2& a, const v_int64x2& b)
+{
+    __m128i s = _mm_srli_epi64(_mm_sub_epi64(b.val, a.val), 63);
+    return v_int64x2(_mm_sub_epi64(_mm_setzero_si128(), s));
+}
+inline v_int64x2 operator < (const v_int64x2& a, const v_int64x2& b)
+{ return b > a; }
+
 OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2)
 OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2)
 
diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index a5f244e8c0..a89fcf5400 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -298,9 +298,9 @@ public:
         DEPTH_MASK_32F = 1 << CV_32F,
         DEPTH_MASK_64F = 1 << CV_64F,
         DEPTH_MASK_16F = 1 << CV_16F,
-        DEPTH_MASK_ALL = (DEPTH_MASK_64F<<1)-1,
+        DEPTH_MASK_ALL = (1 << CV_DEPTH_CURR_MAX)-1,
         DEPTH_MASK_ALL_BUT_8S = DEPTH_MASK_ALL & ~DEPTH_MASK_8S,
-        DEPTH_MASK_ALL_16F = (DEPTH_MASK_16F<<1)-1,
+        DEPTH_MASK_ALL_16F = DEPTH_MASK_ALL,
         DEPTH_MASK_FLT = DEPTH_MASK_32F + DEPTH_MASK_64F
     };
 
diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp
index c9fc1d67a6..2ab9584066 100644
--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@@ -666,9 +666,7 @@ bool Mat::isSubmatrix() const
 inline
 size_t Mat::elemSize() const
 {
-    size_t res = dims > 0 ? step.p[dims - 1] : 0;
-    CV_DbgAssert(res != 0);
-    return res;
+    return CV_ELEM_SIZE(flags);
 }
 
 inline
diff --git a/modules/core/include/opencv2/core/matx.hpp b/modules/core/include/opencv2/core/matx.hpp
index 68cb4de5bb..76c214b757 100644
--- a/modules/core/include/opencv2/core/matx.hpp
+++ b/modules/core/include/opencv2/core/matx.hpp
@@ -442,6 +442,12 @@ typedef Vec<int, 4> Vec4i;
 typedef Vec<int, 6> Vec6i;
 typedef Vec<int, 8> Vec8i;
 
+typedef Vec<int64_t, 2> Vec2l;
+typedef Vec<int64_t, 3> Vec3l;
+typedef Vec<int64_t, 4> Vec4l;
+typedef Vec<int64_t, 6> Vec6l;
+typedef Vec<int64_t, 8> Vec8l;
+
 typedef Vec<float, 2> Vec2f;
 typedef Vec<float, 3> Vec3f;
 typedef Vec<float, 4> Vec4f;
diff --git a/modules/core/include/opencv2/core/saturate.hpp b/modules/core/include/opencv2/core/saturate.hpp
index e0cc965ab6..ff2d893bfc 100644
--- a/modules/core/include/opencv2/core/saturate.hpp
+++ b/modules/core/include/opencv2/core/saturate.hpp
@@ -146,9 +146,8 @@ template<> inline unsigned saturate_cast<unsigned>(short v)  { return (unsigned)
 template<> inline unsigned saturate_cast<unsigned>(int v)    { return (unsigned)std::max(v, (int)0); }
 template<> inline unsigned saturate_cast<unsigned>(int64 v)  { return (unsigned)((uint64)v <= (uint64)UINT_MAX ? v : v > 0 ? UINT_MAX : 0); }
 template<> inline unsigned saturate_cast<unsigned>(uint64 v) { return (unsigned)std::min(v, (uint64)UINT_MAX); }
-// we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
-template<> inline unsigned saturate_cast<unsigned>(float v)  { return static_cast<unsigned>(cvRound(v)); }
-template<> inline unsigned saturate_cast<unsigned>(double v) { return static_cast<unsigned>(cvRound(v)); }
+template<> inline unsigned saturate_cast<unsigned>(float v)  { return (unsigned)round(std::max(v, 0.f)); }
+template<> inline unsigned saturate_cast<unsigned>(double v) { return (unsigned)round(std::max(v, 0.)); }
 
 template<> inline uint64 saturate_cast<uint64>(schar v)      { return (uint64)std::max(v, (schar)0); }
 template<> inline uint64 saturate_cast<uint64>(short v)      { return (uint64)std::max(v, (short)0); }
@@ -156,9 +155,16 @@ template<> inline uint64 saturate_cast<uint64>(int v)        { return (uint64)st
 template<> inline uint64 saturate_cast<uint64>(int64 v)      { return (uint64)std::max(v, (int64)0); }
 
 template<> inline int64 saturate_cast<int64>(uint64 v)       { return (int64)std::min(v, (uint64)LLONG_MAX); }
+template<> inline int64 saturate_cast<int64>(float v)        { return (int64)round((double)v); }
+template<> inline int64 saturate_cast<int64>(double v)       { return (int64)round(v); }
+template<> inline uint64 saturate_cast<uint64>(float v)      { return (int64)round((double)std::max(v, 0.f)); }
+template<> inline uint64 saturate_cast<uint64>(double v)     { return (int64)round(std::max(v, 0.)); }
+
 
 /** @overload */
 template<typename _Tp> static inline _Tp saturate_cast(float16_t v) { return saturate_cast<_Tp>((float)v); }
+template<typename _Tp> static inline _Tp saturate_cast(bfloat16_t v) { return saturate_cast<_Tp>((float)v); }
+template<typename _Tp> static inline _Tp saturate_cast(bool v) { return saturate_cast<_Tp>(v ? 1 : 0); }
 
 // in theory, we could use a LUT for 8u/8s->16f conversion,
 // but with hardware support for FP32->FP16 conversion the current approach is preferable
@@ -172,6 +178,32 @@ template<> inline float16_t saturate_cast<float16_t>(uint64 v)  { return float16
 template<> inline float16_t saturate_cast<float16_t>(int64 v)   { return float16_t((float)v); }
 template<> inline float16_t saturate_cast<float16_t>(float v)   { return float16_t(v); }
 template<> inline float16_t saturate_cast<float16_t>(double v)  { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(bfloat16_t v)  { return float16_t((float)v); }
+
+template<> inline bfloat16_t saturate_cast<bfloat16_t>(uchar v)   { return bfloat16_t((float)v); }
+template<> inline bfloat16_t saturate_cast<bfloat16_t>(schar v)   { return bfloat16_t((float)v); }
+template<> inline bfloat16_t saturate_cast<bfloat16_t>(ushort v)  { return bfloat16_t((float)v); }
+template<> inline bfloat16_t saturate_cast<bfloat16_t>(short v)   { return bfloat16_t((float)v); }
+template<> inline bfloat16_t saturate_cast<bfloat16_t>(unsigned v){ return bfloat16_t((float)v); }
+template<> inline bfloat16_t saturate_cast<bfloat16_t>(int v)     { return bfloat16_t((float)v); }
+template<> inline bfloat16_t saturate_cast<bfloat16_t>(uint64 v)  { return bfloat16_t((float)v); }
+template<> inline bfloat16_t saturate_cast<bfloat16_t>(int64 v)   { return bfloat16_t((float)v); }
+template<> inline bfloat16_t saturate_cast<bfloat16_t>(float v)   { return bfloat16_t(v); }
+template<> inline bfloat16_t saturate_cast<bfloat16_t>(double v)  { return bfloat16_t((float)v); }
+template<> inline bfloat16_t saturate_cast<bfloat16_t>(float16_t v)  { return bfloat16_t((float)v); }
+
+template<> inline bool saturate_cast<bool>(uchar v) { return v != 0; }
+template<> inline bool saturate_cast<bool>(schar v) { return v != 0; }
+template<> inline bool saturate_cast<bool>(ushort v) { return v != 0; }
+template<> inline bool saturate_cast<bool>(short v) { return v != 0; }
+template<> inline bool saturate_cast<bool>(unsigned v){ return v != 0; }
+template<> inline bool saturate_cast<bool>(int v){ return v != 0; }
+template<> inline bool saturate_cast<bool>(float v){ return v != 0; }
+template<> inline bool saturate_cast<bool>(double v){ return v != 0; }
+template<> inline bool saturate_cast<bool>(uint64_t v){ return v != 0; }
+template<> inline bool saturate_cast<bool>(int64_t v){ return v != 0; }
+template<> inline bool saturate_cast<bool>(float16_t v){ return (float)v != 0; }
+template<> inline bool saturate_cast<bool>(bfloat16_t v){ return (float)v != 0; }
 
 //! @}
 
diff --git a/modules/core/include/opencv2/core/traits.hpp b/modules/core/include/opencv2/core/traits.hpp
index 52ab083ca4..18ceb9098d 100644
--- a/modules/core/include/opencv2/core/traits.hpp
+++ b/modules/core/include/opencv2/core/traits.hpp
@@ -134,9 +134,9 @@ public:
     typedef value_type  channel_type;
     typedef value_type  vec_type;
     enum { generic_type = 0,
-           depth        = CV_8U,
+           depth        = CV_Bool,
            channels     = 1,
-           fmt          = (int)'u',
+           fmt          = (int)'b',
            type         = CV_MAKETYPE(depth, channels)
          };
 };
@@ -231,6 +231,51 @@ public:
          };
 };
 
+template<> class DataType<unsigned>
+{
+public:
+    typedef unsigned    value_type;
+    typedef value_type  work_type;
+    typedef value_type  channel_type;
+    typedef value_type  vec_type;
+    enum { generic_type = 0,
+           depth        = CV_32U,
+           channels     = 1,
+           fmt          = (int)'n',
+           type         = CV_MAKETYPE(depth, channels)
+         };
+};
+
+template<> class DataType<int64_t>
+{
+public:
+    typedef unsigned    value_type;
+    typedef value_type  work_type;
+    typedef value_type  channel_type;
+    typedef value_type  vec_type;
+    enum { generic_type = 0,
+           depth        = CV_64S,
+           channels     = 1,
+           fmt          = (int)'L',
+           type         = CV_MAKETYPE(depth, channels)
+         };
+};
+
+template<> class DataType<uint64_t>
+{
+public:
+    typedef unsigned    value_type;
+    typedef value_type  work_type;
+    typedef value_type  channel_type;
+    typedef value_type  vec_type;
+    enum { generic_type = 0,
+           depth        = CV_64U,
+           channels     = 1,
+           fmt          = (int)'U',
+           type         = CV_MAKETYPE(depth, channels)
+         };
+};
+
 template<> class DataType<float>
 {
 public:
@@ -276,6 +321,21 @@ public:
          };
 };
 
+template<> class DataType<bfloat16_t>
+{
+public:
+    typedef bfloat16_t  value_type;
+    typedef float       work_type;
+    typedef value_type  channel_type;
+    typedef value_type  vec_type;
+    enum { generic_type = 0,
+           depth        = CV_16BF,
+           channels     = 1,
+           fmt          = (int)'H',
+           type         = CV_MAKETYPE(depth, channels)
+         };
+};
+
 /** @brief A helper class for cv::DataType
 
 The class is specialized for each fundamental numerical data type supported by OpenCV. It provides
@@ -332,6 +392,12 @@ template<> class TypeDepth<CV_32S>
     typedef int value_type;
 };
 
+template<> class TypeDepth<CV_32U>
+{
+    enum { depth = CV_32U };
+    typedef unsigned value_type;
+};
+
 template<> class TypeDepth<CV_32F>
 {
     enum { depth = CV_32F };
@@ -344,12 +410,36 @@ template<> class TypeDepth<CV_64F>
     typedef double value_type;
 };
 
+template<> class TypeDepth<CV_64U>
+{
+    enum { depth = CV_64U };
+    typedef uint64_t value_type;
+};
+
+template<> class TypeDepth<CV_64S>
+{
+    enum { depth = CV_64S };
+    typedef int64_t value_type;
+};
+
 template<> class TypeDepth<CV_16F>
 {
     enum { depth = CV_16F };
     typedef float16_t value_type;
 };
 
+template<> class TypeDepth<CV_16BF>
+{
+    enum { depth = CV_16BF };
+    typedef bfloat16_t value_type;
+};
+
+template<> class TypeDepth<CV_Bool>
+{
+    enum { depth = CV_Bool };
+    typedef bool value_type;
+};
+
 #endif
 
 //! @}
diff --git a/modules/core/misc/java/src/java/core+CvType.java b/modules/core/misc/java/src/java/core+CvType.java
index fcf616fe02..f7abab170e 100644
--- a/modules/core/misc/java/src/java/core+CvType.java
+++ b/modules/core/misc/java/src/java/core+CvType.java
@@ -30,7 +30,7 @@ public final class CvType {
             CV_64FC1 = CV_64FC(1), CV_64FC2 = CV_64FC(2), CV_64FC3 = CV_64FC(3), CV_64FC4 = CV_64FC(4),
             CV_16FC1 = CV_16FC(1), CV_16FC2 = CV_16FC(2), CV_16FC3 = CV_16FC(3), CV_16FC4 = CV_16FC(4);
 
-    private static final int CV_CN_MAX = 512, CV_CN_SHIFT = 3, CV_DEPTH_MAX = (1 << CV_CN_SHIFT);
+    private static final int CV_CN_MAX = 128, CV_CN_SHIFT = 5, CV_DEPTH_MAX = (1 << CV_CN_SHIFT);
 
     public static final int makeType(int depth, int channels) {
         if (channels <= 0 || channels >= CV_CN_MAX) {
diff --git a/modules/core/misc/java/test/CvTypeTest.java b/modules/core/misc/java/test/CvTypeTest.java
index 45ab4d4143..9f13324f19 100644
--- a/modules/core/misc/java/test/CvTypeTest.java
+++ b/modules/core/misc/java/test/CvTypeTest.java
@@ -65,7 +65,7 @@ public class CvTypeTest extends OpenCVTestCase {
     public void testTypeToString() {
         assertEquals("CV_32FC1", CvType.typeToString(CvType.CV_32F));
         assertEquals("CV_32FC3", CvType.typeToString(CvType.CV_32FC3));
-        assertEquals("CV_32FC(128)", CvType.typeToString(CvType.CV_32FC(128)));
+        assertEquals("CV_32FC(127)", CvType.typeToString(CvType.CV_32FC(127)));
     }
 
 }
diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index 8f7fd20924..67cc051e0b 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -329,7 +329,7 @@ static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst,
 
 static BinaryFuncC* getMaxTab()
 {
-    static BinaryFuncC maxTab[] =
+    static BinaryFuncC maxTab[CV_DEPTH_MAX] =
     {
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::max8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::max8s),
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::max16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::max16s),
@@ -343,7 +343,7 @@ static BinaryFuncC* getMaxTab()
 
 static BinaryFuncC* getMinTab()
 {
-    static BinaryFuncC minTab[] =
+    static BinaryFuncC minTab[CV_DEPTH_MAX] =
     {
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::min8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::min8s),
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::min16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::min16s),
@@ -617,7 +617,10 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
 
         Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
         Size sz = getContinuousSize2D(src1, src2, dst, src1.channels());
-        tab[depth1](src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, usrdata);
+        BinaryFuncC func = tab[depth1];
+        CV_Assert(func != 0);
+        func(src1.ptr(), src1.step, src2.ptr(), src2.step,
+             dst.ptr(), dst.step, sz.width,   sz.height, usrdata);
         return;
     }
 
@@ -868,7 +871,7 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
 
 static BinaryFuncC* getAddTab()
 {
-    static BinaryFuncC addTab[] =
+    static BinaryFuncC addTab[CV_DEPTH_MAX] =
     {
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::add8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::add8s),
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::add16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::add16s),
@@ -882,7 +885,7 @@ static BinaryFuncC* getAddTab()
 
 static BinaryFuncC* getSubTab()
 {
-    static BinaryFuncC subTab[] =
+    static BinaryFuncC subTab[CV_DEPTH_MAX] =
     {
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub8s),
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub16s),
@@ -896,7 +899,7 @@ static BinaryFuncC* getSubTab()
 
 static BinaryFuncC* getAbsDiffTab()
 {
-    static BinaryFuncC absDiffTab[] =
+    static BinaryFuncC absDiffTab[CV_DEPTH_MAX] =
     {
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff8s),
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff16s),
@@ -949,7 +952,7 @@ namespace cv
 
 static BinaryFuncC* getMulTab()
 {
-    static BinaryFuncC mulTab[] =
+    static BinaryFuncC mulTab[CV_DEPTH_MAX] =
     {
         (BinaryFuncC)cv::hal::mul8u, (BinaryFuncC)cv::hal::mul8s, (BinaryFuncC)cv::hal::mul16u,
         (BinaryFuncC)cv::hal::mul16s, (BinaryFuncC)cv::hal::mul32s, (BinaryFuncC)cv::hal::mul32f,
@@ -961,7 +964,7 @@ static BinaryFuncC* getMulTab()
 
 static BinaryFuncC* getDivTab()
 {
-    static BinaryFuncC divTab[] =
+    static BinaryFuncC divTab[CV_DEPTH_MAX] =
     {
         (BinaryFuncC)cv::hal::div8u, (BinaryFuncC)cv::hal::div8s, (BinaryFuncC)cv::hal::div16u,
         (BinaryFuncC)cv::hal::div16s, (BinaryFuncC)cv::hal::div32s, (BinaryFuncC)cv::hal::div32f,
@@ -973,7 +976,7 @@ static BinaryFuncC* getDivTab()
 
 static BinaryFuncC* getRecipTab()
 {
-    static BinaryFuncC recipTab[] =
+    static BinaryFuncC recipTab[CV_DEPTH_MAX] =
     {
         (BinaryFuncC)cv::hal::recip8u, (BinaryFuncC)cv::hal::recip8s, (BinaryFuncC)cv::hal::recip16u,
         (BinaryFuncC)cv::hal::recip16s, (BinaryFuncC)cv::hal::recip32s, (BinaryFuncC)cv::hal::recip32f,
@@ -1021,7 +1024,7 @@ UMat UMat::mul(InputArray m, double scale) const
 
 static BinaryFuncC* getAddWeightedTab()
 {
-    static BinaryFuncC addWeightedTab[] =
+    static BinaryFuncC addWeightedTab[CV_DEPTH_MAX] =
     {
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted8s), (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted16u),
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted16s), (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted32s), (BinaryFuncC)cv::hal::addWeighted32f,
@@ -1052,7 +1055,7 @@ namespace cv
 
 static BinaryFuncC getCmpFunc(int depth)
 {
-    static BinaryFuncC cmpTab[] =
+    static BinaryFuncC cmpTab[CV_DEPTH_MAX] =
     {
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp8s),
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp16s),
@@ -1588,7 +1591,7 @@ typedef void (*InRangeFunc)( const uchar* src1, size_t step1, const uchar* src2,
 
 static InRangeFunc getInRangeFunc(int depth)
 {
-    static InRangeFunc inRangeTab[] =
+    static InRangeFunc inRangeTab[CV_DEPTH_MAX] =
     {
         (InRangeFunc)GET_OPTIMIZED(inRange8u), (InRangeFunc)GET_OPTIMIZED(inRange8s), (InRangeFunc)GET_OPTIMIZED(inRange16u),
         (InRangeFunc)GET_OPTIMIZED(inRange16s), (InRangeFunc)GET_OPTIMIZED(inRange32s), (InRangeFunc)GET_OPTIMIZED(inRange32f),
diff --git a/modules/core/src/arithm.simd.hpp b/modules/core/src/arithm.simd.hpp
index 06ebfb7678..20e70e5392 100644
--- a/modules/core/src/arithm.simd.hpp
+++ b/modules/core/src/arithm.simd.hpp
@@ -104,10 +104,6 @@ namespace cv { namespace hal {
 
 #ifdef ARITHM_DEFINITIONS_ONLY
 
-#if !CV_SIMD_64F
-typedef int v_float64; // dummy
-#endif
-
 //=======================================
 // Utility
 //=======================================
diff --git a/modules/core/src/channels.cpp b/modules/core/src/channels.cpp
index efaeb91068..7953212894 100644
--- a/modules/core/src/channels.cpp
+++ b/modules/core/src/channels.cpp
@@ -79,7 +79,7 @@ typedef void (*MixChannelsFunc)( const void** src, const int* sdelta,
 
 static MixChannelsFunc getMixchFunc(int depth)
 {
-    static MixChannelsFunc mixchTab[] =
+    static MixChannelsFunc mixchTab[CV_DEPTH_MAX] =
     {
         mixChannels8u, mixChannels8u, mixChannels16u,
         mixChannels16u, mixChannels32s, mixChannels32s,
diff --git a/modules/core/src/convert.dispatch.cpp b/modules/core/src/convert.dispatch.cpp
index 345b4624cb..150b91aa35 100644
--- a/modules/core/src/convert.dispatch.cpp
+++ b/modules/core/src/convert.dispatch.cpp
@@ -23,117 +23,28 @@ void cvt32f16f(const float* src, float16_t* dst, int len)
     CV_CPU_DISPATCH(cvt32f16f, (src, dst, len),
         CV_CPU_DISPATCH_MODES_ALL);
 }
-void addRNGBias32f(float* arr, const float* scaleBiasPairs, int len)
+void cvt32f16bf(const float* src, bfloat16_t* dst, int len)
 {
     CV_INSTRUMENT_REGION();
-    CV_CPU_DISPATCH(addRNGBias32f, (arr, scaleBiasPairs, len),
+    CV_CPU_DISPATCH(cvt32f16bf, (src, dst, len),
         CV_CPU_DISPATCH_MODES_ALL);
 }
-void addRNGBias64f(double* arr, const double* scaleBiasPairs, int len)
+void addRNGBias32f(float* arr, const float* scaleBiasPairs, int len, int cn)
 {
     CV_INSTRUMENT_REGION();
-    CV_CPU_DISPATCH(addRNGBias64f, (arr, scaleBiasPairs, len),
+    CV_CPU_DISPATCH(addRNGBias32f, (arr, scaleBiasPairs, len, cn),
+        CV_CPU_DISPATCH_MODES_ALL);
+}
+void addRNGBias64f(double* arr, const double* scaleBiasPairs, int len, int cn)
+{
+    CV_INSTRUMENT_REGION();
+    CV_CPU_DISPATCH(addRNGBias64f, (arr, scaleBiasPairs, len, cn),
         CV_CPU_DISPATCH_MODES_ALL);
 }
 
 } // namespace
 
 
-/* [TODO] Recover IPP calls
-#if defined(HAVE_IPP)
-#define DEF_CVT_FUNC_F(suffix, stype, dtype, ippFavor) \
-static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
-                         dtype* dst, size_t dstep, Size size, double*) \
-{ \
-    CV_IPP_RUN(src && dst, CV_INSTRUMENT_FUN_IPP(ippiConvert_##ippFavor, src, (int)sstep, dst, (int)dstep, ippiSize(size.width, size.height)) >= 0) \
-    cvt_(src, sstep, dst, dstep, size); \
-}
-
-#define DEF_CVT_FUNC_F2(suffix, stype, dtype, ippFavor) \
-static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
-                         dtype* dst, size_t dstep, Size size, double*) \
-{ \
-    CV_IPP_RUN(src && dst, CV_INSTRUMENT_FUN_IPP(ippiConvert_##ippFavor, src, (int)sstep, dst, (int)dstep, ippiSize(size.width, size.height), ippRndFinancial, 0) >= 0) \
-    cvt_(src, sstep, dst, dstep, size); \
-}
-#else
-#define DEF_CVT_FUNC_F(suffix, stype, dtype, ippFavor) \
-static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
-                         dtype* dst, size_t dstep, Size size, double*) \
-{ \
-    cvt_(src, sstep, dst, dstep, size); \
-}
-#define DEF_CVT_FUNC_F2 DEF_CVT_FUNC_F
-#endif
-
-#define DEF_CVT_FUNC(suffix, stype, dtype) \
-static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
-                         dtype* dst, size_t dstep, Size size, double*) \
-{ \
-    cvt_(src, sstep, dst, dstep, size); \
-}
-
-#define DEF_CPY_FUNC(suffix, stype) \
-static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
-                         stype* dst, size_t dstep, Size size, double*) \
-{ \
-    cpy_(src, sstep, dst, dstep, size); \
-}
-
-DEF_CPY_FUNC(8u,     uchar)
-DEF_CVT_FUNC_F(8s8u,   schar, uchar, 8s8u_C1Rs)
-DEF_CVT_FUNC_F(16u8u,  ushort, uchar, 16u8u_C1R)
-DEF_CVT_FUNC_F(16s8u,  short, uchar, 16s8u_C1R)
-DEF_CVT_FUNC_F(32s8u,  int, uchar, 32s8u_C1R)
-DEF_CVT_FUNC_F2(32f8u,  float, uchar, 32f8u_C1RSfs)
-DEF_CVT_FUNC(64f8u,  double, uchar)
-
-DEF_CVT_FUNC_F2(8u8s,   uchar, schar, 8u8s_C1RSfs)
-DEF_CVT_FUNC_F2(16u8s,  ushort, schar, 16u8s_C1RSfs)
-DEF_CVT_FUNC_F2(16s8s,  short, schar, 16s8s_C1RSfs)
-DEF_CVT_FUNC_F(32s8s,  int, schar, 32s8s_C1R)
-DEF_CVT_FUNC_F2(32f8s,  float, schar, 32f8s_C1RSfs)
-DEF_CVT_FUNC(64f8s,  double, schar)
-
-DEF_CVT_FUNC_F(8u16u,  uchar, ushort, 8u16u_C1R)
-DEF_CVT_FUNC_F(8s16u,  schar, ushort, 8s16u_C1Rs)
-DEF_CPY_FUNC(16u,    ushort)
-DEF_CVT_FUNC_F(16s16u, short, ushort, 16s16u_C1Rs)
-DEF_CVT_FUNC_F2(32s16u, int, ushort, 32s16u_C1RSfs)
-DEF_CVT_FUNC_F2(32f16u, float, ushort, 32f16u_C1RSfs)
-DEF_CVT_FUNC(64f16u, double, ushort)
-
-DEF_CVT_FUNC_F(8u16s,  uchar, short, 8u16s_C1R)
-DEF_CVT_FUNC_F(8s16s,  schar, short, 8s16s_C1R)
-DEF_CVT_FUNC_F2(16u16s, ushort, short, 16u16s_C1RSfs)
-DEF_CVT_FUNC_F2(32s16s, int, short, 32s16s_C1RSfs)
-DEF_CVT_FUNC(32f16s, float, short)
-DEF_CVT_FUNC(64f16s, double, short)
-
-DEF_CVT_FUNC_F(8u32s,  uchar, int, 8u32s_C1R)
-DEF_CVT_FUNC_F(8s32s,  schar, int, 8s32s_C1R)
-DEF_CVT_FUNC_F(16u32s, ushort, int, 16u32s_C1R)
-DEF_CVT_FUNC_F(16s32s, short, int, 16s32s_C1R)
-DEF_CPY_FUNC(32s,    int)
-DEF_CVT_FUNC_F2(32f32s, float, int, 32f32s_C1RSfs)
-DEF_CVT_FUNC(64f32s, double, int)
-
-DEF_CVT_FUNC_F(8u32f,  uchar, float, 8u32f_C1R)
-DEF_CVT_FUNC_F(8s32f,  schar, float, 8s32f_C1R)
-DEF_CVT_FUNC_F(16u32f, ushort, float, 16u32f_C1R)
-DEF_CVT_FUNC_F(16s32f, short, float, 16s32f_C1R)
-DEF_CVT_FUNC_F(32s32f, int, float, 32s32f_C1R)
-DEF_CVT_FUNC(64f32f, double, float)
-
-DEF_CVT_FUNC(8u64f,  uchar, double)
-DEF_CVT_FUNC(8s64f,  schar, double)
-DEF_CVT_FUNC(16u64f, ushort, double)
-DEF_CVT_FUNC(16s64f, short, double)
-DEF_CVT_FUNC(32s64f, int, double)
-DEF_CVT_FUNC(32f64f, float, double)
-DEF_CPY_FUNC(64s,    int64)
-*/
-
 BinaryFunc getConvertFunc(int sdepth, int ddepth)
 {
     CV_INSTRUMENT_REGION();
diff --git a/modules/core/src/convert.hpp b/modules/core/src/convert.hpp
index 4b9ddbb413..3aa7dadac9 100644
--- a/modules/core/src/convert.hpp
+++ b/modules/core/src/convert.hpp
@@ -28,12 +28,26 @@ static inline void vx_load_as(const short* ptr, v_float32& a)
 static inline void vx_load_as(const int* ptr, v_float32& a)
 { a = v_cvt_f32(vx_load(ptr)); }
 
+static inline void vx_load_as(const unsigned* ptr, v_float32& a)
+{
+    v_uint32 delta = vx_setall_u32(0x80000000U);
+    v_uint32 ua = vx_load(ptr);
+    v_uint32 mask_a = (ua >= delta) & delta;
+    v_float32 fmask_a = v_cvt_f32(v_reinterpret_as_s32(mask_a)); // 0.f or (float)(-(1 << 31))
+    a = v_cvt_f32(v_reinterpret_as_s32(ua - mask_a));
+    // restore the original values
+    a -= fmask_a; // subtract 0 or a large negative number
+}
+
 static inline void vx_load_as(const float* ptr, v_float32& a)
 { a = vx_load(ptr); }
 
 static inline void vx_load_as(const float16_t* ptr, v_float32& a)
 { a = vx_load_expand(ptr); }
 
+static inline void vx_load_as(const bfloat16_t* ptr, v_float32& a)
+{ a = vx_load_expand(ptr); }
+
 static inline void v_store_as(ushort* ptr, const v_float32& a)
 { v_pack_u_store(ptr, v_round(a)); }
 
@@ -43,12 +57,40 @@ static inline void v_store_as(short* ptr, const v_float32& a)
 static inline void v_store_as(int* ptr, const v_float32& a)
 { v_store(ptr, v_round(a)); }
 
+static inline void v_store_as(unsigned* ptr, const v_float32& a)
+{
+    v_float32 z = vx_setzero_f32();
+    v_store(ptr, v_reinterpret_as_u32(v_round(v_max(a, z))));
+}
+
 static inline void v_store_as(float* ptr, const v_float32& a)
 { v_store(ptr, a); }
 
 static inline void v_store_as(float16_t* ptr, const v_float32& a)
 { v_pack_store(ptr, a); }
 
+static inline void v_store_as(bfloat16_t* ptr, const v_float32& a)
+{ v_pack_store(ptr, a); }
+
+static inline void v_store_as(int64_t* ptr, const v_float32& a)
+{
+    v_int32 ia = v_round(a);
+    v_int64 ia_0, ia_1;
+    v_expand(ia, ia_0, ia_1);
+    v_store(ptr, ia_0);
+    v_store(ptr + v_int64::nlanes, ia_1);
+}
+
+static inline void v_store_as(uint64_t* ptr, const v_float32& a)
+{
+    v_int32 ia = v_round(a);
+    v_uint64 ia_0, ia_1;
+    ia = v_max(ia, vx_setzero_s32());
+    v_expand(v_reinterpret_as_u32(ia), ia_0, ia_1);
+    v_store(ptr, ia_0);
+    v_store(ptr + v_int64::nlanes, ia_1);
+}
+
 static inline void vx_load_pair_as(const uchar* ptr, v_uint16& a, v_uint16& b)
 { v_expand(vx_load(ptr), a, b); }
 
@@ -147,6 +189,115 @@ static inline void vx_load_pair_as(const int* ptr, v_float32& a, v_float32& b)
     b = v_cvt_f32(ib);
 }
 
+static inline void vx_load_pair_as(const int64_t* ptr, v_int32& a, v_int32& b)
+{
+    const int int64_nlanes = v_int64::nlanes;
+    a = v_pack(vx_load(ptr), vx_load(ptr + int64_nlanes));
+    b = v_pack(vx_load(ptr + int64_nlanes*2), vx_load(ptr + int64_nlanes*3));
+}
+
+static inline void vx_load_pair_as(const int64_t* ptr, v_uint64& a, v_uint64& b)
+{
+    v_int64 z = vx_setzero_s64();
+    v_int64 ia = vx_load(ptr), ib = vx_load(ptr + v_int64::nlanes);
+    ia &= (ia > z);
+    ib &= (ib > z);
+    a = v_reinterpret_as_u64(ia);
+    b = v_reinterpret_as_u64(ib);
+}
+
+static inline void vx_load_pair_as(const int64_t* ptr, v_uint32& a, v_uint32& b)
+{
+    const int nlanes = v_int64::nlanes;
+    v_int64 z = vx_setzero_s64();
+    v_int64 ia0 = vx_load(ptr), ia1 = vx_load(ptr + nlanes);
+    v_int64 ib0 = vx_load(ptr + nlanes*2), ib1 = vx_load(ptr + nlanes*3);
+    ia0 &= (ia0 > z);
+    ia1 &= (ia1 > z);
+    ib0 &= (ib0 > z);
+    ib1 &= (ib1 > z);
+    a = v_pack(v_reinterpret_as_u64(ia0), v_reinterpret_as_u64(ia1));
+    b = v_pack(v_reinterpret_as_u64(ib0), v_reinterpret_as_u64(ib1));
+}
+
+static inline void vx_load_pair_as(const uint64_t* ptr, v_float32& a, v_float32& b)
+{
+    const int nlanes = v_uint64::nlanes;
+    float buf[v_uint64::nlanes*4];
+    for (int i = 0; i < nlanes*4; i++) {
+        buf[i] = (float)ptr[i];
+    }
+    a = vx_load(buf);
+    b = vx_load(buf + nlanes*2);
+}
+
+static inline void vx_load_pair_as(const int64_t* ptr, v_float32& a, v_float32& b)
+{
+    const int nlanes = v_int64::nlanes;
+    float buf[v_int64::nlanes*4];
+    for (int i = 0; i < nlanes*4; i++) {
+        buf[i] = (float)ptr[i];
+    }
+    a = vx_load(buf);
+    b = vx_load(buf + nlanes*2);
+}
+
+static inline void vx_load_pair_as(const bool* ptr, v_float32& a, v_float32& b)
+{
+    v_uint16 z = vx_setzero_u16();
+    v_uint16 uab = vx_load_expand((const uchar*)ptr);
+    uab = v_shr<15>(uab > z);
+    v_int32 ia, ib;
+    v_expand(v_reinterpret_as_s16(uab), ia, ib);
+    a = v_cvt_f32(ia);
+    b = v_cvt_f32(ib);
+}
+
+static inline void vx_load_as(const bool* ptr, v_float32& a)
+{
+    v_uint32 z = vx_setzero_u32();
+    v_uint32 ua = vx_load_expand_q((const uchar*)ptr);
+    ua = v_shr<31>(ua > z);
+    a = v_cvt_f32(v_reinterpret_as_s32(ua));
+}
+
+static inline void vx_load_pair_as(const schar* ptr, v_uint32& a, v_uint32& b)
+{
+    v_int16 ab = v_max(vx_load_expand(ptr), vx_setzero_s16());
+    v_expand(v_reinterpret_as_u16(ab), a, b);
+}
+
+static inline void vx_load_pair_as(const short* ptr, v_uint32& a, v_uint32& b)
+{
+    v_int16 ab = v_max(vx_load(ptr), vx_setzero_s16());
+    v_expand(v_reinterpret_as_u16(ab), a, b);
+}
+
+static inline void vx_load_pair_as(const int* ptr, v_uint32& a, v_uint32& b)
+{
+    v_int32 z = vx_setzero_s32();
+    v_int32 ia = v_max(vx_load(ptr), z);
+    v_int32 ib = v_max(vx_load(ptr + v_int32::nlanes), z);
+    a = v_reinterpret_as_u32(ia);
+    b = v_reinterpret_as_u32(ib);
+}
+
+static inline void vx_load_pair_as(const uint64_t* ptr, v_uint32& a, v_uint32& b)
+{
+    const int int64_nlanes = v_int64::nlanes;
+    a = v_pack(vx_load(ptr), vx_load(ptr + int64_nlanes));
+    b = v_pack(vx_load(ptr + int64_nlanes*2), vx_load(ptr + int64_nlanes*3));
+}
+
+static inline void vx_load_pair_as(const uint64_t* ptr, v_int32& a, v_int32& b)
+{
+    const int int64_nlanes = v_int64::nlanes;
+    v_uint32 ua = v_pack(vx_load(ptr), vx_load(ptr + int64_nlanes));
+    v_uint32 ub = v_pack(vx_load(ptr + int64_nlanes*2), vx_load(ptr + int64_nlanes*3));
+    a = v_reinterpret_as_s32(ua);
+    b = v_reinterpret_as_s32(ub);
+}
+
 static inline void vx_load_pair_as(const float* ptr, v_float32& a, v_float32& b)
 { a = vx_load(ptr); b = vx_load(ptr + v_float32::nlanes); }
 
@@ -156,6 +307,39 @@ static inline void vx_load_pair_as(const float16_t* ptr, v_float32& a, v_float32
     b = vx_load_expand(ptr + v_float32::nlanes);
 }
 
+static inline void vx_load_pair_as(const bfloat16_t* ptr, v_float32& a, v_float32& b)
+{
+    a = vx_load_expand(ptr);
+    b = vx_load_expand(ptr + v_float32::nlanes);
+}
+
+static inline void vx_load_pair_as(const unsigned* ptr, v_uint32& a, v_uint32& b)
+{
+    a = vx_load(ptr);
+    b = vx_load(ptr + v_uint32::nlanes);
+}
+
+static inline void vx_load_pair_as(const unsigned* ptr, v_int32& a, v_int32& b)
+{
+    a = v_reinterpret_as_s32(vx_load(ptr));
+    b = v_reinterpret_as_s32(vx_load(ptr + v_uint32::nlanes));
+}
+
+static inline void vx_load_pair_as(const unsigned* ptr, v_float32& a, v_float32& b)
+{
+    v_uint32 delta = vx_setall_u32(0x80000000U);
+    v_uint32 ua = vx_load(ptr);
+    v_uint32 ub = vx_load(ptr + v_uint32::nlanes);
+    v_uint32 mask_a = (ua >= delta) & delta, mask_b = (ub >= delta) & delta;
+    v_float32 fmask_a = v_cvt_f32(v_reinterpret_as_s32(mask_a)); // 0.f or (float)(-(1 << 31))
+    v_float32 fmask_b = v_cvt_f32(v_reinterpret_as_s32(mask_b)); // 0.f or (float)(-(1 << 31))
+    a = v_cvt_f32(v_reinterpret_as_s32(ua - mask_a));
+    b = v_cvt_f32(v_reinterpret_as_s32(ub - mask_b));
+    // restore the original values
+    a -= fmask_a; // subtract 0 or a large negative number
+    b -= fmask_b; // subtract 0 or a large negative number
+}
+
 static inline void v_store_pair_as(uchar* ptr, const v_uint16& a, const v_uint16& b)
 {
     v_store(ptr, v_pack(a, b));
@@ -198,12 +382,33 @@ static inline void v_store_pair_as(int* ptr, const v_int32& a, const v_int32& b)
     v_store(ptr + v_int32::nlanes, b);
 }
 
+static inline void v_store_pair_as(int64_t* ptr, const v_int32& a, const v_int32& b)
+{
+    v_int64 q0, q1, q2, q3;
+    v_expand(a, q0, q1);
+    v_expand(b, q2, q3);
+    const int nlanes = v_int64::nlanes;
+    v_store(ptr, q0);
+    v_store(ptr + nlanes, q1);
+    v_store(ptr + nlanes*2, q2);
+    v_store(ptr + nlanes*3, q3);
+}
+
 static inline void v_store_pair_as(uchar* ptr, const v_float32& a, const v_float32& b)
 { v_pack_u_store(ptr, v_pack(v_round(a), v_round(b))); }
 
 static inline void v_store_pair_as(schar* ptr, const v_float32& a, const v_float32& b)
 { v_pack_store(ptr, v_pack(v_round(a), v_round(b))); }
 
+static inline void v_store_pair_as(bool* ptr, const v_float32& a, const v_float32& b)
+{
+    v_float32 z = vx_setzero_f32();
+    v_uint32 ma = v_shr<31>(v_reinterpret_as_u32(a != z));
+    v_uint32 mb = v_shr<31>(v_reinterpret_as_u32(b != z));
+    v_uint16 mab = v_pack(ma, mb);
+    v_pack_store((uchar*)ptr, mab);
+}
+
 static inline void v_store_pair_as(ushort* ptr, const v_float32& a, const v_float32& b)
 { v_store(ptr, v_pack_u(v_round(a), v_round(b))); }
 
@@ -220,14 +425,95 @@ static inline void v_store_pair_as(int* ptr, const v_float32& a, const v_float32
 static inline void v_store_pair_as(float* ptr, const v_float32& a, const v_float32& b)
 { v_store(ptr, a); v_store(ptr + v_float32::nlanes, b); }
 
+static inline void v_store_pair_as(unsigned* ptr, const v_float32& a, const v_float32& b)
+{
+    v_int32 z = vx_setzero_s32();
+    v_int32 ia = v_max(v_round(a), z);
+    v_int32 ib = v_max(v_round(b), z);
+    v_store(ptr, v_reinterpret_as_u32(ia));
+    v_store(ptr + v_int32::nlanes, v_reinterpret_as_u32(ib));
+}
+
+static inline void v_store_pair_as(uchar* ptr, const v_uint32& a, const v_uint32& b)
+{
+    v_pack_store(ptr, v_pack(a, b));
+}
+
+static inline void v_store_pair_as(ushort* ptr, const v_uint32& a, const v_uint32& b)
+{
+    v_store(ptr, v_pack(a, b));
+}
+
+static inline void v_store_pair_as(unsigned* ptr, const v_uint32& a, const v_uint32& b)
+{
+    v_store(ptr, a);
+    v_store(ptr + v_uint32::nlanes, b);
+}
+
+static inline void v_store_pair_as(uint64_t* ptr, const v_uint32& a, const v_uint32& b)
+{
+    v_uint64 q0, q1, q2, q3;
+    v_expand(a, q0, q1);
+    v_expand(b, q2, q3);
+    const int nlanes = v_uint64::nlanes;
+    v_store(ptr, q0);
+    v_store(ptr + nlanes, q1);
+    v_store(ptr + nlanes*2, q2);
+    v_store(ptr + nlanes*3, q3);
+}
+
+static inline void v_store_pair_as(uint64_t* ptr, const v_uint64& a, const v_uint64& b)
+{
+    v_store(ptr, a);
+    v_store(ptr + v_uint64::nlanes, b);
+}
+
 #if CV_SIMD_64F
 
+static inline void vx_load_as(const uint64_t* ptr, v_float32& a)
+{
+    v_float64 a_0 = v_cvt_f64(v_reinterpret_as_s64(vx_load(ptr)));
+    v_float64 a_1 = v_cvt_f64(v_reinterpret_as_s64(vx_load(ptr + v_uint64::nlanes)));
+    a = v_cvt_f32(a_0, a_1);
+}
+
+static inline void vx_load_as(const int64_t* ptr, v_float32& a)
+{
+    v_float64 a_0 = v_cvt_f64(vx_load(ptr));
+    v_float64 a_1 = v_cvt_f64(vx_load(ptr + v_uint64::nlanes));
+    a = v_cvt_f32(a_0, a_1);
+}
+
 static inline void vx_load_as(const double* ptr, v_float32& a)
 {
     v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes);
     a = v_cvt_f32(v0, v1);
 }
 
+static inline void vx_load_pair_as(const bool* ptr, v_float64& a, v_float64& b)
+{
+    v_uint32 z = vx_setzero_u32();
+    v_uint32 uab = vx_load_expand_q((const uchar*)ptr);
+    uab = v_shr<31>(uab > z);
+    v_float32 fab = v_cvt_f32(v_reinterpret_as_s32(uab));
+    a = v_cvt_f64(fab);
+    b = v_cvt_f64_high(fab);
+}
+
+static inline void vx_load_pair_as(const float16_t* ptr, v_float64& a, v_float64& b)
+{
+    v_float32 fab = vx_load_expand(ptr);
+    a = v_cvt_f64(fab);
+    b = v_cvt_f64_high(fab);
+}
+
+static inline void vx_load_pair_as(const bfloat16_t* ptr, v_float64& a, v_float64& b)
+{
+    v_float32 fab = vx_load_expand(ptr);
+    a = v_cvt_f64(fab);
+    b = v_cvt_f64_high(fab);
+}
+
 static inline void vx_load_pair_as(const double* ptr, v_int32& a, v_int32& b)
 {
     v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes);
@@ -238,6 +524,13 @@ static inline void vx_load_pair_as(const double* ptr, v_int32& a, v_int32& b)
     b = v_combine_low(iv2, iv3);
 }
 
+static inline void vx_load_pair_as(const uint64_t* ptr, v_float64& a, v_float64& b)
+{
+    const int int64_nlanes = v_int64::nlanes;
+    a = v_cvt_f64(v_reinterpret_as_s64(vx_load(ptr)));
+    b = v_cvt_f64(v_reinterpret_as_s64(vx_load(ptr + int64_nlanes)));
+}
+
 static inline void vx_load_pair_as(const double* ptr, v_float32& a, v_float32& b)
 {
     v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes);
@@ -294,11 +587,20 @@ static inline void vx_load_pair_as(const double* ptr, v_float64& a, v_float64& b
     b = vx_load(ptr + v_float64::nlanes);
 }
 
-static inline void vx_load_pair_as(const float16_t* ptr, v_float64& a, v_float64& b)
+static inline void vx_load_pair_as(const int64_t* ptr, v_float64& a, v_float64& b)
 {
-    v_float32 v0 = vx_load_expand(ptr);
-    a = v_cvt_f64(v0);
-    b = v_cvt_f64_high(v0);
+    a = v_cvt_f64(vx_load(ptr));
+    b = v_cvt_f64(vx_load(ptr + v_float64::nlanes));
+}
+
+static inline void vx_load_pair_as(const unsigned* ptr, v_float64& a, v_float64& b)
+{
+    const int nlanes = v_uint64::nlanes;
+    double buf[v_uint64::nlanes*2];
+    for (int i = 0; i < nlanes*2; i++)
+        buf[i] = (double)ptr[i];
+    a = vx_load(buf);
+    b = vx_load(buf + nlanes);
 }
 
 static inline void v_store_as(double* ptr, const v_float32& a)
@@ -354,6 +656,29 @@ static inline void v_store_pair_as(float16_t* ptr, const v_float64& a, const v_f
     v_pack_store(ptr, v);
 }
 
+static inline void v_store_pair_as(uint64_t* ptr, const v_float64& a, const v_float64& b)
+{
+    v_float64 z = vx_setzero_f64();
+    v_int64 ia, ib;
+    v_expand(v_round(v_max(a, z), v_max(b, z)), ia, ib);
+    v_store(ptr, v_reinterpret_as_u64(ia));
+    v_store(ptr + v_int64::nlanes, v_reinterpret_as_u64(ib));
+}
+
+static inline void v_store_pair_as(int64_t* ptr, const v_float64& a, const v_float64& b)
+{
+    v_int64 ia, ib;
+    v_expand(v_round(a, b), ia, ib);
+    v_store(ptr, ia);
+    v_store(ptr + v_int64::nlanes, ib);
+}
+
+static inline void v_store_pair_as(unsigned* ptr, const v_float64& a, const v_float64& b)
+{
+    v_int32 iab = v_max(v_round(a, b), vx_setzero_s32());
+    v_store(ptr, v_reinterpret_as_u32(iab));
+}
+
 #else
 
 static inline void vx_load_as(const double* ptr, v_float32& a)
@@ -366,6 +691,26 @@ static inline void vx_load_as(const double* ptr, v_float32& a)
     a = vx_load(buf);
 }
 
+static inline void vx_load_as(const uint64_t* ptr, v_float32& a)
+{
+    const int VECSZ = v_float32::nlanes;
+    float buf[VECSZ*2];
+
+    for( int i = 0; i < VECSZ; i++ )
+        buf[i] = saturate_cast<float>(ptr[i]);
+    a = vx_load(buf);
+}
+
+static inline void vx_load_as(const int64_t* ptr, v_float32& a)
+{
+    const int VECSZ = v_float32::nlanes;
+    float buf[VECSZ*2];
+
+    for( int i = 0; i < VECSZ; i++ )
+        buf[i] = saturate_cast<float>(ptr[i]);
+    a = vx_load(buf);
+}
+
 template<typename _Tdvec>
 static inline void vx_load_pair_as(const double* ptr, _Tdvec& a, _Tdvec& b)
 {
diff --git a/modules/core/src/convert.simd.hpp b/modules/core/src/convert.simd.hpp
index 5154041b6d..c776918846 100644
--- a/modules/core/src/convert.simd.hpp
+++ b/modules/core/src/convert.simd.hpp
@@ -16,8 +16,10 @@ CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
 
 void cvt16f32f(const float16_t* src, float* dst, int len);
 void cvt32f16f(const float* src, float16_t* dst, int len);
-void addRNGBias32f(float* arr, const float* scaleBiasPairs, int len);
-void addRNGBias64f(double* arr, const double* scaleBiasPairs, int len);
+void cvt16bf32f(const bfloat16_t* src, float* dst, int len);
+void cvt32f16bf(const float* src, bfloat16_t* dst, int len);
+void addRNGBias32f(float* arr, const float* scaleBiasPairs, int len, int cn);
+void addRNGBias64f(double* arr, const double* scaleBiasPairs, int len, int cn);
 
 CV_CPU_OPTIMIZATION_NAMESPACE_END
 } // namespace cv::hal
@@ -77,20 +79,63 @@ void cvt32f16f( const float* src, float16_t* dst, int len )
         dst[j] = float16_t(src[j]);
 }
 
-void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len )
+void cvt32f16bf( const float* src, bfloat16_t* dst, int len )
 {
     CV_INSTRUMENT_REGION();
-    // the loop is simple enough, so we let the compiler to vectorize it
-    for( int i = 0; i < len; i++ )
-        arr[i] += scaleBiasPairs[i*2 + 1];
+    int j = 0;
+#if CV_SIMD
+    const int VECSZ = v_float32::nlanes;
+    for( ; j < len; j += VECSZ )
+    {
+        if( j > len - VECSZ )
+        {
+            if( j == 0 )
+                break;
+            j = len - VECSZ;
+        }
+        v_pack_store(dst + j, vx_load(src + j));
+    }
+#endif
+    for( ; j < len; j++ )
+        dst[j] = bfloat16_t(src[j]);
 }
 
-void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len )
+void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len, int cn )
 {
     CV_INSTRUMENT_REGION();
-    // the loop is simple enough, so we let the compiler to vectorize it
-    for( int i = 0; i < len; i++ )
-        arr[i] += scaleBiasPairs[i*2 + 1];
+    if (cn == 1) {
+        float bias = scaleBiasPairs[1];
+        for( int i = 0; i < len; i++ ) {
+            arr[i] += bias;
+        }
+    } else {
+        int k = 0;
+        len *= cn;
+        cn--;
+        for( int i = 0; i < len; i++ ) {
+            arr[i] += scaleBiasPairs[k*2 + 1];
+            k = (k + 1) & ((k >= cn) - 1);
+        }
+    }
+}
+
+void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len, int cn )
+{
+    CV_INSTRUMENT_REGION();
+    if (cn == 1) {
+        double bias = scaleBiasPairs[1];
+        for( int i = 0; i < len; i++ ) {
+            arr[i] += bias;
+        }
+    } else {
+        int k = 0;
+        len *= cn;
+        cn--;
+        for( int i = 0; i < len; i++ ) {
+            arr[i] += scaleBiasPairs[k*2 + 1];
+            k = (k + 1) & ((k >= cn) - 1);
+        }
+    }
 }
 
 CV_CPU_OPTIMIZATION_NAMESPACE_END
@@ -128,6 +173,35 @@ cvt_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size )
     }
 }
 
+template<typename _Ts, typename _Td, typename dummy> static inline void
+cvt_64f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size )
+{
+    sstep /= sizeof(src[0]);
+    dstep /= sizeof(dst[0]);
+
+    for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
+    {
+        int j = 0;
+#if CV_SIMD_64F
+        const int VECSZ = v_float64::nlanes*2;
+        for( ; j < size.width; j += VECSZ )
+        {
+            if( j > size.width - VECSZ )
+            {
+                if( j == 0 || src == (_Ts*)dst )
+                    break;
+                j = size.width - VECSZ;
+            }
+            v_float64 v0, v1;
+            vx_load_pair_as(src + j, v0, v1);
+            v_store_pair_as(dst + j, v0, v1);
+        }
+#endif
+        for( ; j < size.width; j++ )
+            dst[j] = saturate_cast<_Td>(src[j]);
+    }
+}
+
 // in order to reduce the code size, for (16f <-> ...) conversions
 // we add a conversion function without loop unrolling
 template<typename _Ts, typename _Td, typename _Twvec> static inline void
@@ -180,25 +254,102 @@ static void cvt##suffix(const uchar* src_, size_t sstep, const uchar*, size_t, \
     cvtfunc<_Ts, _Td, _Twvec>(src, sstep, dst, dstep, size); \
 }
 
+#define DEF_CVT2BOOL_FUNC(suffix, _Ts, shift) \
+static void cvt##suffix(const uchar* src_, size_t sstep, const uchar*, size_t, \
+                        uchar* dst, size_t dstep, Size size, void*) \
+{ \
+    CV_INSTRUMENT_REGION(); \
+    const _Ts* src = (const _Ts*)src_; \
+    sstep /= sizeof(src[0]); \
+    \
+    for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) { \
+        for ( int j = 0; j < size.width; j++ ) \
+            dst[j] = (src[j]<<shift) != 0; \
+    } \
+}
+
+#define DEF_CVTBOOL2_FUNC(suffix, _Td, scale) \
+static void cvt##suffix(const uchar* src, size_t sstep, const uchar*, size_t, \
+                        uchar* dst_, size_t dstep, Size size, void*) \
+{ \
+    CV_INSTRUMENT_REGION(); \
+    _Td* dst = (_Td*)dst_; \
+    dstep /= sizeof(dst[0]); \
+    \
+    for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) { \
+        for ( int j = 0; j < size.width; j++ ) \
+            dst[j] = (_Td)((src[j] != 0)*scale); \
+    } \
+}
+
+#define DEF_CVT_SCALAR_FUNC(suffix, _Ts, _Td) \
+static void cvt##suffix(const uchar* src_, size_t sstep, const uchar*, size_t, \
+                        uchar* dst_, size_t dstep, Size size, void*) \
+{ \
+    CV_INSTRUMENT_REGION(); \
+    const _Ts* src = (const _Ts*)src_; \
+    _Td* dst = (_Td*)dst_; \
+    sstep /= sizeof(src[0]); \
+    dstep /= sizeof(dst[0]); \
+    \
+    for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) { \
+        for ( int j = 0; j < size.width; j++ ) \
+            dst[j] = saturate_cast<_Td>(src[j]); \
+    } \
+}
+
+#define DEF_CVT_SCALAR_FUNC_S2U(suffix, _Ts, _Td, _Tw) \
+static void cvt##suffix(const uchar* src_, size_t sstep, const uchar*, size_t, \
+                        uchar* dst_, size_t dstep, Size size, void*) \
+{ \
+    CV_INSTRUMENT_REGION(); \
+    const _Ts* src = (const _Ts*)src_; \
+    _Td* dst = (_Td*)dst_; \
+    sstep /= sizeof(src[0]); \
+    dstep /= sizeof(dst[0]); \
+    \
+    for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) { \
+        for ( int j = 0; j < size.width; j++ ) \
+            dst[j] = saturate_cast<_Td>(std::max((_Tw)src[j], (_Tw)0)); \
+    } \
+}
+
 ////////////////////// 8u -> ... ////////////////////////
 
 DEF_CVT_FUNC(8u8s,  cvt_,  uchar, schar,    v_int16)
-DEF_CVT_FUNC(8u16u, cvt_,  uchar, ushort,   v_uint16)
 DEF_CVT_FUNC(8u16s, cvt_,  uchar, short,    v_int16)
 DEF_CVT_FUNC(8u32s, cvt_,  uchar, int,      v_int32)
 DEF_CVT_FUNC(8u32f, cvt_,  uchar, float,    v_float32)
 DEF_CVT_FUNC(8u64f, cvt_,  uchar, double,   v_int32)
+DEF_CVT_SCALAR_FUNC(8u64s, uchar, int64_t)
 DEF_CVT_FUNC(8u16f, cvt1_, uchar, float16_t, v_float32)
+DEF_CVT_FUNC(8u16bf, cvt1_, uchar, bfloat16_t, v_float32)
+DEF_CVT2BOOL_FUNC(8u8b, uchar, 0)
 
 ////////////////////// 8s -> ... ////////////////////////
 
 DEF_CVT_FUNC(8s8u,  cvt_,  schar, uchar,    v_int16)
 DEF_CVT_FUNC(8s16u, cvt_,  schar, ushort,   v_uint16)
 DEF_CVT_FUNC(8s16s, cvt_,  schar, short,    v_int16)
+DEF_CVT_FUNC(8s32u, cvt_,  schar, unsigned, v_uint32)
 DEF_CVT_FUNC(8s32s, cvt_,  schar, int,      v_int32)
 DEF_CVT_FUNC(8s32f, cvt_,  schar, float,    v_float32)
 DEF_CVT_FUNC(8s64f, cvt_,  schar, double,   v_int32)
+DEF_CVT_FUNC(8s64u, cvt_,  schar, uint64_t, v_uint32)
+DEF_CVT_FUNC(8s64s, cvt_,  schar, int64_t,  v_int32)
 DEF_CVT_FUNC(8s16f, cvt1_, schar, float16_t, v_float32)
+DEF_CVT_FUNC(8s16bf, cvt1_, schar, bfloat16_t, v_float32)
+
+////////////////////// 8b -> ... ////////////////////////
+
+DEF_CVTBOOL2_FUNC(8b8u,  uchar, 1)
+DEF_CVTBOOL2_FUNC(8b16s, short, 1)
+DEF_CVTBOOL2_FUNC(8b32s, int, 1)
+DEF_CVTBOOL2_FUNC(8b32f, float, 1)
+DEF_CVTBOOL2_FUNC(8b64f, double, 1)
+DEF_CVTBOOL2_FUNC(8b64s, int64_t, 1)
+DEF_CVTBOOL2_FUNC(8b16f, uint16_t, 0x3c00) // float16_t(1.0f)
+DEF_CVTBOOL2_FUNC(8b16bf, uint16_t, 0x3f80) // bfloat16_t(1.0f)
 
 ////////////////////// 16u -> ... ////////////////////////
 
@@ -208,17 +359,37 @@ DEF_CVT_FUNC(16u16s, cvt_, ushort, short,  v_int32)
 DEF_CVT_FUNC(16u32s, cvt_, ushort, int,    v_int32)
 DEF_CVT_FUNC(16u32f, cvt_, ushort, float,  v_float32)
 DEF_CVT_FUNC(16u64f, cvt_, ushort, double, v_int32)
+DEF_CVT_SCALAR_FUNC(16u64s, ushort, int64_t)
 DEF_CVT_FUNC(16u16f, cvt1_,ushort, float16_t, v_float32)
+DEF_CVT_FUNC(16u16bf, cvt1_, ushort, bfloat16_t, v_float32)
 
 ////////////////////// 16s -> ... ////////////////////////
 
 DEF_CVT_FUNC(16s8u,  cvt_, short, uchar,  v_int16)
 DEF_CVT_FUNC(16s8s,  cvt_, short, schar,  v_int16)
 DEF_CVT_FUNC(16s16u, cvt_, short, ushort, v_int32)
+DEF_CVT_FUNC(16s32u, cvt_, short, unsigned, v_uint32)
 DEF_CVT_FUNC(16s32s, cvt_, short, int,    v_int32)
 DEF_CVT_FUNC(16s32f, cvt_, short, float,  v_float32)
 DEF_CVT_FUNC(16s64f, cvt_, short, double, v_int32)
+DEF_CVT_FUNC(16s64u, cvt_, short, uint64_t, v_uint32)
+DEF_CVT_FUNC(16s64s, cvt_, short, int64_t, v_int32)
 DEF_CVT_FUNC(16s16f, cvt1_,short, float16_t, v_float32)
+DEF_CVT_FUNC(16s16bf, cvt1_, short, bfloat16_t, v_float32)
+DEF_CVT2BOOL_FUNC(16s8b, short, 0)
+
+////////////////////// 32u -> ... ////////////////////////
+
+DEF_CVT_FUNC(32u8u,  cvt_, unsigned, uchar,  v_uint32)
+DEF_CVT_FUNC(32u8s,  cvt_, unsigned, schar,  v_int32)
+DEF_CVT_FUNC(32u16u, cvt_, unsigned, ushort, v_uint32)
+DEF_CVT_FUNC(32u16s, cvt_, unsigned, short,  v_int32)
+DEF_CVT_SCALAR_FUNC(32u32s, unsigned, int)
+DEF_CVT_FUNC(32u32f, cvt_, unsigned, float,  v_float32)
+DEF_CVT_FUNC(32u64f, cvt_, unsigned, double, v_float32)
+DEF_CVT_SCALAR_FUNC(32u64s, unsigned, int64_t)
+DEF_CVT_FUNC(32u16f, cvt1_, unsigned, float16_t, v_float32)
+DEF_CVT_FUNC(32u16bf, cvt1_, int, bfloat16_t, v_float32)
 
 ////////////////////// 32s -> ... ////////////////////////
 
@@ -226,9 +397,14 @@ DEF_CVT_FUNC(32s8u,  cvt_, int, uchar,  v_int32)
 DEF_CVT_FUNC(32s8s,  cvt_, int, schar,  v_int32)
 DEF_CVT_FUNC(32s16u, cvt_, int, ushort, v_int32)
 DEF_CVT_FUNC(32s16s, cvt_, int, short,  v_int32)
+DEF_CVT_FUNC(32s32u, cvt_, int, unsigned, v_uint32)
 DEF_CVT_FUNC(32s32f, cvt_, int, float,  v_float32)
 DEF_CVT_FUNC(32s64f, cvt_, int, double, v_int32)
+DEF_CVT_FUNC(32s64u, cvt_, int, uint64_t, v_uint32)
+DEF_CVT_FUNC(32s64s, cvt_, int, int64_t, v_int32)
 DEF_CVT_FUNC(32s16f, cvt1_,int, float16_t, v_float32)
+DEF_CVT_FUNC(32s16bf, cvt1_, int, bfloat16_t, v_float32)
+DEF_CVT2BOOL_FUNC(32s8b, int, 0)
 
 ////////////////////// 32f -> ... ////////////////////////
 
@@ -236,9 +412,14 @@ DEF_CVT_FUNC(32f8u,  cvt_, float, uchar,  v_float32)
 DEF_CVT_FUNC(32f8s,  cvt_, float, schar,  v_float32)
 DEF_CVT_FUNC(32f16u, cvt_, float, ushort, v_float32)
 DEF_CVT_FUNC(32f16s, cvt_, float, short,  v_float32)
+DEF_CVT_FUNC(32f32u, cvt_, float, unsigned, v_float32)
 DEF_CVT_FUNC(32f32s, cvt_, float, int,    v_float32)
 DEF_CVT_FUNC(32f64f, cvt_, float, double, v_float32)
+DEF_CVT_FUNC(32f64u, cvt_64f, float, uint64_t, v_float64)
+DEF_CVT_FUNC(32f64s, cvt_64f, float, int64_t, v_float64)
 DEF_CVT_FUNC(32f16f, cvt1_,float, float16_t, v_float32)
+DEF_CVT_FUNC(32f16bf, cvt1_,float, bfloat16_t, v_float32)
+DEF_CVT2BOOL_FUNC(32f8b, int, 1)
 
 ////////////////////// 64f -> ... ////////////////////////
 
@@ -246,9 +427,14 @@ DEF_CVT_FUNC(64f8u,  cvt_, double, uchar,  v_int32)
 DEF_CVT_FUNC(64f8s,  cvt_, double, schar,  v_int32)
 DEF_CVT_FUNC(64f16u, cvt_, double, ushort, v_int32)
 DEF_CVT_FUNC(64f16s, cvt_, double, short,  v_int32)
+DEF_CVT_FUNC(64f32u, cvt_64f, double, unsigned, v_float32)
 DEF_CVT_FUNC(64f32s, cvt_, double, int,    v_int32)
 DEF_CVT_FUNC(64f32f, cvt_, double, float,  v_float32)
+DEF_CVT_FUNC(64f64u, cvt_64f, double, uint64_t, v_float64)
+DEF_CVT_FUNC(64f64s, cvt_64f, double, int64_t, v_float32)
 DEF_CVT_FUNC(64f16f, cvt1_,double, float16_t, v_float32)
+DEF_CVT_FUNC(64f16bf, cvt1_,double, bfloat16_t, v_float32)
+DEF_CVT2BOOL_FUNC(64f8b, int64_t, 1)
 
 ////////////////////// 16f -> ... ////////////////////////
 
@@ -256,9 +442,56 @@ DEF_CVT_FUNC(16f8u,  cvt_,  float16_t, uchar,  v_float32)
 DEF_CVT_FUNC(16f8s,  cvt_,  float16_t, schar,  v_float32)
 DEF_CVT_FUNC(16f16u, cvt1_, float16_t, ushort, v_float32)
 DEF_CVT_FUNC(16f16s, cvt1_, float16_t, short,  v_float32)
+DEF_CVT_FUNC(16f32u, cvt1_, float16_t, unsigned, v_float32)
 DEF_CVT_FUNC(16f32s, cvt1_, float16_t, int,    v_float32)
 DEF_CVT_FUNC(16f32f, cvt1_, float16_t, float,  v_float32)
 DEF_CVT_FUNC(16f64f, cvt1_, float16_t, double, v_float32)
+DEF_CVT_FUNC(16f64u, cvt1_, float16_t, uint64_t, v_float32)
+DEF_CVT_FUNC(16f64s, cvt1_, float16_t, int64_t, v_float32)
+DEF_CVT_FUNC(16f16bf, cvt1_, float16_t, bfloat16_t, v_float32)
+DEF_CVT2BOOL_FUNC(16f8b, short, 1)
+
+////////////////////// 16bf -> ... ////////////////////////
+
+DEF_CVT_FUNC(16bf8u,  cvt_,  bfloat16_t, uchar,  v_float32)
+DEF_CVT_FUNC(16bf8s,  cvt_,  bfloat16_t, schar,  v_float32)
+DEF_CVT_FUNC(16bf16u, cvt1_, bfloat16_t, ushort, v_float32)
+DEF_CVT_FUNC(16bf16s, cvt1_, bfloat16_t, short,  v_float32)
+DEF_CVT_FUNC(16bf32u, cvt1_, bfloat16_t, unsigned, v_float32)
+DEF_CVT_FUNC(16bf32s, cvt1_, bfloat16_t, int,    v_float32)
+DEF_CVT_FUNC(16bf32f, cvt1_, bfloat16_t, float,  v_float32)
+DEF_CVT_FUNC(16bf64f, cvt1_, bfloat16_t, double, v_float32)
+DEF_CVT_FUNC(16bf64u, cvt1_, bfloat16_t, uint64_t, v_float32)
+DEF_CVT_FUNC(16bf64s, cvt1_, bfloat16_t, int64_t, v_float32)
+DEF_CVT_FUNC(16bf16f, cvt1_, bfloat16_t, float16_t, v_float32)
+
+////////////////////// 64s -> ... ////////////////////////
+
+DEF_CVT_FUNC(64s8u,  cvt_, int64_t, uchar,  v_int32)
+DEF_CVT_FUNC(64s8s,  cvt_, int64_t, schar,  v_int32)
+DEF_CVT_FUNC(64s16u, cvt_, int64_t, ushort, v_int32)
+DEF_CVT_FUNC(64s16s, cvt_, int64_t, short,  v_int32)
+DEF_CVT_FUNC(64s32u, cvt_, int64_t, unsigned, v_uint32)
+DEF_CVT_FUNC(64s32s, cvt_, int64_t, int,    v_int32)
+DEF_CVT_FUNC(64s32f, cvt_64f, int64_t, float,  v_float32)
+DEF_CVT_FUNC(64s64f, cvt_64f, int64_t, double,  v_float64)
+DEF_CVT_FUNC(64s64u, cvt_, int64_t, uint64_t, v_uint64)
+DEF_CVT_FUNC(64s16f, cvt1_,int64_t, float16_t, v_float32)
+DEF_CVT_FUNC(64s16bf, cvt1_, int64_t, bfloat16_t, v_float32)
+DEF_CVT2BOOL_FUNC(64s8b, int64_t, 0)
+
+////////////////////// 64u -> ... ////////////////////////
+
+DEF_CVT_FUNC(64u8u,  cvt_, uint64_t, uchar,  v_int32)
+DEF_CVT_FUNC(64u8s,  cvt_, uint64_t, schar,  v_int32)
+DEF_CVT_FUNC(64u16u, cvt_, uint64_t, ushort, v_int32)
+DEF_CVT_FUNC(64u16s, cvt_, uint64_t, short,  v_int32)
+DEF_CVT_FUNC(64u32u, cvt_, uint64_t, unsigned, v_uint32)
+DEF_CVT_FUNC(64u32s, cvt_, uint64_t, int,   v_int32)
+DEF_CVT_FUNC(64u32f, cvt_64f, uint64_t, float,  v_float64)
+DEF_CVT_FUNC(64u64f, cvt_64f, uint64_t, double,  v_float64)
+DEF_CVT_FUNC(64u16f, cvt1_,uint64_t, float16_t, v_float32)
+DEF_CVT_FUNC(64u16bf, cvt1_, uint64_t, bfloat16_t, v_float32)
 
 ///////////// "conversion" w/o conversion ///////////////
 
@@ -274,147 +507,210 @@ static void cvt32s(const uchar* src, size_t sstep, const uchar*, size_t, uchar*
 static void cvt64s(const uchar* src, size_t sstep, const uchar*, size_t, uchar* dst, size_t dstep, Size size, void*)
 { CV_INSTRUMENT_REGION(); cvtCopy((const uchar*)src, sstep, (uchar*)dst, dstep, size, 8); }
 
-
-/* [TODO] Recover IPP calls
-#if defined(HAVE_IPP)
-#define DEF_CVT_FUNC_F(suffix, stype, dtype, ippFavor) \
-static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
-                         dtype* dst, size_t dstep, Size size, double*) \
-{ \
-    CV_IPP_RUN(src && dst, CV_INSTRUMENT_FUN_IPP(ippiConvert_##ippFavor, src, (int)sstep, dst, (int)dstep, ippiSize(size.width, size.height)) >= 0) \
-    cvt_(src, sstep, dst, dstep, size); \
-}
-
-#define DEF_CVT_FUNC_F2(suffix, stype, dtype, ippFavor) \
-static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
-                         dtype* dst, size_t dstep, Size size, double*) \
-{ \
-    CV_IPP_RUN(src && dst, CV_INSTRUMENT_FUN_IPP(ippiConvert_##ippFavor, src, (int)sstep, dst, (int)dstep, ippiSize(size.width, size.height), ippRndFinancial, 0) >= 0) \
-    cvt_(src, sstep, dst, dstep, size); \
-}
-#else
-#define DEF_CVT_FUNC_F(suffix, stype, dtype, ippFavor) \
-static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
-                         dtype* dst, size_t dstep, Size size, double*) \
-{ \
-    cvt_(src, sstep, dst, dstep, size); \
-}
-#define DEF_CVT_FUNC_F2 DEF_CVT_FUNC_F
-#endif
-
-#define DEF_CVT_FUNC(suffix, stype, dtype) \
-static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
-                         dtype* dst, size_t dstep, Size size, double*) \
-{ \
-    cvt_(src, sstep, dst, dstep, size); \
-}
-
-#define DEF_CPY_FUNC(suffix, stype) \
-static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
-                         stype* dst, size_t dstep, Size size, double*) \
-{ \
-    cpy_(src, sstep, dst, dstep, size); \
-}
-
-DEF_CPY_FUNC(8u,     uchar)
-DEF_CVT_FUNC_F(8s8u,   schar, uchar, 8s8u_C1Rs)
-DEF_CVT_FUNC_F(16u8u,  ushort, uchar, 16u8u_C1R)
-DEF_CVT_FUNC_F(16s8u,  short, uchar, 16s8u_C1R)
-DEF_CVT_FUNC_F(32s8u,  int, uchar, 32s8u_C1R)
-DEF_CVT_FUNC_F2(32f8u,  float, uchar, 32f8u_C1RSfs)
-DEF_CVT_FUNC(64f8u,  double, uchar)
-
-DEF_CVT_FUNC_F2(8u8s,   uchar, schar, 8u8s_C1RSfs)
-DEF_CVT_FUNC_F2(16u8s,  ushort, schar, 16u8s_C1RSfs)
-DEF_CVT_FUNC_F2(16s8s,  short, schar, 16s8s_C1RSfs)
-DEF_CVT_FUNC_F(32s8s,  int, schar, 32s8s_C1R)
-DEF_CVT_FUNC_F2(32f8s,  float, schar, 32f8s_C1RSfs)
-DEF_CVT_FUNC(64f8s,  double, schar)
-
-DEF_CVT_FUNC_F(8u16u,  uchar, ushort, 8u16u_C1R)
-DEF_CVT_FUNC_F(8s16u,  schar, ushort, 8s16u_C1Rs)
-DEF_CPY_FUNC(16u,    ushort)
-DEF_CVT_FUNC_F(16s16u, short, ushort, 16s16u_C1Rs)
-DEF_CVT_FUNC_F2(32s16u, int, ushort, 32s16u_C1RSfs)
-DEF_CVT_FUNC_F2(32f16u, float, ushort, 32f16u_C1RSfs)
-DEF_CVT_FUNC(64f16u, double, ushort)
-
-DEF_CVT_FUNC_F(8u16s,  uchar, short, 8u16s_C1R)
-DEF_CVT_FUNC_F(8s16s,  schar, short, 8s16s_C1R)
-DEF_CVT_FUNC_F2(16u16s, ushort, short, 16u16s_C1RSfs)
-DEF_CVT_FUNC_F2(32s16s, int, short, 32s16s_C1RSfs)
-DEF_CVT_FUNC(32f16s, float, short)
-DEF_CVT_FUNC(64f16s, double, short)
-
-DEF_CVT_FUNC_F(8u32s,  uchar, int, 8u32s_C1R)
-DEF_CVT_FUNC_F(8s32s,  schar, int, 8s32s_C1R)
-DEF_CVT_FUNC_F(16u32s, ushort, int, 16u32s_C1R)
-DEF_CVT_FUNC_F(16s32s, short, int, 16s32s_C1R)
-DEF_CPY_FUNC(32s,    int)
-DEF_CVT_FUNC_F2(32f32s, float, int, 32f32s_C1RSfs)
-DEF_CVT_FUNC(64f32s, double, int)
-
-DEF_CVT_FUNC_F(8u32f,  uchar, float, 8u32f_C1R)
-DEF_CVT_FUNC_F(8s32f,  schar, float, 8s32f_C1R)
-DEF_CVT_FUNC_F(16u32f, ushort, float, 16u32f_C1R)
-DEF_CVT_FUNC_F(16s32f, short, float, 16s32f_C1R)
-DEF_CVT_FUNC_F(32s32f, int, float, 32s32f_C1R)
-DEF_CVT_FUNC(64f32f, double, float)
-
-DEF_CVT_FUNC(8u64f,  uchar, double)
-DEF_CVT_FUNC(8s64f,  schar, double)
-DEF_CVT_FUNC(16u64f, ushort, double)
-DEF_CVT_FUNC(16s64f, short, double)
-DEF_CVT_FUNC(32s64f, int, double)
-DEF_CVT_FUNC(32f64f, float, double)
-DEF_CPY_FUNC(64s,    int64)
-*/
-
-BinaryFunc getConvertFunc(int sdepth, int ddepth)
+BinaryFunc getConvertFunc(int sdepth_, int ddepth_)
 {
-    static BinaryFunc cvtTab[][8] =
-    {
-        {
-            (cvt8u), (cvt8s8u), (cvt16u8u),
-            (cvt16s8u), (cvt32s8u), (cvt32f8u),
-            (cvt64f8u), (cvt16f8u)
-        },
-        {
-            (cvt8u8s), cvt8u, (cvt16u8s),
-            (cvt16s8s), (cvt32s8s), (cvt32f8s),
-            (cvt64f8s), (cvt16f8s)
-        },
-        {
-            (cvt8u16u), (cvt8s16u), cvt16u,
-            (cvt16s16u), (cvt32s16u), (cvt32f16u),
-            (cvt64f16u), (cvt16f16u)
-        },
-        {
-            (cvt8u16s), (cvt8s16s), (cvt16u16s),
-            cvt16u, (cvt32s16s), (cvt32f16s),
-            (cvt64f16s), (cvt16f16s)
-        },
-        {
-            (cvt8u32s), (cvt8s32s), (cvt16u32s),
-            (cvt16s32s), cvt32s, (cvt32f32s),
-            (cvt64f32s), (cvt16f32s)
-        },
-        {
-            (cvt8u32f), (cvt8s32f), (cvt16u32f),
-            (cvt16s32f), (cvt32s32f), cvt32s,
-            (cvt64f32f), (cvt16f32f)
-        },
-        {
-            (cvt8u64f), (cvt8s64f), (cvt16u64f),
-            (cvt16s64f), (cvt32s64f), (cvt32f64f),
-            (cvt64s), (cvt16f64f)
-        },
-        {
-            (cvt8u16f), (cvt8s16f), (cvt16u16f), (cvt16s16f),
-            (cvt32s16f), (cvt32f16f), (cvt64f16f), (cvt16u)
-        }
-    };
-    return cvtTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)];
+    int sdepth = CV_MAT_DEPTH(sdepth_);
+    int ddepth = CV_MAT_DEPTH(ddepth_);
+    BinaryFunc func =
+        ddepth == CV_8U ? (
+            sdepth == CV_8U ? cvt8u :
+            sdepth == CV_8S ? cvt8s8u :
+            sdepth == CV_16U ? cvt16u8u :
+            sdepth == CV_16S ? cvt16s8u :
+            sdepth == CV_32U ? cvt32u8u :
+            sdepth == CV_32S ? cvt32s8u :
+            sdepth == CV_32F ? cvt32f8u :
+            sdepth == CV_64F ? cvt64f8u :
+            sdepth == CV_16F ? cvt16f8u :
+            sdepth == CV_16BF ? cvt16bf8u :
+            sdepth == CV_Bool ? cvt8b8u :
+            sdepth == CV_64U ? cvt64u8u :
+            sdepth == CV_64S ? cvt64s8u :
+            0) :
+        ddepth == CV_8S ? (
+            sdepth == CV_8U ? cvt8u8s :
+            sdepth == CV_8S ? cvt8u :
+            sdepth == CV_16U ? cvt16u8s :
+            sdepth == CV_16S ? cvt16s8s :
+            sdepth == CV_32U ? cvt32u8s :
+            sdepth == CV_32S ? cvt32s8s :
+            sdepth == CV_32F ? cvt32f8s :
+            sdepth == CV_64F ? cvt64f8s :
+            sdepth == CV_16F ? cvt16f8s :
+            sdepth == CV_16BF ? cvt16bf8s :
+            sdepth == CV_Bool ? cvt8b8u :
+            sdepth == CV_64U ? cvt64u8s :
+            sdepth == CV_64S ? cvt64s8s :
+            0) :
+        ddepth == CV_16U ? (
+            sdepth == CV_8U ? cvt8u16s : // same as cvt8u16u
+            sdepth == CV_8S ? cvt8s16u :
+            sdepth == CV_16U ? cvt16u :
+            sdepth == CV_16S ? cvt16s16u :
+            sdepth == CV_32U ? cvt32u16u :
+            sdepth == CV_32S ? cvt32s16u :
+            sdepth == CV_32F ? cvt32f16u :
+            sdepth == CV_64F ? cvt64f16u :
+            sdepth == CV_16F ? cvt16f16u :
+            sdepth == CV_16BF ? cvt16bf16u :
+            sdepth == CV_Bool ? cvt8b16s :
+            sdepth == CV_64U ? cvt64u16u :
+            sdepth == CV_64S ? cvt64s16u :
+            0) :
+        ddepth == CV_16S ? (
+            sdepth == CV_8U ? cvt8u16s :
+            sdepth == CV_8S ? cvt8s16s :
+            sdepth == CV_16U ? cvt16u16s :
+            sdepth == CV_16S ? cvt16u :
+            sdepth == CV_32U ? cvt32u16s :
+            sdepth == CV_32S ? cvt32s16s :
+            sdepth == CV_32F ? cvt32f16s :
+            sdepth == CV_64F ? cvt64f16s :
+            sdepth == CV_16F ? cvt16f16s :
+            sdepth == CV_16BF ? cvt16bf16s :
+            sdepth == CV_Bool ? cvt8b16s :
+            sdepth == CV_64U ? cvt64u16s :
+            sdepth == CV_64S ? cvt64s16s :
+            0) :
+        ddepth == CV_32U ? (
+            sdepth == CV_8U ? cvt8u32s : // same as cvt8u32u
+            sdepth == CV_8S ? cvt8s32u :
+            sdepth == CV_16U ? cvt16u32s : // same as cvt16u32u
+            sdepth == CV_16S ? cvt16s32u :
+            sdepth == CV_32U ? cvt32s :
+            sdepth == CV_32S ? cvt32s32u :
+            sdepth == CV_32F ? cvt32f32u :
+            sdepth == CV_64F ? cvt64f32u :
+            sdepth == CV_16F ? cvt16f32u :
+            sdepth == CV_16BF ? cvt16bf32u :
+            sdepth == CV_Bool ? cvt8b32s :
+            sdepth == CV_64U ? cvt64u32u :
+            sdepth == CV_64S ? cvt64s32u :
+
+            0) :
+        ddepth == CV_32S ? (
+            sdepth == CV_8U ? cvt8u32s :
+            sdepth == CV_8S ? cvt8s32s :
+            sdepth == CV_16U ? cvt16u32s :
+            sdepth == CV_16S ? cvt16s32s :
+            sdepth == CV_32U ? cvt32u32s :
+            sdepth == CV_32S ? cvt32s :
+            sdepth == CV_32F ? cvt32f32s :
+            sdepth == CV_64F ? cvt64f32s :
+            sdepth == CV_16F ? cvt16f32s :
+            sdepth == CV_16BF ? cvt16bf32s :
+            sdepth == CV_Bool ? cvt8b32s :
+            sdepth == CV_64U ? cvt64u32s :
+            sdepth == CV_64S ? cvt64s32s :
+            0) :
+        ddepth == CV_32F ? (
+            sdepth == CV_8U ? cvt8u32f :
+            sdepth == CV_8S ? cvt8s32f :
+            sdepth == CV_16U ? cvt16u32f :
+            sdepth == CV_16S ? cvt16s32f :
+            sdepth == CV_32U ? cvt32u32f :
+            sdepth == CV_32S ? cvt32s32f :
+            sdepth == CV_32F ? cvt32s :
+            sdepth == CV_64F ? cvt64f32f :
+            sdepth == CV_16F ? cvt16f32f :
+            sdepth == CV_16BF ? cvt16bf32f :
+            sdepth == CV_Bool ? cvt8b32f :
+            sdepth == CV_64U ? cvt64u32f :
+            sdepth == CV_64S ? cvt64s32f :
+            0) :
+        ddepth == CV_64F ? (
+            sdepth == CV_8U ? cvt8u64f :
+            sdepth == CV_8S ? cvt8s64f :
+            sdepth == CV_16U ? cvt16u64f :
+            sdepth == CV_16S ? cvt16s64f :
+            sdepth == CV_32U ? cvt32u64f :
+            sdepth == CV_32S ? cvt32s64f :
+            sdepth == CV_32F ? cvt32f64f :
+            sdepth == CV_64F ? cvt64s :
+            sdepth == CV_16F ? cvt16f64f :
+            sdepth == CV_16BF ? cvt16bf64f :
+            sdepth == CV_Bool ? cvt8b64f :
+            sdepth == CV_64U ? cvt64u64f :
+            sdepth == CV_64S ? cvt64s64f :
+            0) :
+        ddepth == CV_16F ? (
+            sdepth == CV_8U ? cvt8u16f :
+            sdepth == CV_8S ? cvt8s16f :
+            sdepth == CV_16U ? cvt16u16f :
+            sdepth == CV_16S ? cvt16s16f :
+            sdepth == CV_32U ? cvt32u16f :
+            sdepth == CV_32S ? cvt32s16f :
+            sdepth == CV_32F ? cvt32f16f :
+            sdepth == CV_64F ? cvt64f16f :
+            sdepth == CV_16F ? cvt16u :
+            sdepth == CV_16BF ? cvt16bf16f :
+            sdepth == CV_Bool ? cvt8b16f :
+            sdepth == CV_64U ? cvt64u16f :
+            sdepth == CV_64S ? cvt64s16f :
+            0) :
+        ddepth == CV_16BF ? (
+            sdepth == CV_8U ? cvt8u16bf :
+            sdepth == CV_8S ? cvt8s16bf :
+            sdepth == CV_16U ? cvt16u16bf :
+            sdepth == CV_16S ? cvt16s16bf :
+            sdepth == CV_32U ? cvt32u16bf :
+            sdepth == CV_32S ? cvt32s16bf :
+            sdepth == CV_32F ? cvt32f16bf :
+            sdepth == CV_64F ? cvt64f16bf :
+            sdepth == CV_16F ? cvt16f16bf :
+            sdepth == CV_16BF ? cvt16u :
+            sdepth == CV_Bool ? cvt8b16bf :
+            sdepth == CV_64U ? cvt64u16bf :
+            sdepth == CV_64S ? cvt64s16bf :
+            0) :
+        ddepth == CV_Bool ? (
+            sdepth == CV_8U ? cvt8u8b :
+            sdepth == CV_8S ? cvt8u8b :
+            sdepth == CV_16U ? cvt16s8b :
+            sdepth == CV_16S ? cvt16s8b :
+            sdepth == CV_32U ? cvt32s8b :
+            sdepth == CV_32S ? cvt32s8b :
+            sdepth == CV_32F ? cvt32f8b :
+            sdepth == CV_64F ? cvt64f8b :
+            sdepth == CV_16F ? cvt16f8b :
+            sdepth == CV_16BF ? cvt16f8b : // same as cvt16f8b
+            sdepth == CV_Bool ? cvt8u :
+            sdepth == CV_64U ? cvt64s8b :
+            sdepth == CV_64S ? cvt64s8b :
+            0) :
+        ddepth == CV_64U ? (
+            sdepth == CV_8U ? cvt8u64s : // same as cvt8u64u
+            sdepth == CV_8S ? cvt8s64u :
+            sdepth == CV_16U ? cvt16u64s : // same as cvt16u64u
+            sdepth == CV_16S ? cvt16s64u :
+            sdepth == CV_32U ? cvt32u64s : // same as cvt32u64u
+            sdepth == CV_32S ? cvt32s64u :
+            sdepth == CV_32F ? cvt32f64u :
+            sdepth == CV_64F ? cvt64f64u :
+            sdepth == CV_16F ? cvt16f64u :
+            sdepth == CV_16BF ? cvt16bf64u :
+            sdepth == CV_Bool ? cvt8b64s :
+            sdepth == CV_64U ? cvt64s :
+            sdepth == CV_64S ? cvt64s64u :
+            0) :
+        ddepth == CV_64S ? (
+            sdepth == CV_8U ? cvt8u64s :
+            sdepth == CV_8S ? cvt8s64s :
+            sdepth == CV_16U ? cvt16u64s :
+            sdepth == CV_16S ? cvt16s64s :
+            sdepth == CV_32U ? cvt32u64s :
+            sdepth == CV_32S ? cvt32s64s :
+            sdepth == CV_32F ? cvt32f64s :
+            sdepth == CV_64F ? cvt64f64s :
+            sdepth == CV_16F ? cvt16f64s :
+            sdepth == CV_16BF ? cvt16bf64s :
+            sdepth == CV_Bool ? cvt8b64s :
+            sdepth == CV_64U ? cvt64s :
+            sdepth == CV_64S ? cvt64s :
+            0) :
+        0;
+    CV_Assert(func != 0);
+    return func;
 }
 
 CV_CPU_OPTIMIZATION_NAMESPACE_END
diff --git a/modules/core/src/convert_scale.simd.hpp b/modules/core/src/convert_scale.simd.hpp
index 2c6d55462b..f1ee7635e7 100644
--- a/modules/core/src/convert_scale.simd.hpp
+++ b/modules/core/src/convert_scale.simd.hpp
@@ -53,38 +53,18 @@ cvtabs_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
     }
 }
 
-// variant for conversions 16f <-> ... w/o unrolling
-template<typename _Ts, typename _Td> inline void
-cvtabs1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
-             Size size, float a, float b )
+static void
+cvtabs_32f( const bool* src_, size_t sstep,
+            uchar* dst, size_t dstep,
+            Size size, float a, float b )
 {
-#if CV_SIMD
-    v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
-    const int VECSZ = v_float32::nlanes*2;
-#endif
-    sstep /= sizeof(src[0]);
-    dstep /= sizeof(dst[0]);
-
+    const uchar* src = (const uchar*)src_;
+    uchar v0 = saturate_cast<uchar>(std::abs(b));
+    uchar v1 = saturate_cast<uchar>(std::abs(a + b));
     for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
     {
-        int j = 0;
-#if CV_SIMD
-        for( ; j < size.width; j += VECSZ )
-        {
-            if( j > size.width - VECSZ )
-            {
-                if( j == 0 || src == (_Ts*)dst )
-                    break;
-                j = size.width - VECSZ;
-            }
-            v_float32 v0;
-            vx_load_as(src + j, v0);
-            v0 = v_fma(v0, va, vb);
-            v_store_as(dst + j, v_abs(v0));
-        }
-#endif
-        for( ; j < size.width; j++ )
-            dst[j] = saturate_cast<_Td>(src[j]*a + b);
+        for (int j = 0; j < size.width; j++)
+            dst[j] = src[j] != 0 ? v1 : v0;
     }
 }
 
@@ -217,145 +197,454 @@ static void cvtScale##suffix( const uchar* src_, size_t sstep, const uchar*, siz
     cvt(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \
 }
 
+#define DEF_CVT_SCALE2BOOL_FUNC(suffix, stype, wtype) \
+static void cvtScale##suffix( const uchar* src_, size_t sstep, const uchar*, size_t, \
+                              uchar* dst, size_t dstep, Size size, void* scale_) \
+{ \
+    const stype* src = (const stype*)src_; \
+    const double* scale = (const double*)scale_; \
+    wtype a = (wtype)scale[0], b = (wtype)scale[1]; \
+    sstep /= sizeof(src[0]); \
+    for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) \
+        for (int j = 0; j < size.width; j++) \
+            dst[j] = (bool)((wtype)src[j]*a + b != 0); \
+}
+
+#define DEF_CVT_SCALEBOOL2_FUNC(suffix, dtype, wtype) \
+static void cvtScale##suffix( const uchar* src, size_t sstep, const uchar*, size_t, \
+                              uchar* dst_, size_t dstep, Size size, void* scale_) \
+{ \
+    dtype* dst = (dtype*)dst_; \
+    const double* scale = (const double*)scale_; \
+    wtype a = (wtype)scale[0], b = (wtype)scale[1]; \
+    dtype v0 = saturate_cast<dtype>(b), v1 = saturate_cast<dtype>(a + b); \
+    dstep /= sizeof(dst[0]); \
+    for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) \
+        for (int j = 0; j < size.width; j++) \
+            dst[j] = src[j] != 0 ? v1 : v0; \
+}
+
 DEF_CVT_SCALE_ABS_FUNC(8u,    cvtabs_32f, uchar,  uchar, float)
 DEF_CVT_SCALE_ABS_FUNC(8s8u,  cvtabs_32f, schar,  uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(8b8u,  cvtabs_32f, bool,  uchar, float)
 DEF_CVT_SCALE_ABS_FUNC(16u8u, cvtabs_32f, ushort, uchar, float)
 DEF_CVT_SCALE_ABS_FUNC(16s8u, cvtabs_32f, short,  uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(32u8u, cvtabs_32f, unsigned, uchar, float)
 DEF_CVT_SCALE_ABS_FUNC(32s8u, cvtabs_32f, int,    uchar, float)
 DEF_CVT_SCALE_ABS_FUNC(32f8u, cvtabs_32f, float,  uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(64u8u, cvtabs_32f, uint64_t, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(64s8u, cvtabs_32f, int64_t, uchar, float)
 DEF_CVT_SCALE_ABS_FUNC(64f8u, cvtabs_32f, double, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(16f8u, cvtabs_32f, float16_t, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(16bf8u, cvtabs_32f, bfloat16_t, uchar, float)
 
 DEF_CVT_SCALE_FUNC(8u,     cvt_32f, uchar,  uchar, float)
 DEF_CVT_SCALE_FUNC(8s8u,   cvt_32f, schar,  uchar, float)
 DEF_CVT_SCALE_FUNC(16u8u,  cvt_32f, ushort, uchar, float)
 DEF_CVT_SCALE_FUNC(16s8u,  cvt_32f, short,  uchar, float)
+DEF_CVT_SCALE_FUNC(32u8u,  cvt_32f, unsigned, uchar, float)
 DEF_CVT_SCALE_FUNC(32s8u,  cvt_32f, int,    uchar, float)
 DEF_CVT_SCALE_FUNC(32f8u,  cvt_32f, float,  uchar, float)
 DEF_CVT_SCALE_FUNC(64f8u,  cvt_32f, double, uchar, float)
+DEF_CVT_SCALE_FUNC(64u8u,  cvt_32f, uint64_t, uchar, float)
+DEF_CVT_SCALE_FUNC(64s8u,  cvt_32f, int64_t, uchar, float)
 DEF_CVT_SCALE_FUNC(16f8u,  cvt_32f, float16_t, uchar, float)
+DEF_CVT_SCALE_FUNC(16bf8u, cvt_32f, bfloat16_t, uchar, float)
 
 DEF_CVT_SCALE_FUNC(8u8s,   cvt_32f, uchar,  schar, float)
 DEF_CVT_SCALE_FUNC(8s,     cvt_32f, schar,  schar, float)
 DEF_CVT_SCALE_FUNC(16u8s,  cvt_32f, ushort, schar, float)
 DEF_CVT_SCALE_FUNC(16s8s,  cvt_32f, short,  schar, float)
+DEF_CVT_SCALE_FUNC(32u8s,  cvt_32f, unsigned, schar, float)
 DEF_CVT_SCALE_FUNC(32s8s,  cvt_32f, int,    schar, float)
 DEF_CVT_SCALE_FUNC(32f8s,  cvt_32f, float,  schar, float)
 DEF_CVT_SCALE_FUNC(64f8s,  cvt_32f, double, schar, float)
+DEF_CVT_SCALE_FUNC(64u8s,  cvt_32f, uint64_t, schar, float)
+DEF_CVT_SCALE_FUNC(64s8s,  cvt_32f, int64_t, schar, float)
 DEF_CVT_SCALE_FUNC(16f8s,  cvt_32f, float16_t, schar, float)
+DEF_CVT_SCALE_FUNC(16bf8s, cvt_32f, bfloat16_t, schar, float)
+
+DEF_CVT_SCALE2BOOL_FUNC(8u8b, uchar, float)
+DEF_CVT_SCALE2BOOL_FUNC(8s8b, schar, float)
+DEF_CVT_SCALE2BOOL_FUNC(16u8b, ushort, float)
+DEF_CVT_SCALE2BOOL_FUNC(16s8b, short, float)
+DEF_CVT_SCALE2BOOL_FUNC(32u8b, unsigned, float)
+DEF_CVT_SCALE2BOOL_FUNC(32s8b, int, float)
+DEF_CVT_SCALE2BOOL_FUNC(32f8b, float, float)
+DEF_CVT_SCALE2BOOL_FUNC(64f8b, double, float)
+DEF_CVT_SCALE2BOOL_FUNC(64u8b, uint64_t, float)
+DEF_CVT_SCALE2BOOL_FUNC(64s8b, int64_t, float)
+DEF_CVT_SCALE2BOOL_FUNC(16f8b, float16_t, float)
+DEF_CVT_SCALE2BOOL_FUNC(16bf8b, bfloat16_t, float)
 
 DEF_CVT_SCALE_FUNC(8u16u,  cvt_32f, uchar,  ushort, float)
 DEF_CVT_SCALE_FUNC(8s16u,  cvt_32f, schar,  ushort, float)
 DEF_CVT_SCALE_FUNC(16u,    cvt_32f, ushort, ushort, float)
 DEF_CVT_SCALE_FUNC(16s16u, cvt_32f, short,  ushort, float)
+DEF_CVT_SCALE_FUNC(32u16u, cvt_32f, unsigned, ushort, float)
 DEF_CVT_SCALE_FUNC(32s16u, cvt_32f, int,    ushort, float)
 DEF_CVT_SCALE_FUNC(32f16u, cvt_32f, float,  ushort, float)
 DEF_CVT_SCALE_FUNC(64f16u, cvt_32f, double, ushort, float)
+DEF_CVT_SCALE_FUNC(64u16u, cvt_32f, uint64_t, ushort, float)
+DEF_CVT_SCALE_FUNC(64s16u, cvt_32f, int64_t, ushort, float)
 DEF_CVT_SCALE_FUNC(16f16u, cvt1_32f, float16_t, ushort, float)
+DEF_CVT_SCALE_FUNC(16bf16u, cvt1_32f, bfloat16_t, ushort, float)
 
 DEF_CVT_SCALE_FUNC(8u16s,  cvt_32f, uchar,  short, float)
 DEF_CVT_SCALE_FUNC(8s16s,  cvt_32f, schar,  short, float)
 DEF_CVT_SCALE_FUNC(16u16s, cvt_32f, ushort, short, float)
 DEF_CVT_SCALE_FUNC(16s,    cvt_32f, short,  short, float)
+DEF_CVT_SCALE_FUNC(32u16s, cvt_32f, unsigned, short, float)
 DEF_CVT_SCALE_FUNC(32s16s, cvt_32f, int,    short, float)
 DEF_CVT_SCALE_FUNC(32f16s, cvt_32f, float,  short, float)
 DEF_CVT_SCALE_FUNC(64f16s, cvt_32f, double, short, float)
+DEF_CVT_SCALE_FUNC(64u16s, cvt_32f, uint64_t, short, float)
+DEF_CVT_SCALE_FUNC(64s16s, cvt_32f, int64_t, short, float)
 DEF_CVT_SCALE_FUNC(16f16s, cvt1_32f, float16_t, short, float)
+DEF_CVT_SCALE_FUNC(16bf16s, cvt1_32f, bfloat16_t, short, float)
+
+DEF_CVT_SCALE_FUNC(8u32u,  cvt_32f, uchar,  unsigned, float)
+DEF_CVT_SCALE_FUNC(8s32u,  cvt_32f, schar,  unsigned, float)
+DEF_CVT_SCALE_FUNC(16u32u, cvt_32f, ushort, unsigned, float)
+DEF_CVT_SCALE_FUNC(16s32u, cvt_32f, short,  unsigned, float)
+DEF_CVT_SCALE_FUNC(32u, cvt_32f, unsigned, unsigned, float)
+DEF_CVT_SCALE_FUNC(32s32u, cvt_64f, int,    unsigned, double)
+DEF_CVT_SCALE_FUNC(32f32u, cvt_32f, float,  unsigned, float)
+DEF_CVT_SCALE_FUNC(64f32u, cvt_64f, double, unsigned, double)
+DEF_CVT_SCALE_FUNC(64u32u, cvt_32f, uint64_t, unsigned, float)
+DEF_CVT_SCALE_FUNC(64s32u, cvt_32f, int64_t, unsigned, float)
+DEF_CVT_SCALE_FUNC(16f32u, cvt1_32f, float16_t, unsigned, float)
+DEF_CVT_SCALE_FUNC(16bf32u, cvt1_32f, bfloat16_t, unsigned, float)
 
 DEF_CVT_SCALE_FUNC(8u32s,  cvt_32f, uchar,  int, float)
 DEF_CVT_SCALE_FUNC(8s32s,  cvt_32f, schar,  int, float)
 DEF_CVT_SCALE_FUNC(16u32s, cvt_32f, ushort, int, float)
 DEF_CVT_SCALE_FUNC(16s32s, cvt_32f, short,  int, float)
+DEF_CVT_SCALE_FUNC(32u32s, cvt_32f, unsigned, int, float)
 DEF_CVT_SCALE_FUNC(32s,    cvt_64f, int,    int, double)
 DEF_CVT_SCALE_FUNC(32f32s, cvt_32f, float,  int, float)
 DEF_CVT_SCALE_FUNC(64f32s, cvt_64f, double, int, double)
+DEF_CVT_SCALE_FUNC(64u32s, cvt_32f, uint64_t, int, float)
+DEF_CVT_SCALE_FUNC(64s32s, cvt_32f, int64_t, int, float)
 DEF_CVT_SCALE_FUNC(16f32s, cvt1_32f, float16_t, int, float)
+DEF_CVT_SCALE_FUNC(16bf32s, cvt1_32f, bfloat16_t, int, float)
 
 DEF_CVT_SCALE_FUNC(8u32f,  cvt_32f, uchar,  float, float)
 DEF_CVT_SCALE_FUNC(8s32f,  cvt_32f, schar,  float, float)
 DEF_CVT_SCALE_FUNC(16u32f, cvt_32f, ushort, float, float)
 DEF_CVT_SCALE_FUNC(16s32f, cvt_32f, short,  float, float)
+DEF_CVT_SCALE_FUNC(32u32f, cvt_32f, unsigned, float, float)
 DEF_CVT_SCALE_FUNC(32s32f, cvt_32f, int,    float, float)
 DEF_CVT_SCALE_FUNC(32f,    cvt_32f, float,  float, float)
 DEF_CVT_SCALE_FUNC(64f32f, cvt_64f, double, float, double)
+DEF_CVT_SCALE_FUNC(64u32f, cvt_32f, uint64_t, float, float)
+DEF_CVT_SCALE_FUNC(64s32f, cvt_32f, int64_t, float, float)
 DEF_CVT_SCALE_FUNC(16f32f, cvt1_32f, float16_t, float, float)
+DEF_CVT_SCALE_FUNC(16bf32f, cvt1_32f, bfloat16_t, float, float)
 
 DEF_CVT_SCALE_FUNC(8u64f,  cvt_64f, uchar,  double, double)
 DEF_CVT_SCALE_FUNC(8s64f,  cvt_64f, schar,  double, double)
 DEF_CVT_SCALE_FUNC(16u64f, cvt_64f, ushort, double, double)
 DEF_CVT_SCALE_FUNC(16s64f, cvt_64f, short,  double, double)
+DEF_CVT_SCALE_FUNC(32u64f, cvt_64f, unsigned, double, double)
 DEF_CVT_SCALE_FUNC(32s64f, cvt_64f, int,    double, double)
 DEF_CVT_SCALE_FUNC(32f64f, cvt_64f, float,  double, double)
 DEF_CVT_SCALE_FUNC(64f,    cvt_64f, double, double, double)
+DEF_CVT_SCALE_FUNC(64u64f, cvt_64f, uint64_t, double, double)
+DEF_CVT_SCALE_FUNC(64s64f, cvt_64f, int64_t, double, double)
 DEF_CVT_SCALE_FUNC(16f64f, cvt_64f, float16_t, double, double)
+DEF_CVT_SCALE_FUNC(16bf64f, cvt_64f, bfloat16_t, double, double)
+
+DEF_CVT_SCALE_FUNC(8u64u,  cvt_64f, uchar,  uint64_t, double)
+DEF_CVT_SCALE_FUNC(8s64u,  cvt_64f, schar,  uint64_t, double)
+DEF_CVT_SCALE_FUNC(16u64u, cvt_64f, ushort, uint64_t, double)
+DEF_CVT_SCALE_FUNC(16s64u, cvt_64f, short,  uint64_t, double)
+DEF_CVT_SCALE_FUNC(32u64u, cvt_64f, unsigned, uint64_t, double)
+DEF_CVT_SCALE_FUNC(32s64u, cvt_64f, int,    uint64_t, double)
+DEF_CVT_SCALE_FUNC(32f64u, cvt_64f, float,  uint64_t, double)
+DEF_CVT_SCALE_FUNC(64f64u, cvt_64f, double, uint64_t, double)
+DEF_CVT_SCALE_FUNC(64u, cvt_64f, uint64_t, uint64_t, double)
+DEF_CVT_SCALE_FUNC(64s64u, cvt_64f, int64_t, uint64_t, double)
+DEF_CVT_SCALE_FUNC(16f64u, cvt_64f, float16_t, uint64_t, double)
+DEF_CVT_SCALE_FUNC(16bf64u, cvt_64f, bfloat16_t, uint64_t, double)
+
+DEF_CVT_SCALE_FUNC(8u64s,  cvt_64f, uchar,  int64_t, double)
+DEF_CVT_SCALE_FUNC(8s64s,  cvt_64f, schar,  int64_t, double)
+DEF_CVT_SCALE_FUNC(16u64s, cvt_64f, ushort, int64_t, double)
+DEF_CVT_SCALE_FUNC(16s64s, cvt_64f, short,  int64_t, double)
+DEF_CVT_SCALE_FUNC(32u64s, cvt_64f, unsigned, int64_t, double)
+DEF_CVT_SCALE_FUNC(32s64s, cvt_64f, int,    int64_t, double)
+DEF_CVT_SCALE_FUNC(32f64s, cvt_64f, float,  int64_t, double)
+DEF_CVT_SCALE_FUNC(64f64s, cvt_64f, double, int64_t, double)
+DEF_CVT_SCALE_FUNC(64u64s, cvt_64f, uint64_t, int64_t, double)
+DEF_CVT_SCALE_FUNC(64s, cvt_64f, int64_t, int64_t, double)
+DEF_CVT_SCALE_FUNC(16f64s, cvt_64f, float16_t, int64_t, double)
+DEF_CVT_SCALE_FUNC(16bf64s, cvt_64f, bfloat16_t, int64_t, double)
 
 DEF_CVT_SCALE_FUNC(8u16f,  cvt1_32f, uchar,  float16_t, float)
 DEF_CVT_SCALE_FUNC(8s16f,  cvt1_32f, schar,  float16_t, float)
 DEF_CVT_SCALE_FUNC(16u16f, cvt1_32f, ushort, float16_t, float)
 DEF_CVT_SCALE_FUNC(16s16f, cvt1_32f, short,  float16_t, float)
+DEF_CVT_SCALE_FUNC(32u16f, cvt1_32f, unsigned, float16_t, float)
 DEF_CVT_SCALE_FUNC(32s16f, cvt1_32f, int,    float16_t, float)
 DEF_CVT_SCALE_FUNC(32f16f, cvt1_32f, float,  float16_t, float)
-DEF_CVT_SCALE_FUNC(64f16f, cvt_64f,  double, float16_t, double)
+DEF_CVT_SCALE_FUNC(64f16f, cvt1_32f, double, float16_t, float)
+DEF_CVT_SCALE_FUNC(64u16f, cvt1_32f, uint64_t, float16_t, float)
+DEF_CVT_SCALE_FUNC(64s16f, cvt1_32f, int64_t, float16_t, float)
 DEF_CVT_SCALE_FUNC(16f,    cvt1_32f, float16_t, float16_t, float)
+DEF_CVT_SCALE_FUNC(16bf16f, cvt1_32f, bfloat16_t, float16_t, float)
+
+DEF_CVT_SCALE_FUNC(8u16bf,  cvt1_32f, uchar,  bfloat16_t, float)
+DEF_CVT_SCALE_FUNC(8s16bf,  cvt1_32f, schar,  bfloat16_t, float)
+DEF_CVT_SCALE_FUNC(16u16bf, cvt1_32f, ushort, bfloat16_t, float)
+DEF_CVT_SCALE_FUNC(16s16bf, cvt1_32f, short,  bfloat16_t, float)
+DEF_CVT_SCALE_FUNC(32u16bf, cvt1_32f, unsigned, bfloat16_t, float)
+DEF_CVT_SCALE_FUNC(32s16bf, cvt1_32f, int,    bfloat16_t, float)
+DEF_CVT_SCALE_FUNC(32f16bf, cvt1_32f, float,  bfloat16_t, float)
+DEF_CVT_SCALE_FUNC(64f16bf, cvt1_32f, double, bfloat16_t, float)
+DEF_CVT_SCALE_FUNC(64u16bf, cvt1_32f, uint64_t, bfloat16_t, float)
+DEF_CVT_SCALE_FUNC(64s16bf, cvt1_32f, int64_t, bfloat16_t, float)
+DEF_CVT_SCALE_FUNC(16f16bf, cvt1_32f, float16_t, bfloat16_t, float)
+DEF_CVT_SCALE_FUNC(16bf, cvt1_32f, bfloat16_t, bfloat16_t, float)
+
+DEF_CVT_SCALEBOOL2_FUNC(8b8u, uchar, float)
+DEF_CVT_SCALEBOOL2_FUNC(8b8s, schar, float)
+DEF_CVT_SCALEBOOL2_FUNC(8b, bool, float)
+DEF_CVT_SCALEBOOL2_FUNC(8b16u, ushort, float)
+DEF_CVT_SCALEBOOL2_FUNC(8b16s, short, float)
+DEF_CVT_SCALEBOOL2_FUNC(8b32u, unsigned, float)
+DEF_CVT_SCALEBOOL2_FUNC(8b32s, int, float)
+DEF_CVT_SCALEBOOL2_FUNC(8b32f, float, float)
+DEF_CVT_SCALEBOOL2_FUNC(8b64u, uint64_t, double)
+DEF_CVT_SCALEBOOL2_FUNC(8b64s, int64_t, double)
+DEF_CVT_SCALEBOOL2_FUNC(8b64f, double, double)
+DEF_CVT_SCALEBOOL2_FUNC(8b16f, float16_t, float)
+DEF_CVT_SCALEBOOL2_FUNC(8b16bf, bfloat16_t, float)
 
 BinaryFunc getCvtScaleAbsFunc(int depth)
 {
-    static BinaryFunc cvtScaleAbsTab[] =
-    {
-        (BinaryFunc)cvtScaleAbs8u, (BinaryFunc)cvtScaleAbs8s8u, (BinaryFunc)cvtScaleAbs16u8u,
-        (BinaryFunc)cvtScaleAbs16s8u, (BinaryFunc)cvtScaleAbs32s8u, (BinaryFunc)cvtScaleAbs32f8u,
-        (BinaryFunc)cvtScaleAbs64f8u, 0
-    };
-
-    return cvtScaleAbsTab[depth];
+    BinaryFunc func =
+        depth == CV_8U ? (BinaryFunc)cvtScaleAbs8u :
+        depth == CV_8S ? (BinaryFunc)cvtScaleAbs8s8u :
+        depth == CV_Bool ? (BinaryFunc)cvtScaleAbs8b8u :
+        depth == CV_16U ? (BinaryFunc)cvtScaleAbs16u8u :
+        depth == CV_16S ? (BinaryFunc)cvtScaleAbs16s8u :
+        depth == CV_16F ? (BinaryFunc)cvtScaleAbs16f8u :
+        depth == CV_16BF ? (BinaryFunc)cvtScaleAbs16bf8u :
+        depth == CV_32U ? (BinaryFunc)cvtScaleAbs32u8u :
+        depth == CV_32S ? (BinaryFunc)cvtScaleAbs32s8u :
+        depth == CV_32F ? (BinaryFunc)cvtScaleAbs32f8u :
+        depth == CV_64U ? (BinaryFunc)cvtScaleAbs64u8u :
+        depth == CV_64S ? (BinaryFunc)cvtScaleAbs64s8u :
+        depth == CV_64F ? (BinaryFunc)cvtScaleAbs64f8u : 0;
+    CV_Assert(func != 0);
+    return func;
 }
 
-BinaryFunc getConvertScaleFunc(int sdepth, int ddepth)
+BinaryFunc getConvertScaleFunc(int sdepth_, int ddepth_)
 {
-    static BinaryFunc cvtScaleTab[][8] =
-    {
-        {
-            (BinaryFunc)GET_OPTIMIZED(cvtScale8u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8u),
-            (BinaryFunc)GET_OPTIMIZED(cvtScale16s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8u),
-            (BinaryFunc)cvtScale64f8u, (BinaryFunc)cvtScale16f8u
-        },
-        {
-            (BinaryFunc)GET_OPTIMIZED(cvtScale8u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8s),
-            (BinaryFunc)GET_OPTIMIZED(cvtScale16s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8s),
-            (BinaryFunc)cvtScale64f8s, (BinaryFunc)cvtScale16f8s
-        },
-        {
-            (BinaryFunc)GET_OPTIMIZED(cvtScale8u16u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u),
-            (BinaryFunc)GET_OPTIMIZED(cvtScale16s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16u),
-            (BinaryFunc)cvtScale64f16u, (BinaryFunc)cvtScale16f16u
-        },
-        {
-            (BinaryFunc)GET_OPTIMIZED(cvtScale8u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u16s),
-            (BinaryFunc)GET_OPTIMIZED(cvtScale16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16s),
-            (BinaryFunc)cvtScale64f16s, (BinaryFunc)cvtScale16f16s
-        },
-        {
-            (BinaryFunc)GET_OPTIMIZED(cvtScale8u32s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32s),
-            (BinaryFunc)GET_OPTIMIZED(cvtScale16s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f32s),
-            (BinaryFunc)cvtScale64f32s, (BinaryFunc)cvtScale16f32s
-        },
-        {
-            (BinaryFunc)GET_OPTIMIZED(cvtScale8u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32f),
-            (BinaryFunc)GET_OPTIMIZED(cvtScale16s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32f),
-            (BinaryFunc)cvtScale64f32f, (BinaryFunc)cvtScale16f32f
-        },
-        {
-            (BinaryFunc)cvtScale8u64f, (BinaryFunc)cvtScale8s64f, (BinaryFunc)cvtScale16u64f,
-            (BinaryFunc)cvtScale16s64f, (BinaryFunc)cvtScale32s64f, (BinaryFunc)cvtScale32f64f,
-            (BinaryFunc)cvtScale64f, (BinaryFunc)cvtScale16f64f
-        },
-        {
-            (BinaryFunc)cvtScale8u16f, (BinaryFunc)cvtScale8s16f, (BinaryFunc)cvtScale16u16f,
-            (BinaryFunc)cvtScale16s16f, (BinaryFunc)cvtScale32s16f, (BinaryFunc)cvtScale32f16f,
-            (BinaryFunc)cvtScale64f16f, (BinaryFunc)cvtScale16f
-        },
-    };
+    int sdepth = CV_MAT_DEPTH(sdepth_);
+    int ddepth = CV_MAT_DEPTH(ddepth_);
+    BinaryFunc func =
+        ddepth == CV_8U ? (
+            sdepth == CV_8U ? cvtScale8u :
+            sdepth == CV_8S ? cvtScale8s8u :
+            sdepth == CV_Bool ? cvtScale8b8u :
+            sdepth == CV_16U ? cvtScale16u8u :
+            sdepth == CV_16S ? cvtScale16s8u :
+            sdepth == CV_32U ? cvtScale32u8u :
+            sdepth == CV_32S ? cvtScale32s8u :
+            sdepth == CV_32F ? cvtScale32f8u :
+            sdepth == CV_64F ? cvtScale64f8u :
+            sdepth == CV_16F ? cvtScale16f8u :
+            sdepth == CV_16BF ? cvtScale16bf8u :
+            sdepth == CV_64U ? cvtScale64u8u :
+            sdepth == CV_64S ? cvtScale64s8u :
+            0) :
+        ddepth == CV_8S ? (
+            sdepth == CV_8U ? cvtScale8u8s :
+            sdepth == CV_8S ? cvtScale8s :
+            sdepth == CV_Bool ? cvtScale8b8s :
+            sdepth == CV_16U ? cvtScale16u8s :
+            sdepth == CV_16S ? cvtScale16s8s :
+            sdepth == CV_32U ? cvtScale32u8s :
+            sdepth == CV_32S ? cvtScale32s8s :
+            sdepth == CV_32F ? cvtScale32f8s :
+            sdepth == CV_64F ? cvtScale64f8s :
+            sdepth == CV_16F ? cvtScale16f8s :
+            sdepth == CV_16BF ? cvtScale16bf8s :
+            sdepth == CV_64U ? cvtScale64u8s :
+            sdepth == CV_64S ? cvtScale64s8s :
+            0) :
+        ddepth == CV_16U ? (
+            sdepth == CV_8U ? cvtScale8u16u :
+            sdepth == CV_8S ? cvtScale8s16u :
+            sdepth == CV_Bool ? cvtScale8b16u :
+            sdepth == CV_16U ? cvtScale16u :
+            sdepth == CV_16S ? cvtScale16s16u :
+            sdepth == CV_32U ? cvtScale32u16u :
+            sdepth == CV_32S ? cvtScale32s16u :
+            sdepth == CV_32F ? cvtScale32f16u :
+            sdepth == CV_64F ? cvtScale64f16u :
+            sdepth == CV_16F ? cvtScale16f16u :
+            sdepth == CV_16BF ? cvtScale16bf16u :
+            sdepth == CV_64U ? cvtScale64u16u :
+            sdepth == CV_64S ? cvtScale64s16u :
+            0) :
+        ddepth == CV_16S ? (
+            sdepth == CV_8U ? cvtScale8u16s :
+            sdepth == CV_8S ? cvtScale8s16s :
+            sdepth == CV_Bool ? cvtScale8b16s :
+            sdepth == CV_16U ? cvtScale16u16s :
+            sdepth == CV_16S ? cvtScale16s :
+            sdepth == CV_32U ? cvtScale32u16s :
+            sdepth == CV_32S ? cvtScale32s16s :
+            sdepth == CV_32F ? cvtScale32f16s :
+            sdepth == CV_64F ? cvtScale64f16s :
+            sdepth == CV_16F ? cvtScale16f16s :
+            sdepth == CV_16BF ? cvtScale16bf16s :
+            sdepth == CV_64U ? cvtScale64u16s :
+            sdepth == CV_64S ? cvtScale64s16s :
+            0) :
+        ddepth == CV_32U ? (
+            sdepth == CV_8U ? cvtScale8u32u :
+            sdepth == CV_8S ? cvtScale8s32u :
+            sdepth == CV_Bool ? cvtScale8b32u :
+            sdepth == CV_16U ? cvtScale16u32u :
+            sdepth == CV_16S ? cvtScale16s32u :
+            sdepth == CV_32U ? cvtScale32u :
+            sdepth == CV_32S ? cvtScale32s32u :
+            sdepth == CV_32F ? cvtScale32f32u :
+            sdepth == CV_64F ? cvtScale64f32u :
+            sdepth == CV_16F ? cvtScale16f32u :
+            sdepth == CV_16BF ? cvtScale16bf32u :
+            sdepth == CV_64U ? cvtScale64u32u :
+            sdepth == CV_64S ? cvtScale64s32u :
 
-    return cvtScaleTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)];
+            0) :
+        ddepth == CV_32S ? (
+            sdepth == CV_8U ? cvtScale8u32s :
+            sdepth == CV_8S ? cvtScale8s32s :
+            sdepth == CV_Bool ? cvtScale8b32s :
+            sdepth == CV_16U ? cvtScale16u32s :
+            sdepth == CV_16S ? cvtScale16s32s :
+            sdepth == CV_32U ? cvtScale32u32s :
+            sdepth == CV_32S ? cvtScale32s :
+            sdepth == CV_32F ? cvtScale32f32s :
+            sdepth == CV_64F ? cvtScale64f32s :
+            sdepth == CV_16F ? cvtScale16f32s :
+            sdepth == CV_16BF ? cvtScale16bf32s :
+            sdepth == CV_64U ? cvtScale64u32s :
+            sdepth == CV_64S ? cvtScale64s32s :
+            0) :
+        ddepth == CV_32F ? (
+            sdepth == CV_8U ? cvtScale8u32f :
+            sdepth == CV_8S ? cvtScale8s32f :
+            sdepth == CV_Bool ? cvtScale8b32f :
+            sdepth == CV_16U ? cvtScale16u32f :
+            sdepth == CV_16S ? cvtScale16s32f :
+            sdepth == CV_32U ? cvtScale32u32f :
+            sdepth == CV_32S ? cvtScale32s32f :
+            sdepth == CV_32F ? cvtScale32f :
+            sdepth == CV_64F ? cvtScale64f32f :
+            sdepth == CV_16F ? cvtScale16f32f :
+            sdepth == CV_16BF ? cvtScale16bf32f :
+            sdepth == CV_64U ? cvtScale64u32f :
+            sdepth == CV_64S ? cvtScale64s32f :
+            0) :
+        ddepth == CV_64F ? (
+            sdepth == CV_8U ? cvtScale8u64f :
+            sdepth == CV_8S ? cvtScale8s64f :
+            sdepth == CV_Bool ? cvtScale8b64f :
+            sdepth == CV_16U ? cvtScale16u64f :
+            sdepth == CV_16S ? cvtScale16s64f :
+            sdepth == CV_32U ? cvtScale32u64f :
+            sdepth == CV_32S ? cvtScale32s64f :
+            sdepth == CV_32F ? cvtScale32f64f :
+            sdepth == CV_64F ? cvtScale64f :
+            sdepth == CV_16F ? cvtScale16f64f :
+            sdepth == CV_16BF ? cvtScale16bf64f :
+            sdepth == CV_64U ? cvtScale64u64f :
+            sdepth == CV_64S ? cvtScale64s64f :
+            0) :
+        ddepth == CV_16F ? (
+            sdepth == CV_8U ? cvtScale8u16f :
+            sdepth == CV_8S ? cvtScale8s16f :
+            sdepth == CV_Bool ? cvtScale8b16f :
+            sdepth == CV_16U ? cvtScale16u16f :
+            sdepth == CV_16S ? cvtScale16s16f :
+            sdepth == CV_32U ? cvtScale32u16f :
+            sdepth == CV_32S ? cvtScale32s16f :
+            sdepth == CV_32F ? cvtScale32f16f :
+            sdepth == CV_64F ? cvtScale64f16f :
+            sdepth == CV_16F ? cvtScale16f :
+            sdepth == CV_16BF ? cvtScale16bf16f :
+            sdepth == CV_64U ? cvtScale64u16f :
+            sdepth == CV_64S ? cvtScale64s16f :
+            0) :
+        ddepth == CV_16BF ? (
+            sdepth == CV_8U ? cvtScale8u16bf :
+            sdepth == CV_8S ? cvtScale8s16bf :
+            sdepth == CV_Bool ? cvtScale8b16bf :
+            sdepth == CV_16U ? cvtScale16u16bf :
+            sdepth == CV_16S ? cvtScale16s16bf :
+            sdepth == CV_32U ? cvtScale32u16bf :
+            sdepth == CV_32S ? cvtScale32s16bf :
+            sdepth == CV_32F ? cvtScale32f16bf :
+            sdepth == CV_64F ? cvtScale64f16bf :
+            sdepth == CV_16F ? cvtScale16f16bf :
+            sdepth == CV_16BF ? cvtScale16bf :
+            sdepth == CV_64U ? cvtScale64u16bf :
+            sdepth == CV_64S ? cvtScale64s16bf :
+            0) :
+        ddepth == CV_Bool ? (
+            sdepth == CV_8U ? cvtScale8u8b :
+            sdepth == CV_8S ? cvtScale8s8b :
+            sdepth == CV_Bool ? cvtScale8b :
+            sdepth == CV_16U ? cvtScale16u8b :
+            sdepth == CV_16S ? cvtScale16s8b :
+            sdepth == CV_32U ? cvtScale32u8b :
+            sdepth == CV_32S ? cvtScale32s8b :
+            sdepth == CV_32F ? cvtScale32f8b :
+            sdepth == CV_64F ? cvtScale64f8b :
+            sdepth == CV_16F ? cvtScale16f8b :
+            sdepth == CV_16BF ? cvtScale16bf8b :
+            sdepth == CV_64U ? cvtScale64u8b :
+            sdepth == CV_64S ? cvtScale64s8b :
+            0) :
+        ddepth == CV_64U ? (
+            sdepth == CV_8U ? cvtScale8u64u :
+            sdepth == CV_8S ? cvtScale8s64u :
+            sdepth == CV_Bool ? cvtScale8b64u :
+            sdepth == CV_16U ? cvtScale16u64u :
+            sdepth == CV_16S ? cvtScale16s64u :
+            sdepth == CV_32U ? cvtScale32u64u :
+            sdepth == CV_32S ? cvtScale32s64u :
+            sdepth == CV_32F ? cvtScale32f64u :
+            sdepth == CV_64F ? cvtScale64f64u :
+            sdepth == CV_16F ? cvtScale16f64u :
+            sdepth == CV_16BF ? cvtScale16bf64u :
+            sdepth == CV_64U ? cvtScale64u :
+            sdepth == CV_64S ? cvtScale64s64u :
+            0) :
+        ddepth == CV_64S ? (
+            sdepth == CV_8U ? cvtScale8u64s :
+            sdepth == CV_8S ? cvtScale8s64s :
+            sdepth == CV_Bool ? cvtScale8b64s :
+            sdepth == CV_16U ? cvtScale16u64s :
+            sdepth == CV_16S ? cvtScale16s64s :
+            sdepth == CV_32U ? cvtScale32u64s :
+            sdepth == CV_32S ? cvtScale32s64s :
+            sdepth == CV_32F ? cvtScale32f64s :
+            sdepth == CV_64F ? cvtScale64f64s :
+            sdepth == CV_16F ? cvtScale16f64s :
+            sdepth == CV_16BF ? cvtScale16bf64s :
+            sdepth == CV_64U ? cvtScale64u64s :
+            sdepth == CV_64S ? cvtScale64s :
+            0) :
+        0;
+    CV_Assert(func != 0);
+    return func;
 }
 
 #endif
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 3a6a1a7ac6..09250b8585 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -72,28 +72,43 @@ void scalarToRawData(const Scalar& s, void* _buf, int type, int unroll_to)
     switch(depth)
     {
     case CV_8U:
-        scalarToRawData_<uchar>(s, (uchar*)_buf, cn, unroll_to);
+        scalarToRawData_(s, (uchar*)_buf, cn, unroll_to);
         break;
     case CV_8S:
-        scalarToRawData_<schar>(s, (schar*)_buf, cn, unroll_to);
+        scalarToRawData_(s, (schar*)_buf, cn, unroll_to);
+        break;
+    case CV_Bool:
+        scalarToRawData_(s, (bool*)_buf, cn, unroll_to);
         break;
     case CV_16U:
-        scalarToRawData_<ushort>(s, (ushort*)_buf, cn, unroll_to);
+        scalarToRawData_(s, (ushort*)_buf, cn, unroll_to);
         break;
     case CV_16S:
-        scalarToRawData_<short>(s, (short*)_buf, cn, unroll_to);
-        break;
-    case CV_32S:
-        scalarToRawData_<int>(s, (int*)_buf, cn, unroll_to);
-        break;
-    case CV_32F:
-        scalarToRawData_<float>(s, (float*)_buf, cn, unroll_to);
-        break;
-    case CV_64F:
-        scalarToRawData_<double>(s, (double*)_buf, cn, unroll_to);
+        scalarToRawData_(s, (short*)_buf, cn, unroll_to);
         break;
     case CV_16F:
-        scalarToRawData_<float16_t>(s, (float16_t*)_buf, cn, unroll_to);
+        scalarToRawData_(s, (float16_t*)_buf, cn, unroll_to);
+        break;
+    case CV_16BF:
+        scalarToRawData_(s, (bfloat16_t*)_buf, cn, unroll_to);
+        break;
+    case CV_32U:
+        scalarToRawData_(s, (unsigned*)_buf, cn, unroll_to);
+        break;
+    case CV_32S:
+        scalarToRawData_(s, (int*)_buf, cn, unroll_to);
+        break;
+    case CV_32F:
+        scalarToRawData_(s, (float*)_buf, cn, unroll_to);
+        break;
+    case CV_64U:
+        scalarToRawData_(s, (uint64_t*)_buf, cn, unroll_to);
+        break;
+    case CV_64S:
+        scalarToRawData_(s, (int64_t*)_buf, cn, unroll_to);
+        break;
+    case CV_64F:
+        scalarToRawData_(s, (double*)_buf, cn, unroll_to);
         break;
     default:
         CV_Error(CV_StsUnsupportedFormat,"");
diff --git a/modules/core/src/matmul.dispatch.cpp b/modules/core/src/matmul.dispatch.cpp
index a213ca06c7..3758f6816d 100644
--- a/modules/core/src/matmul.dispatch.cpp
+++ b/modules/core/src/matmul.dispatch.cpp
@@ -647,7 +647,7 @@ void scaleAdd(InputArray _src1, double alpha, InputArray _src2, OutputArray _dst
     CV_OCL_RUN(_src1.dims() <= 2 && _src2.dims() <= 2 && _dst.isUMat(),
             ocl_scaleAdd(_src1, alpha, _src2, _dst, type))
 
-    if( depth < CV_32F )
+    if( depth != CV_32F && depth != CV_64F )
     {
         addWeighted(_src1, alpha, _src2, 1, 0, _dst, depth);
         return;
@@ -979,7 +979,7 @@ typedef double (*DotProdFunc)(const uchar* src1, const uchar* src2, int len);
 
 static DotProdFunc getDotProdFunc(int depth)
 {
-    static DotProdFunc dotProdTab[] =
+    static DotProdFunc dotProdTab[CV_DEPTH_MAX] =
     {
         (DotProdFunc)GET_OPTIMIZED(dotProd_8u), (DotProdFunc)GET_OPTIMIZED(dotProd_8s),
         (DotProdFunc)dotProd_16u, (DotProdFunc)dotProd_16s,
diff --git a/modules/core/src/matmul.simd.hpp b/modules/core/src/matmul.simd.hpp
index 5a7f36d12b..e32096cf71 100644
--- a/modules/core/src/matmul.simd.hpp
+++ b/modules/core/src/matmul.simd.hpp
@@ -1791,7 +1791,7 @@ diagtransform_64f(const double* src, double* dst, const double* m, int len, int
 
 TransformFunc getTransformFunc(int depth)
 {
-    static TransformFunc transformTab[] =
+    static TransformFunc transformTab[CV_DEPTH_MAX] =
     {
         (TransformFunc)transform_8u, (TransformFunc)transform_8s, (TransformFunc)transform_16u,
         (TransformFunc)transform_16s, (TransformFunc)transform_32s, (TransformFunc)transform_32f,
@@ -1803,7 +1803,7 @@ TransformFunc getTransformFunc(int depth)
 
 TransformFunc getDiagTransformFunc(int depth)
 {
-    static TransformFunc diagTransformTab[] =
+    static TransformFunc diagTransformTab[CV_DEPTH_MAX] =
     {
         (TransformFunc)diagtransform_8u, (TransformFunc)diagtransform_8s, (TransformFunc)diagtransform_16u,
         (TransformFunc)diagtransform_16s, (TransformFunc)diagtransform_32s, (TransformFunc)diagtransform_32f,
diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp
index 8111dc2230..704979b714 100644
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -1151,7 +1151,7 @@ Mat Mat::reshape(int new_cn, int new_rows) const
         }
         if( new_rows > 0 )
         {
-            int sz[] = { new_rows, (int)(total()/new_rows) };
+            int sz[] = { new_rows, (int)(total()*cn/new_rows) };
             return reshape(new_cn, 2, sz);
         }
     }
diff --git a/modules/core/src/mean.simd.hpp b/modules/core/src/mean.simd.hpp
index bb815adc1c..60dba7afcf 100644
--- a/modules/core/src/mean.simd.hpp
+++ b/modules/core/src/mean.simd.hpp
@@ -311,7 +311,7 @@ static int sqsum64f( const double* src, const uchar* mask, double* sum, double*
 SumSqrFunc getSumSqrFunc(int depth)
 {
     CV_INSTRUMENT_REGION();
-    static SumSqrFunc sumSqrTab[] =
+    static SumSqrFunc sumSqrTab[CV_DEPTH_MAX] =
     {
         (SumSqrFunc)GET_OPTIMIZED(sqsum8u), (SumSqrFunc)sqsum8s, (SumSqrFunc)sqsum16u, (SumSqrFunc)sqsum16s,
         (SumSqrFunc)sqsum32s, (SumSqrFunc)GET_OPTIMIZED(sqsum32f), (SumSqrFunc)sqsum64f, 0
diff --git a/modules/core/src/merge.dispatch.cpp b/modules/core/src/merge.dispatch.cpp
index b95dc7345d..6b8c2d8135 100644
--- a/modules/core/src/merge.dispatch.cpp
+++ b/modules/core/src/merge.dispatch.cpp
@@ -50,12 +50,15 @@ typedef void (*MergeFunc)(const uchar** src, uchar* dst, int len, int cn);
 
 static MergeFunc getMergeFunc(int depth)
 {
-    static MergeFunc mergeTab[] =
+    static MergeFunc mergeTab[CV_DEPTH_MAX] =
     {
         (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u),
         (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u),
         (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s),
-        (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u)
+        (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u),
+        (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u),
+        (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s),
+        (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), 0, 0, 0,
     };
 
     return mergeTab[depth];
diff --git a/modules/core/src/minmax.cpp b/modules/core/src/minmax.cpp
index 092c5e9234..d694d99d3c 100644
--- a/modules/core/src/minmax.cpp
+++ b/modules/core/src/minmax.cpp
@@ -1002,7 +1002,8 @@ bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc
 
     CV_Assert(!haveSrc2 || _src2.type() == type);
 
-    if (depth == CV_32S)
+    if (depth == CV_32S || depth == CV_8S || depth == CV_32U || depth == CV_64U ||
+        depth == CV_64S || depth == CV_16F || depth == CV_16BF)
         return false;
 
     if ((depth == CV_64F || ddepth == CV_64F) && !doubleSupport)
diff --git a/modules/core/src/norm.cpp b/modules/core/src/norm.cpp
index 69da85f291..72d6fd9abc 100644
--- a/modules/core/src/norm.cpp
+++ b/modules/core/src/norm.cpp
@@ -367,7 +367,7 @@ typedef int (*NormDiffFunc)(const uchar*, const uchar*, const uchar*, uchar*, in
 
 static NormFunc getNormFunc(int normType, int depth)
 {
-    static NormFunc normTab[3][8] =
+    static NormFunc normTab[3][CV_DEPTH_MAX] =
     {
         {
             (NormFunc)GET_OPTIMIZED(normInf_8u), (NormFunc)GET_OPTIMIZED(normInf_8s), (NormFunc)GET_OPTIMIZED(normInf_16u), (NormFunc)GET_OPTIMIZED(normInf_16s),
@@ -388,7 +388,7 @@ static NormFunc getNormFunc(int normType, int depth)
 
 static NormDiffFunc getNormDiffFunc(int normType, int depth)
 {
-    static NormDiffFunc normDiffTab[3][8] =
+    static NormDiffFunc normDiffTab[3][CV_DEPTH_MAX] =
     {
         {
             (NormDiffFunc)GET_OPTIMIZED(normDiffInf_8u), (NormDiffFunc)normDiffInf_8s,
diff --git a/modules/core/src/out.cpp b/modules/core/src/out.cpp
index 8a7d7e1636..1307ff9d03 100644
--- a/modules/core/src/out.cpp
+++ b/modules/core/src/out.cpp
@@ -70,14 +70,19 @@ namespace cv
         char braces[5];
 
         void (FormattedImpl::*valueToStr)();
+        void valueToStrBool() { snprintf(buf, sizeof(buf), "%d", (int)mtx.ptr<uchar>(row, col)[cn] != 0); }
         void valueToStr8u()  { snprintf(buf, sizeof(buf), "%3d", (int)mtx.ptr<uchar>(row, col)[cn]); }
         void valueToStr8s()  { snprintf(buf, sizeof(buf), "%3d", (int)mtx.ptr<schar>(row, col)[cn]); }
         void valueToStr16u() { snprintf(buf, sizeof(buf), "%d", (int)mtx.ptr<ushort>(row, col)[cn]); }
         void valueToStr16s() { snprintf(buf, sizeof(buf), "%d", (int)mtx.ptr<short>(row, col)[cn]); }
+        void valueToStr32u() { snprintf(buf, sizeof(buf), "%u", mtx.ptr<unsigned>(row, col)[cn]); }
         void valueToStr32s() { snprintf(buf, sizeof(buf), "%d", mtx.ptr<int>(row, col)[cn]); }
         void valueToStr32f() { snprintf(buf, sizeof(buf), floatFormat, mtx.ptr<float>(row, col)[cn]); }
         void valueToStr64f() { snprintf(buf, sizeof(buf), floatFormat, mtx.ptr<double>(row, col)[cn]); }
+        void valueToStr64u() { snprintf(buf, sizeof(buf), "%llu", (unsigned long long)mtx.ptr<uint64_t>(row, col)[cn]); }
+        void valueToStr64s() { snprintf(buf, sizeof(buf), "%lld", (long long)mtx.ptr<int64_t>(row, col)[cn]); }
         void valueToStr16f() { snprintf(buf, sizeof(buf), floatFormat, (float)mtx.ptr<float16_t>(row, col)[cn]); }
+        void valueToStr16bf() { snprintf(buf, sizeof(buf), floatFormat, (float)mtx.ptr<bfloat16_t>(row, col)[cn]); }
         void valueToStrOther() { buf[0] = 0; }
 
     public:
@@ -111,13 +116,19 @@ namespace cv
             {
                 case CV_8U:  valueToStr = &FormattedImpl::valueToStr8u; break;
                 case CV_8S:  valueToStr = &FormattedImpl::valueToStr8s; break;
+                case CV_Bool: valueToStr = &FormattedImpl::valueToStrBool; break;
                 case CV_16U: valueToStr = &FormattedImpl::valueToStr16u; break;
                 case CV_16S: valueToStr = &FormattedImpl::valueToStr16s; break;
+                case CV_32U: valueToStr = &FormattedImpl::valueToStr32u; break;
                 case CV_32S: valueToStr = &FormattedImpl::valueToStr32s; break;
                 case CV_32F: valueToStr = &FormattedImpl::valueToStr32f; break;
                 case CV_64F: valueToStr = &FormattedImpl::valueToStr64f; break;
-                default:     CV_Assert(mtx.depth() == CV_16F);
-                             valueToStr = &FormattedImpl::valueToStr16f;
+                case CV_64U: valueToStr = &FormattedImpl::valueToStr64u; break;
+                case CV_64S: valueToStr = &FormattedImpl::valueToStr64s; break;
+                case CV_16F: valueToStr = &FormattedImpl::valueToStr16f; break;
+                case CV_16BF: valueToStr = &FormattedImpl::valueToStr16bf; break;
+                default:
+                    CV_Error_(Error::StsError, ("unsupported matrix type %d\n", mtx.depth()));
             }
         }
 
diff --git a/modules/core/src/persistence.cpp b/modules/core/src/persistence.cpp
index 0d64bab094..cf0a6466ea 100644
--- a/modules/core/src/persistence.cpp
+++ b/modules/core/src/persistence.cpp
@@ -56,6 +56,28 @@ char* itoa( int _val, char* buffer, int /*radix*/ )
     return ptr;
 }
 
+char* itoa( int64_t _val, char* buffer, int /*radix*/, bool _signed)
+{
+    const int radix = 10;
+    char* ptr=buffer + 23 /* enough even for 64-bit integers */;
+    int sign = _signed && _val < 0 ? -1 : 1;
+    uint64_t val = !_signed ? (uint64_t)_val : abs(_val);
+
+    *ptr = '\0';
+    do
+    {
+        uint64_t r = val / radix;
+        *--ptr = (char)(val - (r*radix) + '0');
+        val = r;
+    }
+    while( val != 0 );
+
+    if( sign < 0 )
+        *--ptr = '-';
+
+    return ptr;
+}
+
 char* doubleToString( char* buf, size_t bufSize, double value, bool explicitZero )
 {
     Cv64suf val;
@@ -142,12 +164,12 @@ char* floatToString( char* buf, size_t bufSize, float value, bool halfprecision,
     return buf;
 }
 
-static const char symbols[9] = "ucwsifdh";
+static const char symbols[] = "ucwsifdhHbLUn";
 
 static char typeSymbol(int depth)
 {
     CV_StaticAssert(CV_64F == 6, "");
-    CV_CheckDepth(depth, depth >=0 && depth <= CV_16F, "");
+    CV_CheckDepth(depth, depth >= 0 && depth <= CV_32U, "");
     return symbols[depth];
 }
 
@@ -264,13 +286,18 @@ int calcStructSize( const char* dt, int initial_size )
         switch (v)
         {
         case 'u': { elem_max_size = std::max( elem_max_size, sizeof(uchar ) ); break; }
+        case 'b': { elem_max_size = std::max( elem_max_size, sizeof(bool  ) ); break; }
         case 'c': { elem_max_size = std::max( elem_max_size, sizeof(schar ) ); break; }
         case 'w': { elem_max_size = std::max( elem_max_size, sizeof(ushort) ); break; }
         case 's': { elem_max_size = std::max( elem_max_size, sizeof(short ) ); break; }
         case 'i': { elem_max_size = std::max( elem_max_size, sizeof(int   ) ); break; }
+        case 'n': { elem_max_size = std::max( elem_max_size, sizeof(unsigned) ); break; }
         case 'f': { elem_max_size = std::max( elem_max_size, sizeof(float ) ); break; }
         case 'd': { elem_max_size = std::max( elem_max_size, sizeof(double) ); break; }
-        case 'h': { elem_max_size = std::max(elem_max_size, sizeof(float16_t)); break; }
+        case 'h': { elem_max_size = std::max( elem_max_size, sizeof(float16_t)); break; }
+        case 'H': { elem_max_size = std::max( elem_max_size, sizeof(bfloat16_t)); break; }
+        case 'I': { elem_max_size = std::max( elem_max_size, sizeof(int64_t)); break; }
+        case 'U': { elem_max_size = std::max( elem_max_size, sizeof(uint64_t)); break; }
         default:
             CV_Error_(Error::StsNotImplemented, ("Unknown type identifier: '%c' in '%s'", (char)(*type), dt));
         }
@@ -1097,6 +1124,10 @@ void FileStorage::Impl::writeRawData(const std::string &dt, const void *_data, s
                         ptr = fs::itoa(*(uchar *) data, buf, 10);
                         data++;
                         break;
+                    case CV_Bool:
+                        ptr = fs::itoa(*(uchar *) data != 0, buf, 10);
+                        data++;
+                        break;
                     case CV_8S:
                         ptr = fs::itoa(*(char *) data, buf, 10);
                         data++;
@@ -1109,10 +1140,22 @@ void FileStorage::Impl::writeRawData(const std::string &dt, const void *_data, s
                         ptr = fs::itoa(*(short *) data, buf, 10);
                         data += sizeof(short);
                         break;
+                    case CV_32U:
+                        ptr = fs::itoa((int64_t)*(unsigned*) data, buf, 10, false);
+                        data += sizeof(unsigned);
+                        break;
                     case CV_32S:
                         ptr = fs::itoa(*(int *) data, buf, 10);
                         data += sizeof(int);
                         break;
+                    case CV_64U:
+                        ptr = fs::itoa(*(uint64_t*) data, buf, 10, false);
+                        data += sizeof(uint64_t);
+                        break;
+                    case CV_64S:
+                        ptr = fs::itoa(*(int64_t*) data, buf, 10, true);
+                        data += sizeof(int64_t);
+                        break;
                     case CV_32F:
                         ptr = fs::floatToString(buf, sizeof(buf), *(float *) data, false, explicitZero);
                         data += sizeof(float);
@@ -1121,10 +1164,14 @@ void FileStorage::Impl::writeRawData(const std::string &dt, const void *_data, s
                         ptr = fs::doubleToString(buf, sizeof(buf), *(double *) data, explicitZero);
                         data += sizeof(double);
                         break;
-                    case CV_16F: /* reference */
+                    case CV_16F:
                         ptr = fs::floatToString(buf, sizeof(buf), (float) *(float16_t *) data, true, explicitZero);
                         data += sizeof(float16_t);
                         break;
+                    case CV_16BF:
+                        ptr = fs::floatToString(buf, sizeof(buf), (float) *(bfloat16_t *) data, true, explicitZero);
+                        data += sizeof(bfloat16_t);
+                        break;
                     default:
                         CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported type");
                         return;
@@ -2572,6 +2619,10 @@ FileNodeIterator& FileNodeIterator::readRaw( const String& fmt, void* _data0, si
                             *(char*)data = saturate_cast<schar>(ival);
                             data++;
                             break;
+                        case CV_Bool:
+                            *(bool*)data = ival != 0;
+                            data++;
+                            break;
                         case CV_16U:
                             *(ushort*)data = saturate_cast<ushort>(ival);
                             data += sizeof(ushort);
@@ -2580,6 +2631,10 @@ FileNodeIterator& FileNodeIterator::readRaw( const String& fmt, void* _data0, si
                             *(short*)data = saturate_cast<short>(ival);
                             data += sizeof(short);
                             break;
+                        case CV_32U:
+                            *(unsigned*)data = (unsigned)std::max(ival, 0);
+                            data += sizeof(unsigned);
+                            break;
                         case CV_32S:
                             *(int*)data = ival;
                             data += sizeof(int);
@@ -2588,6 +2643,14 @@ FileNodeIterator& FileNodeIterator::readRaw( const String& fmt, void* _data0, si
                             *(float*)data = (float)ival;
                             data += sizeof(float);
                             break;
+                        case CV_64U:
+                            *(uint64_t*)data = (uint64_t)ival;
+                            data += sizeof(uint64_t);
+                            break;
+                        case CV_64S:
+                            *(int64_t*)data = (int64_t)ival;
+                            data += sizeof(int64_t);
+                            break;
                         case CV_64F:
                             *(double*)data = (double)ival;
                             data += sizeof(double);
@@ -2596,6 +2659,10 @@ FileNodeIterator& FileNodeIterator::readRaw( const String& fmt, void* _data0, si
                             *(float16_t*)data = float16_t((float)ival);
                             data += sizeof(float16_t);
                             break;
+                        case CV_16BF:
+                            *(bfloat16_t*)data = bfloat16_t((float)ival);
+                            data += sizeof(bfloat16_t);
+                            break;
                         default:
                             CV_Error( Error::StsUnsupportedFormat, "Unsupported type" );
                         }
@@ -2622,6 +2689,10 @@ FileNodeIterator& FileNodeIterator::readRaw( const String& fmt, void* _data0, si
                             *(short*)data = saturate_cast<short>(fval);
                             data += sizeof(short);
                             break;
+                        case CV_32U:
+                            *(int*)data = saturate_cast<unsigned>(fval);
+                            data += sizeof(int);
+                            break;
                         case CV_32S:
                             *(int*)data = saturate_cast<int>(fval);
                             data += sizeof(int);
@@ -2630,6 +2701,14 @@ FileNodeIterator& FileNodeIterator::readRaw( const String& fmt, void* _data0, si
                             *(float*)data = (float)fval;
                             data += sizeof(float);
                             break;
+                        case CV_64U:
+                            *(uint64_t*)data = (uint64_t)round(std::max(fval, 0.));
+                            data += sizeof(uint64_t);
+                            break;
+                        case CV_64S:
+                            *(int64_t*)data = (int64_t)round(std::max(fval, 0.));
+                            data += sizeof(int64_t);
+                            break;
                         case CV_64F:
                             *(double*)data = fval;
                             data += sizeof(double);
@@ -2638,6 +2717,10 @@ FileNodeIterator& FileNodeIterator::readRaw( const String& fmt, void* _data0, si
                             *(float16_t*)data = float16_t((float)fval);
                             data += sizeof(float16_t);
                             break;
+                        case CV_16BF:
+                            *(bfloat16_t*)data = bfloat16_t((float)fval);
+                            data += sizeof(bfloat16_t);
+                            break;
                         default:
                             CV_Error( Error::StsUnsupportedFormat, "Unsupported type" );
                         }
diff --git a/modules/core/src/persistence.hpp b/modules/core/src/persistence.hpp
index 4b579303fa..c08ddb5472 100644
--- a/modules/core/src/persistence.hpp
+++ b/modules/core/src/persistence.hpp
@@ -86,6 +86,7 @@ namespace fs
 {
 int strcasecmp(const char* str1, const char* str2);
 char* itoa( int _val, char* buffer, int /*radix*/ );
+char* itoa( int64_t _val, char* buffer, int /*radix*/, bool _signed );
 char* floatToString( char* buf, size_t bufSize, float value, bool halfprecision, bool explicitZero );
 char* doubleToString( char* buf, size_t bufSize, double value, bool explicitZero );
 
diff --git a/modules/core/src/rand.cpp b/modules/core/src/rand.cpp
index 3e4f761f4a..ed93f88d4f 100644
--- a/modules/core/src/rand.cpp
+++ b/modules/core/src/rand.cpp
@@ -51,38 +51,53 @@ namespace cv
    Multiply-with-carry generator is used here:
    temp = ( A*X(n) + carry )
    X(n+1) = temp mod (2^32)
-   carry = temp / (2^32)
+   carry = floor (temp / (2^32))
 */
 
 #define  RNG_NEXT(x)    ((uint64)(unsigned)(x)*CV_RNG_COEFF + ((x) >> 32))
+// make it jump-less
+#define  CN_NEXT(k)     (((k) + 1) & (((k) >= cn) - 1))
+
+enum
+{
+    RNG_FLAG_SMALL = 0x40000000,
+    RNG_FLAG_STDMTX = 0x80000000
+};
 
 /***************************************************************************************\
 *                           Pseudo-Random Number Generators (PRNGs)                     *
 \***************************************************************************************/
 
 template<typename T> static void
-randBits_( T* arr, int len, uint64* state, const Vec2i* p, bool small_flag )
+randBits_( T* arr, int len, int cn, uint64* state, const Vec2l* p, int flags )
 {
+    bool small_flag = (flags & RNG_FLAG_SMALL) != 0;
     uint64 temp = *state;
-    int i;
+    int i, k = 0;
+    len *= cn;
+    --cn;
 
     if( !small_flag )
     {
         for( i = 0; i <= len - 4; i += 4 )
         {
-            int t0, t1;
+            int64_t t0, t1;
 
             temp = RNG_NEXT(temp);
-            t0 = ((int)temp & p[i][0]) + p[i][1];
+            t0 = ((int64_t)temp & p[k][0]) + p[k][1];
+            k = CN_NEXT(k);
             temp = RNG_NEXT(temp);
-            t1 = ((int)temp & p[i+1][0]) + p[i+1][1];
+            t1 = ((int64_t)temp & p[k][0]) + p[k][1];
+            k = CN_NEXT(k);
             arr[i] = saturate_cast<T>(t0);
             arr[i+1] = saturate_cast<T>(t1);
 
             temp = RNG_NEXT(temp);
-            t0 = ((int)temp & p[i+2][0]) + p[i+2][1];
+            t0 = ((int64_t)temp & p[k][0]) + p[k][1];
+            k = CN_NEXT(k);
             temp = RNG_NEXT(temp);
-            t1 = ((int)temp & p[i+3][0]) + p[i+3][1];
+            t1 = ((int64_t)temp & p[k][0]) + p[k][1];
+            k = CN_NEXT(k);
             arr[i+2] = saturate_cast<T>(t0);
             arr[i+3] = saturate_cast<T>(t1);
         }
@@ -91,16 +106,23 @@ randBits_( T* arr, int len, uint64* state, const Vec2i* p, bool small_flag )
     {
         for( i = 0; i <= len - 4; i += 4 )
         {
-            int t0, t1, t;
+            int64_t t0, t1, t;
             temp = RNG_NEXT(temp);
-            t = (int)temp;
-            t0 = (t & p[i][0]) + p[i][1];
-            t1 = ((t >> 8) & p[i+1][0]) + p[i+1][1];
+            t = temp;
+            // p[i+...][0] is within 0..255 in this branch (small_flag==true),
+            // so we don't need to do (t>>...)&255,
+            // the upper bits will be cleaned with ... & p[i+...][0].
+            t0 = (t & p[k][0]) + p[k][1];
+            k = CN_NEXT(k);
+            t1 = ((t >> 8) & p[k][0]) + p[k][1];
+            k = CN_NEXT(k);
             arr[i] = saturate_cast<T>(t0);
             arr[i+1] = saturate_cast<T>(t1);
 
-            t0 = ((t >> 16) & p[i+2][0]) + p[i+2][1];
-            t1 = ((t >> 24) & p[i+3][0]) + p[i+3][1];
+            t0 = ((t >> 16) & p[k][0]) + p[k][1];
+            k = CN_NEXT(k);
+            t1 = ((t >> 24) & p[k][0]) + p[k][1];
+            k = CN_NEXT(k);
             arr[i+2] = saturate_cast<T>(t0);
             arr[i+3] = saturate_cast<T>(t1);
         }
@@ -108,10 +130,11 @@ randBits_( T* arr, int len, uint64* state, const Vec2i* p, bool small_flag )
 
     for( ; i < len; i++ )
     {
-        int t0;
+        int64_t t0;
         temp = RNG_NEXT(temp);
 
-        t0 = ((int)temp & p[i][0]) + p[i][1];
+        t0 = ((int64_t)temp & p[k][0]) + p[k][1];
+        k = CN_NEXT(k);
         arr[i] = saturate_cast<T>(t0);
     }
 
@@ -123,101 +146,145 @@ struct DivStruct
     unsigned d;
     unsigned M;
     int sh1, sh2;
-    int delta;
+    int64_t delta;
+    uint64_t diff;
 };
 
 template<typename T> static void
-randi_( T* arr, int len, uint64* state, const DivStruct* p )
+randi_( T* arr, int len, int cn, uint64* state, const DivStruct* p )
 {
     uint64 temp = *state;
+    int k = 0;
+    len *= cn;
+    cn--;
     for( int i = 0; i < len; i++ )
     {
         temp = RNG_NEXT(temp);
         unsigned t = (unsigned)temp;
-        unsigned v = (unsigned)(((uint64)t * p[i].M) >> 32);
-        v = (v + ((t - v) >> p[i].sh1)) >> p[i].sh2;
-        v = t - v*p[i].d + p[i].delta;
-        arr[i] = saturate_cast<T>((int)v);
+        unsigned v = (unsigned)(((uint64)t * p[k].M) >> 32);
+        v = (v + ((t - v) >> p[k].sh1)) >> p[k].sh2;
+        int64_t res = (int64_t)(t - v*p[k].d) + p[k].delta;
+        k = CN_NEXT(k);
+        arr[i] = saturate_cast<T>(res);
     }
     *state = temp;
 }
 
-
-#define DEF_RANDI_FUNC(suffix, type) \
-static void randBits_##suffix(type* arr, int len, uint64* state, \
-                              const Vec2i* p, void*, bool small_flag) \
-{ randBits_(arr, len, state, p, small_flag); } \
-\
-static void randi_##suffix(type* arr, int len, uint64* state, \
-                           const DivStruct* p, void*, bool ) \
-{ randi_(arr, len, state, p); }
-
-DEF_RANDI_FUNC(8u, uchar)
-DEF_RANDI_FUNC(8s, schar)
-DEF_RANDI_FUNC(16u, ushort)
-DEF_RANDI_FUNC(16s, short)
-DEF_RANDI_FUNC(32s, int)
-
-static void randf_32f( float* arr, int len, uint64* state, const Vec2f* p, void*, bool )
-{
-    uint64 temp = *state;
-    for( int i = 0; i < len; i++ )
-    {
-        int t = (int)(temp = RNG_NEXT(temp));
-        arr[i] = (float)(t*p[i][0]);
-    }
-    *state = temp;
-
-    // add bias separately to make the generated random numbers
-    // more deterministic, independent of
-    // architecture details (FMA instruction use etc.)
-    hal::addRNGBias32f(arr, &p[0][0], len);
-}
-
 static void
-randf_64f( double* arr, int len, uint64* state, const Vec2d* p, void*, bool )
+randi_( int64_t* arr, int len, int cn, uint64* state, const DivStruct* p )
 {
     uint64 temp = *state;
+    int k = 0;
+    len *= cn;
+    cn--;
     for( int i = 0; i < len; i++ )
     {
         temp = RNG_NEXT(temp);
-        int64 v = (temp >> 32)|(temp << 32);
-        arr[i] = v*p[i][0];
+        unsigned t0 = (unsigned)temp;
+        temp = RNG_NEXT(temp);
+        unsigned t1 = (unsigned)temp;
+        int64_t t = (int64_t)((((uint64_t)t0 << 32) | t1) % p[k].diff) + p[k].delta;
+        k = CN_NEXT(k);
+        arr[i] = t;
     }
     *state = temp;
-
-    hal::addRNGBias64f(arr, &p[0][0], len);
 }
 
-static void randf_16f( float16_t* arr, int len, uint64* state, const Vec2f* p, float* fbuf, bool )
+static void
+randi_( uint64_t* arr, int len, int cn, uint64* state, const DivStruct* p )
 {
     uint64 temp = *state;
+    int k = 0;
+    len *= cn;
+    cn--;
     for( int i = 0; i < len; i++ )
     {
-        float f = (float)(int)(temp = RNG_NEXT(temp));
-        fbuf[i] = f*p[i][0];
+        temp = RNG_NEXT(temp);
+        unsigned t0 = (unsigned)temp;
+        temp = RNG_NEXT(temp);
+        unsigned t1 = (unsigned)temp;
+        uint64_t t = (((uint64_t)t0 << 32) | t1) % p[k].diff;
+        int64_t delta = p[k].delta;
+        k = CN_NEXT(k);
+        arr[i] = delta >= 0 || t >= (uint64_t)-delta ? t + (uint64_t)delta : 0;
     }
     *state = temp;
-
-    // add bias separately to make the generated random numbers
-    // more deterministic, independent of
-    // architecture details (FMA instruction use etc.)
-    hal::addRNGBias32f(fbuf, &p[0][0], len);
-    hal::cvt32f16f(fbuf, arr, len);
 }
 
-typedef void (*RandFunc)(uchar* arr, int len, uint64* state, const void* p, void* tempbuf, bool small_flag);
+#define DEF_RANDI_FUNC(suffix, type) \
+static void randBits_##suffix(type* arr, int len, int cn, uint64* state, \
+                              const Vec2l* p, void*, int flags) \
+{ randBits_(arr, len, cn, state, p, flags); } \
+\
+static void randi_##suffix(type* arr, int len, int cn, uint64* state, \
+                           const DivStruct* p, void*, int) \
+{ randi_(arr, len, cn, state, p); }
 
+DEF_RANDI_FUNC(8u, uchar)
+DEF_RANDI_FUNC(8b, bool)
+DEF_RANDI_FUNC(8s, schar)
+DEF_RANDI_FUNC(16u, ushort)
+DEF_RANDI_FUNC(16s, short)
+DEF_RANDI_FUNC(32u, unsigned)
+DEF_RANDI_FUNC(32s, int)
+DEF_RANDI_FUNC(64u, uint64_t)
+DEF_RANDI_FUNC(64s, int64_t)
 
-static RandFunc randTab[][8] =
+static void randf_16_or_32f( void* dst, int len_, int cn, uint64* state, const Vec2f* p, float* fbuf, int flags )
+{
+    int depth = CV_MAT_DEPTH(flags);
+    uint64 temp = *state;
+    int k = 0, len = len_*cn;
+    float* arr = depth == CV_16F || depth == CV_16BF ? fbuf : (float*)dst;
+    cn--;
+    for( int i = 0; i < len; i++ )
+    {
+        int t = (int)(temp = RNG_NEXT(temp));
+        arr[i] = (float)(t*p[k][0]);
+        k = CN_NEXT(k);
+    }
+    *state = temp;
+    hal::addRNGBias32f(arr, &p[0][0], len_, cn+1);
+    if (depth == CV_16F)
+        hal::cvt32f16f(fbuf, (float16_t*)dst, len);
+    else if (depth == CV_16BF)
+        hal::cvt32f16bf(fbuf, (bfloat16_t*)dst, len);
+}
+
+static void
+randf_64f( double* arr, int len_, int cn, uint64* state, const Vec2d* p, void*, int )
+{
+    uint64 temp = *state;
+    int k = 0, len = len_*cn;
+    cn--;
+    for( int i = 0; i < len; i++ )
+    {
+        temp = RNG_NEXT(temp);
+        int64_t v = (int64_t)((temp >> 32) | (temp << 32));
+        arr[i] = v*p[k][0];
+        k = CN_NEXT(k);
+    }
+    *state = temp;
+    hal::addRNGBias64f(arr, &p[0][0], len_, cn+1);
+}
+
+typedef void (*RandFunc)(uchar* arr, int len, int cn, uint64* state,
+                         const void* p, void* tempbuf, int flags);
+
+static RandFunc randTab[][16] =
 {
     {
-        (RandFunc)randi_8u, (RandFunc)randi_8s, (RandFunc)randi_16u, (RandFunc)randi_16s,
-        (RandFunc)randi_32s, (RandFunc)randf_32f, (RandFunc)randf_64f, (RandFunc)randf_16f
+        (RandFunc)randi_8u, (RandFunc)randi_8s, (RandFunc)randi_16u,
+        (RandFunc)randi_16s, (RandFunc)randi_32s, (RandFunc)randf_16_or_32f,
+        (RandFunc)randf_64f, (RandFunc)randf_16_or_32f, (RandFunc)randf_16_or_32f,
+        (RandFunc)randi_8b, (RandFunc)randi_64u, (RandFunc)randi_64s,
+        (RandFunc)randi_32u, 0, 0, 0
     },
     {
-        (RandFunc)randBits_8u, (RandFunc)randBits_8s, (RandFunc)randBits_16u, (RandFunc)randBits_16s,
-        (RandFunc)randBits_32s, 0, 0, 0
+        (RandFunc)randBits_8u, (RandFunc)randBits_8s, (RandFunc)randBits_16u,
+        (RandFunc)randBits_16s, (RandFunc)randBits_32s, 0, 0, 0, 0,
+        (RandFunc)randBits_8b, (RandFunc)randBits_64u, (RandFunc)randBits_64s,
+        (RandFunc)randBits_32u, 0, 0, 0
     }
 };
 
@@ -309,90 +376,153 @@ double RNG::gaussian(double sigma)
     return temp*sigma;
 }
 
-
 template<typename T, typename PT> static void
-randnScale_( const float* src, T* dst, int len, int cn, const PT* mean, const PT* stddev, bool stdmtx )
+randnScale_(float* src, T* dst, int len, int cn,
+            const PT* mean, const PT* stddev, int flags )
 {
+    bool stdmtx = (flags & RNG_FLAG_STDMTX) != 0;
     int i, j, k;
-    if( !stdmtx )
+    if( !stdmtx || cn == 1 )
     {
         if( cn == 1 )
         {
-            PT b = mean[0], a = stddev[0];
+            PT a = stddev[0], b = mean[0];
             for( i = 0; i < len; i++ )
                 dst[i] = saturate_cast<T>(src[i]*a + b);
         }
         else
         {
-            for( i = 0; i < len; i++, src += cn, dst += cn )
-                for( k = 0; k < cn; k++ )
-                    dst[k] = saturate_cast<T>(src[k]*stddev[k] + mean[k]);
+            len *= cn;
+            cn--;
+            for( i = k = 0; i < len; i++ ) {
+                dst[i] = saturate_cast<T>(src[i]*stddev[k] + mean[k]);
+                k = CN_NEXT(k);
+            }
         }
     }
     else
     {
-        for( i = 0; i < len; i++, src += cn, dst += cn )
+        len *= cn;
+        cn--;
+        for( i = j = 0; i < len; i++ )
         {
-            for( j = 0; j < cn; j++ )
-            {
-                PT s = mean[j];
-                for( k = 0; k < cn; k++ )
-                    s += src[k]*stddev[j*cn + k];
-                dst[j] = saturate_cast<T>(s);
-            }
+            PT s = mean[j];
+            int i0 = i - j;
+            for( k = 0; k <= cn; k++ )
+                s += src[i0 + k]*stddev[j*(cn+1) + k];
+            dst[i] = saturate_cast<T>(s);
+            j = CN_NEXT(j);
         }
     }
 }
 
-static void randnScale_8u( const float* src, uchar* dst, int len, int cn,
-                            const float* mean, const float* stddev, bool stdmtx )
-{ randnScale_(src, dst, len, cn, mean, stddev, stdmtx); }
+// special version for 16f, 16bf and 32f
+static void
+randnScale_16_or_32f(float* fbuf, float* dst, int len, int cn,
+                     const float* mean, const float* stddev, int flags)
+{
+    bool stdmtx = (flags & RNG_FLAG_STDMTX) != 0;
+    int depth = CV_MAT_DEPTH(flags);
+    float* arr = depth == CV_16F || depth == CV_16BF ? fbuf : dst;
+    int i, j, k;
 
-static void randnScale_8s( const float* src, schar* dst, int len, int cn,
-                            const float* mean, const float* stddev, bool stdmtx )
-{ randnScale_(src, dst, len, cn, mean, stddev, stdmtx); }
+    if( !stdmtx || cn == 1 )
+    {
+        if( cn == 1 )
+        {
+            float a = stddev[0], b = mean[0];
+            for( i = 0; i < len; i++ )
+                arr[i] = fbuf[i]*a + b;
+        }
+        else
+        {
+            len *= cn;
+            cn--;
+            for( i = k = 0; i < len; i++ ) {
+                arr[i] = fbuf[i]*stddev[k] + mean[k];
+                k = CN_NEXT(k);
+            }
+        }
+    }
+    else if( depth == CV_32F )
+    {
+        len *= cn;
+        cn--;
+        for( i = j = 0; i < len; i++ )
+        {
+            float s = mean[j];
+            int i0 = i - j;
+            for( k = 0; k <= cn; k++ )
+                s += fbuf[i0 + k]*stddev[j*(cn+1) + k];
+            dst[i] = s;
+            j = CN_NEXT(j);
+        }
+    }
+    else
+    {
+        float elembuf[CV_CN_MAX];
+        len *= cn;
+        for( i = 0; i < len; i += cn )
+        {
+            // since we process fbuf in-place,
+            // we need to copy each cn-channel element
+            // prior to matrix multiplication
+            for (j = 0; j < cn; j++)
+                elembuf[j] = fbuf[i + j];
+            for (j = 0; j < cn; j++) {
+                float s = mean[j];
+                for( k = 0; k < cn; k++ )
+                    s += elembuf[k]*stddev[j*cn + k];
+                fbuf[i + j] = s;
+            }
+        }
+    }
+    if (depth == CV_16F)
+        hal::cvt32f16f(fbuf, (float16_t*)dst, len);
+    else if (depth == CV_16BF)
+        hal::cvt32f16bf(fbuf, (bfloat16_t*)dst, len);
+}
 
-static void randnScale_16u( const float* src, ushort* dst, int len, int cn,
-                             const float* mean, const float* stddev, bool stdmtx )
-{ randnScale_(src, dst, len, cn, mean, stddev, stdmtx); }
+#define DEF_RANDNSCALE_FUNC(suffix, T, PT) \
+static void randnScale_##suffix( float* src, T* dst, int len, int cn, \
+                                 const PT* mean, const PT* stddev, int flags ) \
+{ randnScale_(src, dst, len, cn, mean, stddev, flags); }
 
-static void randnScale_16s( const float* src, short* dst, int len, int cn,
-                             const float* mean, const float* stddev, bool stdmtx )
-{ randnScale_(src, dst, len, cn, mean, stddev, stdmtx); }
+DEF_RANDNSCALE_FUNC(8u, uchar, float)
+DEF_RANDNSCALE_FUNC(8b, bool, float)
+DEF_RANDNSCALE_FUNC(8s, schar, float)
+DEF_RANDNSCALE_FUNC(16u, ushort, float)
+DEF_RANDNSCALE_FUNC(16s, short, float)
+DEF_RANDNSCALE_FUNC(32u, unsigned, float)
+DEF_RANDNSCALE_FUNC(32s, int, float)
+DEF_RANDNSCALE_FUNC(64u, uint64_t, double)
+DEF_RANDNSCALE_FUNC(64s, int64_t, double)
+DEF_RANDNSCALE_FUNC(64f, double, double)
 
-static void randnScale_32s( const float* src, int* dst, int len, int cn,
-                             const float* mean, const float* stddev, bool stdmtx )
-{ randnScale_(src, dst, len, cn, mean, stddev, stdmtx); }
-
-static void randnScale_32f( const float* src, float* dst, int len, int cn,
-                             const float* mean, const float* stddev, bool stdmtx )
-{ randnScale_(src, dst, len, cn, mean, stddev, stdmtx); }
-
-static void randnScale_64f( const float* src, double* dst, int len, int cn,
-                             const double* mean, const double* stddev, bool stdmtx )
-{ randnScale_(src, dst, len, cn, mean, stddev, stdmtx); }
-
-typedef void (*RandnScaleFunc)(const float* src, uchar* dst, int len, int cn,
-                               const uchar*, const uchar*, bool);
+typedef void (*RandnScaleFunc)(float* src, void* dst, int len, int cn,
+                               const void* mean, const void* stddev, int flags);
 
 static RandnScaleFunc randnScaleTab[] =
 {
     (RandnScaleFunc)randnScale_8u, (RandnScaleFunc)randnScale_8s, (RandnScaleFunc)randnScale_16u,
-    (RandnScaleFunc)randnScale_16s, (RandnScaleFunc)randnScale_32s, (RandnScaleFunc)randnScale_32f,
-    (RandnScaleFunc)randnScale_64f, 0
+    (RandnScaleFunc)randnScale_16s, (RandnScaleFunc)randnScale_32s, (RandnScaleFunc)randnScale_16_or_32f,
+    (RandnScaleFunc)randnScale_64f, (RandnScaleFunc)randnScale_16_or_32f, (RandnScaleFunc)randnScale_16_or_32f,
+    (RandnScaleFunc)randnScale_8b, (RandnScaleFunc)randnScale_64u, (RandnScaleFunc)randnScale_64s,
+    (RandnScaleFunc)randnScale_32u, 0, 0, 0
 };
 
 void RNG::fill( InputOutputArray _mat, int disttype,
-                InputArray _param1arg, InputArray _param2arg, bool saturateRange )
+                InputArray _param1arg, InputArray _param2arg,
+                bool saturateRange )
 {
     CV_Assert(!_mat.empty());
 
     Mat mat = _mat.getMat(), _param1 = _param1arg.getMat(), _param2 = _param2arg.getMat();
-    int depth = mat.depth(), cn = mat.channels();
+    int j, depth = mat.depth(), cn = mat.channels();
+    int esz1 = CV_ELEM_SIZE(depth);
     AutoBuffer<double> _parambuf;
-    int j, k;
     bool fast_int_mode = false;
-    bool smallFlag = true;
+    bool small_flag = false;
     RandFunc func = 0;
     RandnScaleFunc scaleFunc = 0;
 
@@ -405,10 +535,7 @@ void RNG::fill( InputOutputArray _mat, int disttype,
                 (_param1.size() == Size(1, 4) && _param1.type() == CV_64F && cn <= 4))) ||
                 (_param2.rows == cn && _param2.cols == cn && disttype == NORMAL)));
 
-    Vec2i* ip = 0;
-    Vec2d* dp = 0;
-    Vec2f* fp = 0;
-    DivStruct* ds = 0;
+    const void* uni_param = 0;
     uchar* mean = 0;
     uchar* stddev = 0;
     bool stdmtx = false;
@@ -417,47 +544,48 @@ void RNG::fill( InputOutputArray _mat, int disttype,
 
     if( disttype == UNIFORM )
     {
-        _parambuf.allocate(cn*8 + n1 + n2);
+        _parambuf.allocate((sizeof(DivStruct)+sizeof(double)-1)/sizeof(double) + cn*2 + n1 + n2);
         double* parambuf = _parambuf.data();
         double* p1 = _param1.ptr<double>();
         double* p2 = _param2.ptr<double>();
 
         if( !_param1.isContinuous() || _param1.type() != CV_64F || n1 != cn )
         {
-            Mat tmp(_param1.size(), CV_64F, parambuf);
-            _param1.convertTo(tmp, CV_64F);
             p1 = parambuf;
-            if( n1 < cn )
-                for( j = n1; j < cn; j++ )
-                    p1[j] = p1[j-n1];
+            Mat tmp(_param1.size(), CV_64F, p1);
+            _param1.convertTo(tmp, CV_64F);
+            for( j = n1; j < cn; j++ )
+                p1[j] = p1[j-n1];
         }
 
         if( !_param2.isContinuous() || _param2.type() != CV_64F || n2 != cn )
         {
-            Mat tmp(_param2.size(), CV_64F, parambuf + cn);
-            _param2.convertTo(tmp, CV_64F);
             p2 = parambuf + cn;
-            if( n2 < cn )
-                for( j = n2; j < cn; j++ )
-                    p2[j] = p2[j-n2];
+            Mat tmp(_param2.size(), CV_64F, p2);
+            _param2.convertTo(tmp, CV_64F);
+            for( j = n2; j < cn; j++ )
+                p2[j] = p2[j-n2];
         }
 
-        if( depth <= CV_32S )
+        if( CV_IS_INT_TYPE(depth) )
         {
-            ip = (Vec2i*)(parambuf + cn*2);
+            Vec2l* ip = (Vec2l*)(parambuf + cn*2);
             for( j = 0, fast_int_mode = true; j < cn; j++ )
             {
                 double a = std::min(p1[j], p2[j]);
                 double b = std::max(p1[j], p2[j]);
                 if( saturateRange )
                 {
-                    a = std::max(a, depth == CV_8U || depth == CV_16U ? 0. :
-                            depth == CV_8S ? -128. : depth == CV_16S ? -32768. : (double)INT_MIN);
-                    b = std::min(b, depth == CV_8U ? 256. : depth == CV_16U ? 65536. :
-                            depth == CV_8S ? 128. : depth == CV_16S ? 32768. : (double)INT_MAX);
+                    a = std::max(a, depth == CV_8U || depth == CV_16U || depth == CV_32U ||
+                                 depth == CV_64U || depth == CV_Bool ? 0. :
+                                 depth == CV_8S ? -128. : depth == CV_16S ? -32768. :
+                                 depth == CV_32S ? (double)INT_MIN : (double)INT64_MIN);
+                    b = std::min(b, depth == CV_8U ? 256. : depth == CV_Bool ? 2. : depth == CV_16U ? 65536. :
+                                 depth == CV_8S ? 128. : depth == CV_16S ? 32768. : depth == CV_32U ? (double)UINT_MAX :
+                                 depth == CV_32S ? (double)INT_MAX : (double)INT64_MAX);
                 }
-                ip[j][1] = cvCeil(a);
-                int idiff = ip[j][0] = cvFloor(b) - ip[j][1] - 1;
+                ip[j][1] = (int64_t)ceil(a);
+                int64_t idiff = ip[j][0] = (int64_t)floor(b) - ip[j][1] - 1;
                 if (idiff < 0)
                 {
                     idiff = 0;
@@ -467,30 +595,41 @@ void RNG::fill( InputOutputArray _mat, int disttype,
 
                 fast_int_mode = fast_int_mode && diff <= 4294967296. && (idiff & (idiff+1)) == 0;
                 if( fast_int_mode )
-                    smallFlag = smallFlag && (idiff <= 255);
+                    small_flag = idiff <= 255;
                 else
                 {
-                    if( diff > INT_MAX )
-                        ip[j][0] = INT_MAX;
-                    if( a < INT_MIN/2 )
-                        ip[j][1] = INT_MIN/2;
+                    int64_t minval = INT32_MIN/2, maxval = INT32_MAX;
+                    if (depth == CV_64S || depth == CV_64U)
+                    {
+                        minval = INT64_MIN/2;
+                        maxval = INT64_MAX;
+                    }
+                    if( diff > (double)maxval )
+                        ip[j][0] = maxval;
+                    if( a < (double)minval )
+                        ip[j][1] = minval;
                 }
             }
 
+            uni_param = ip;
             if( !fast_int_mode )
             {
-                ds = (DivStruct*)(ip + cn);
+                DivStruct* ds = (DivStruct*)(ip + cn);
                 for( j = 0; j < cn; j++ )
                 {
                     ds[j].delta = ip[j][1];
-                    unsigned d = ds[j].d = (unsigned)(ip[j][0]+1);
-                    int l = 0;
-                    while(((uint64)1 << l) < d)
-                        l++;
-                    ds[j].M = (unsigned)(((uint64)1 << 32)*(((uint64)1 << l) - d)/d) + 1;
-                    ds[j].sh1 = std::min(l, 1);
-                    ds[j].sh2 = std::max(l - 1, 0);
+                    ds[j].diff = ip[j][0];
+                    if (depth != CV_64U && depth != CV_64S) {
+                        unsigned d = ds[j].d = (unsigned)(ip[j][0]+1);
+                        int l = 0;
+                        while(((uint64)1 << l) < d)
+                            l++;
+                        ds[j].M = (unsigned)(((uint64)1 << 32)*(((uint64)1 << l) - d)/d) + 1;
+                        ds[j].sh1 = std::min(l, 1);
+                        ds[j].sh2 = std::max(l - 1, 0);
+                    }
                 }
+                uni_param = ds;
             }
 
             func = randTab[fast_int_mode ? 1 : 0][depth];
@@ -508,21 +647,23 @@ void RNG::fill( InputOutputArray _mat, int disttype,
             // dparam[0][i]*X + dparam[1][i]
             if( depth != CV_64F )
             {
-                fp = (Vec2f*)(parambuf + cn*2);
+                Vec2f* fp = (Vec2f*)(parambuf + cn*2);
                 for( j = 0; j < cn; j++ )
                 {
                     fp[j][0] = (float)(std::min(maxdiff, p2[j] - p1[j])*scale);
                     fp[j][1] = (float)((p2[j] + p1[j])*0.5);
                 }
+                uni_param = fp;
             }
             else
             {
-                dp = (Vec2d*)(parambuf + cn*2);
+                Vec2d* dp = (Vec2d*)(parambuf + cn*2);
                 for( j = 0; j < cn; j++ )
                 {
                     dp[j][0] = std::min(DBL_MAX, p2[j] - p1[j])*scale;
                     dp[j][1] = ((p2[j] + p1[j])*0.5);
                 }
+                uni_param = dp;
             }
 
             func = randTab[0][depth];
@@ -534,8 +675,7 @@ void RNG::fill( InputOutputArray _mat, int disttype,
         _parambuf.allocate(MAX(n1, cn) + MAX(n2, cn));
         double* parambuf = _parambuf.data();
 
-        int ptype = depth == CV_64F ? CV_64F : CV_32F;
-        int esz = (int)CV_ELEM_SIZE(ptype);
+        int ptype = esz1 == 8 ? CV_64F : CV_32F;
 
         if( _param1.isContinuous() && _param1.type() == ptype && n1 >= cn)
             mean = _param1.ptr();
@@ -547,8 +687,8 @@ void RNG::fill( InputOutputArray _mat, int disttype,
         }
 
         if( n1 < cn )
-            for( j = n1*esz; j < cn*esz; j++ )
-                mean[j] = mean[j - n1*esz];
+            for( j = n1*esz1; j < cn*esz1; j++ )
+                mean[j] = mean[j - n1*esz1];
 
         if( _param2.isContinuous() && _param2.type() == ptype && n2 >= cn)
             stddev = _param2.ptr();
@@ -560,8 +700,8 @@ void RNG::fill( InputOutputArray _mat, int disttype,
         }
 
         if( n2 < cn )
-            for( j = n2*esz; j < cn*esz; j++ )
-                stddev[j] = stddev[j - n2*esz];
+            for( j = n2*esz1; j < cn*esz1; j++ )
+                stddev[j] = stddev[j - n2*esz1];
 
         stdmtx = _param2.rows == cn && _param2.cols == cn;
         scaleFunc = randnScaleTab[depth];
@@ -571,59 +711,18 @@ void RNG::fill( InputOutputArray _mat, int disttype,
         CV_Error( CV_StsBadArg, "Unknown distribution type" );
 
     const Mat* arrays[] = {&mat, 0};
-    uchar* ptr;
+    uchar* ptr = 0;
     NAryMatIterator it(arrays, &ptr, 1);
-    int total = (int)it.size, blockSize = std::min((BLOCK_SIZE + cn - 1)/cn, total);
-    size_t esz = mat.elemSize();
-    AutoBuffer<double> buf;
-    uchar* param = 0;
-    float* nbuf = 0;
-    float* tmpbuf = 0;
+    float fbuf[BLOCK_SIZE + CV_CN_MAX];
+    int total = (int)it.size;
+    int blockSize = std::min((BLOCK_SIZE + cn - 1)/cn, total);
+    size_t esz = (size_t)esz1*cn;
+    int flags = mat.type();
 
     if( disttype == UNIFORM )
-    {
-        buf.allocate(blockSize*cn*4);
-        param = (uchar*)(double*)buf.data();
-
-        if( depth <= CV_32S )
-        {
-            if( !fast_int_mode )
-            {
-                DivStruct* p = (DivStruct*)param;
-                for( j = 0; j < blockSize*cn; j += cn )
-                    for( k = 0; k < cn; k++ )
-                        p[j + k] = ds[k];
-            }
-            else
-            {
-                Vec2i* p = (Vec2i*)param;
-                for( j = 0; j < blockSize*cn; j += cn )
-                    for( k = 0; k < cn; k++ )
-                        p[j + k] = ip[k];
-            }
-        }
-        else if( depth != CV_64F )
-        {
-            Vec2f* p = (Vec2f*)param;
-            for( j = 0; j < blockSize*cn; j += cn )
-                for( k = 0; k < cn; k++ )
-                    p[j + k] = fp[k];
-            if( depth == CV_16F )
-                tmpbuf = (float*)p + blockSize*cn*2;
-        }
-        else
-        {
-            Vec2d* p = (Vec2d*)param;
-            for( j = 0; j < blockSize*cn; j += cn )
-                for( k = 0; k < cn; k++ )
-                    p[j + k] = dp[k];
-        }
-    }
+        flags |= (small_flag ? (int)RNG_FLAG_SMALL : 0);
     else
-    {
-        buf.allocate((blockSize*cn+1)/2);
-        nbuf = (float*)(double*)buf.data();
-    }
+        flags |= (stdmtx ? (int)RNG_FLAG_STDMTX : 0);
 
     for( size_t i = 0; i < it.nplanes; i++, ++it )
     {
@@ -631,14 +730,13 @@ void RNG::fill( InputOutputArray _mat, int disttype,
         {
             int len = std::min(total - j, blockSize);
 
-            if( disttype == CV_RAND_UNI )
-                func( ptr, len*cn, &state, param, tmpbuf, smallFlag );
+            if( disttype == UNIFORM )
+                func(ptr + j*esz, len, cn, &state, uni_param, fbuf, flags);
             else
             {
-                randn_0_1_32f(nbuf, len*cn, &state);
-                scaleFunc(nbuf, ptr, len, cn, mean, stddev, stdmtx);
+                randn_0_1_32f(fbuf, len*cn, &state);
+                scaleFunc(fbuf, ptr + j*esz, len, cn, mean, stddev, flags);
             }
-            ptr += len*esz;
         }
     }
 }
diff --git a/modules/core/src/split.dispatch.cpp b/modules/core/src/split.dispatch.cpp
index fc5e073497..42a07ed2e3 100644
--- a/modules/core/src/split.dispatch.cpp
+++ b/modules/core/src/split.dispatch.cpp
@@ -53,12 +53,15 @@ typedef void (*SplitFunc)(const uchar* src, uchar** dst, int len, int cn);
 
 static SplitFunc getSplitFunc(int depth)
 {
-    static SplitFunc splitTab[] =
+    static SplitFunc splitTab[CV_DEPTH_MAX] =
     {
         (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u),
         (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u),
         (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split32s),
-        (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u)
+        (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u),
+        (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u),
+        (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), (SplitFunc)GET_OPTIMIZED(cv::hal::split64s),
+        (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), 0, 0, 0
     };
 
     return splitTab[depth];
diff --git a/modules/core/src/sum.simd.hpp b/modules/core/src/sum.simd.hpp
index 2232013b24..045f40ebed 100644
--- a/modules/core/src/sum.simd.hpp
+++ b/modules/core/src/sum.simd.hpp
@@ -434,7 +434,7 @@ static int sum64f( const double* src, const uchar* mask, double* dst, int len, i
 
 SumFunc getSumFunc(int depth)
 {
-    static SumFunc sumTab[] =
+    static SumFunc sumTab[CV_DEPTH_MAX] =
     {
         (SumFunc)GET_OPTIMIZED(sum8u), (SumFunc)sum8s,
         (SumFunc)sum16u, (SumFunc)sum16s,
diff --git a/modules/core/test/test_arithm.cpp b/modules/core/test/test_arithm.cpp
index ea9cda56be..c6756f6502 100644
--- a/modules/core/test/test_arithm.cpp
+++ b/modules/core/test/test_arithm.cpp
@@ -40,7 +40,11 @@ struct BaseElemWiseOp
                                   ninputs > 1 ? ARITHM_MAX_CHANNELS : 4);
     }
 
-    virtual double getMaxErr(int depth) { return depth < CV_32F ? 1 : depth == CV_32F ? 1e-5 : 1e-12; }
+    virtual double getMaxErr(int depth)
+    {
+        return depth < CV_32F || depth == CV_32U || depth == CV_64U || depth == CV_64S ? 1 :
+               depth == CV_16F || depth == CV_16BF ? 1e-2 : depth == CV_32F ? 1e-5 : 1e-12;
+    }
     virtual void generateScalars(int depth, RNG& rng)
     {
         const double m = 3.;
@@ -93,11 +97,31 @@ struct BaseElemWiseOp
     int context;
 };
 
+static const _OutputArray::DepthMask baseArithmTypeMask =
+    _OutputArray::DepthMask(
+        _OutputArray::DEPTH_MASK_8U |
+        _OutputArray::DEPTH_MASK_16U |
+        _OutputArray::DEPTH_MASK_16S |
+        _OutputArray::DEPTH_MASK_32S |
+        _OutputArray::DEPTH_MASK_32F |
+        _OutputArray::DEPTH_MASK_64F);
 
-struct BaseAddOp : public BaseElemWiseOp
+struct BaseArithmOp : public BaseElemWiseOp
+{
+    BaseArithmOp(int _ninputs, int _flags, double _alpha, double _beta, Scalar _gamma=Scalar::all(0))
+    : BaseElemWiseOp(_ninputs, _flags, _alpha, _beta, _gamma) {}
+
+    int getRandomType(RNG& rng)
+    {
+        return cvtest::randomType(rng, baseArithmTypeMask, 1,
+                                  ninputs > 1 ? ARITHM_MAX_CHANNELS : 4);
+    }
+};
+
+struct BaseAddOp : public BaseArithmOp
 {
     BaseAddOp(int _ninputs, int _flags, double _alpha, double _beta, Scalar _gamma=Scalar::all(0))
-    : BaseElemWiseOp(_ninputs, _flags, _alpha, _beta, _gamma) {}
+    : BaseArithmOp(_ninputs, _flags, _alpha, _beta, _gamma) {}
 
     void refop(const vector<Mat>& src, Mat& dst, const Mat& mask)
     {
@@ -192,9 +216,9 @@ struct AddWeightedOp : public BaseAddOp
     }
 };
 
-struct MulOp : public BaseElemWiseOp
+struct MulOp : public BaseArithmOp
 {
-    MulOp() : BaseElemWiseOp(2, FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
+    MulOp() : BaseArithmOp(2, FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
     void getValueRange(int depth, double& minval, double& maxval)
     {
         minval = depth < CV_32S ? cvtest::getMinVal(depth) : depth == CV_32S ? -1000000 : -1000.;
@@ -216,9 +240,9 @@ struct MulOp : public BaseElemWiseOp
     }
 };
 
-struct DivOp : public BaseElemWiseOp
+struct DivOp : public BaseArithmOp
 {
-    DivOp() : BaseElemWiseOp(2, FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
+    DivOp() : BaseArithmOp(2, FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         cv::divide(src[0], src[1], dst, alpha);
@@ -233,9 +257,9 @@ struct DivOp : public BaseElemWiseOp
     }
 };
 
-struct RecipOp : public BaseElemWiseOp
+struct RecipOp : public BaseArithmOp
 {
-    RecipOp() : BaseElemWiseOp(1, FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
+    RecipOp() : BaseArithmOp(1, FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         cv::divide(alpha, src[0], dst);
@@ -339,9 +363,9 @@ struct LogicSOp : public BaseElemWiseOp
     char opcode;
 };
 
-struct MinOp : public BaseElemWiseOp
+struct MinOp : public BaseArithmOp
 {
-    MinOp() : BaseElemWiseOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
+    MinOp() : BaseArithmOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         cv::min(src[0], src[1], dst);
@@ -356,9 +380,9 @@ struct MinOp : public BaseElemWiseOp
     }
 };
 
-struct MaxOp : public BaseElemWiseOp
+struct MaxOp : public BaseArithmOp
 {
-    MaxOp() : BaseElemWiseOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
+    MaxOp() : BaseArithmOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         cv::max(src[0], src[1], dst);
@@ -373,9 +397,9 @@ struct MaxOp : public BaseElemWiseOp
     }
 };
 
-struct MinSOp : public BaseElemWiseOp
+struct MinSOp : public BaseArithmOp
 {
-    MinSOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+REAL_GAMMA, 1, 1, Scalar::all(0)) {}
+    MinSOp() : BaseArithmOp(1, FIX_ALPHA+FIX_BETA+REAL_GAMMA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         cv::min(src[0], gamma[0], dst);
@@ -390,9 +414,9 @@ struct MinSOp : public BaseElemWiseOp
     }
 };
 
-struct MaxSOp : public BaseElemWiseOp
+struct MaxSOp : public BaseArithmOp
 {
-    MaxSOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+REAL_GAMMA, 1, 1, Scalar::all(0)) {}
+    MaxSOp() : BaseArithmOp(1, FIX_ALPHA+FIX_BETA+REAL_GAMMA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         cv::max(src[0], gamma[0], dst);
@@ -407,9 +431,9 @@ struct MaxSOp : public BaseElemWiseOp
     }
 };
 
-struct CmpOp : public BaseElemWiseOp
+struct CmpOp : public BaseArithmOp
 {
-    CmpOp() : BaseElemWiseOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) { cmpop = 0; }
+    CmpOp() : BaseArithmOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) { cmpop = 0; }
     void generateScalars(int depth, RNG& rng)
     {
         BaseElemWiseOp::generateScalars(depth, rng);
@@ -425,7 +449,7 @@ struct CmpOp : public BaseElemWiseOp
     }
     int getRandomType(RNG& rng)
     {
-        return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL_BUT_8S, 1, 1);
+        return cvtest::randomType(rng, baseArithmTypeMask, 1, 1);
     }
 
     double getMaxErr(int)
@@ -435,9 +459,9 @@ struct CmpOp : public BaseElemWiseOp
     int cmpop;
 };
 
-struct CmpSOp : public BaseElemWiseOp
+struct CmpSOp : public BaseArithmOp
 {
-    CmpSOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+REAL_GAMMA, 1, 1, Scalar::all(0)) { cmpop = 0; }
+    CmpSOp() : BaseArithmOp(1, FIX_ALPHA+FIX_BETA+REAL_GAMMA, 1, 1, Scalar::all(0)) { cmpop = 0; }
     void generateScalars(int depth, RNG& rng)
     {
         BaseElemWiseOp::generateScalars(depth, rng);
@@ -455,7 +479,7 @@ struct CmpSOp : public BaseElemWiseOp
     }
     int getRandomType(RNG& rng)
     {
-        return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL_BUT_8S, 1, 1);
+        return cvtest::randomType(rng, baseArithmTypeMask, 1, 1);
     }
     double getMaxErr(int)
     {
@@ -478,7 +502,7 @@ struct CopyOp : public BaseElemWiseOp
     }
     int getRandomType(RNG& rng)
     {
-        return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL_16F, 1, ARITHM_MAX_CHANNELS);
+        return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL, 1, ARITHM_MAX_CHANNELS);
     }
     double getMaxErr(int)
     {
@@ -500,7 +524,7 @@ struct SetOp : public BaseElemWiseOp
     }
     int getRandomType(RNG& rng)
     {
-        return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL_16F, 1, ARITHM_MAX_CHANNELS);
+        return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL, 1, ARITHM_MAX_CHANNELS);
     }
     double getMaxErr(int)
     {
@@ -650,9 +674,9 @@ static void inRangeS(const Mat& src, const Scalar& lb, const Scalar& rb, Mat& ds
 } // namespace
 CVTEST_GUARD_SYMBOL(inRange);
 
-struct InRangeSOp : public BaseElemWiseOp
+struct InRangeSOp : public BaseArithmOp
 {
-    InRangeSOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA, 1, 1, Scalar::all(0)) {}
+    InRangeSOp() : BaseArithmOp(1, FIX_ALPHA+FIX_BETA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         cv::inRange(src[0], gamma, gamma1, dst);
@@ -680,9 +704,9 @@ struct InRangeSOp : public BaseElemWiseOp
 };
 
 
-struct InRangeOp : public BaseElemWiseOp
+struct InRangeOp : public BaseArithmOp
 {
-    InRangeOp() : BaseElemWiseOp(3, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
+    InRangeOp() : BaseArithmOp(3, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         Mat lb, rb;
@@ -725,7 +749,7 @@ struct ConvertScaleOp : public BaseElemWiseOp
     }
     double getMaxErr(int)
     {
-        return ddepth <= CV_32S ? 2 : ddepth < CV_64F ? 1e-3 : 1e-12;
+        return ddepth <= CV_32S || ddepth == CV_32U || ddepth == CV_64U || ddepth == CV_64S ? 2 : ddepth == CV_64F ? 1e-12 : ddepth == CV_Bool ? 0 : ddepth == CV_16BF ? 1e-2 : 2e-3;
     }
     void generateScalars(int depth, RNG& rng)
     {
@@ -1018,9 +1042,9 @@ static void log(const Mat& src, Mat& dst)
 
 } // namespace
 
-struct ExpOp : public BaseElemWiseOp
+struct ExpOp : public BaseArithmOp
 {
-    ExpOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
+    ExpOp() : BaseArithmOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
     int getRandomType(RNG& rng)
     {
         return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_FLT, 1, ARITHM_MAX_CHANNELS);
@@ -1045,9 +1069,9 @@ struct ExpOp : public BaseElemWiseOp
 };
 
 
-struct LogOp : public BaseElemWiseOp
+struct LogOp : public BaseArithmOp
 {
-    LogOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
+    LogOp() : BaseArithmOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
     int getRandomType(RNG& rng)
     {
         return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_FLT, 1, ARITHM_MAX_CHANNELS);
@@ -1129,9 +1153,9 @@ static void cartToPolar(const Mat& mx, const Mat& my, Mat& mmag, Mat& mangle, bo
 
 } // namespace
 
-struct CartToPolarToCartOp : public BaseElemWiseOp
+struct CartToPolarToCartOp : public BaseArithmOp
 {
-    CartToPolarToCartOp() : BaseElemWiseOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0))
+    CartToPolarToCartOp() : BaseArithmOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0))
     {
         context = 3;
         angleInDegrees = true;
@@ -1173,9 +1197,9 @@ struct CartToPolarToCartOp : public BaseElemWiseOp
 };
 
 
-struct MeanOp : public BaseElemWiseOp
+struct MeanOp : public BaseArithmOp
 {
-    MeanOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK+SCALAR_OUTPUT, 1, 1, Scalar::all(0))
+    MeanOp() : BaseArithmOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK+SCALAR_OUTPUT, 1, 1, Scalar::all(0))
     {
         context = 3;
     };
@@ -1196,9 +1220,9 @@ struct MeanOp : public BaseElemWiseOp
 };
 
 
-struct SumOp : public BaseElemWiseOp
+struct SumOp : public BaseArithmOp
 {
-    SumOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SCALAR_OUTPUT, 1, 1, Scalar::all(0))
+    SumOp() : BaseArithmOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SCALAR_OUTPUT, 1, 1, Scalar::all(0))
     {
         context = 3;
     };
@@ -1219,13 +1243,13 @@ struct SumOp : public BaseElemWiseOp
 };
 
 
-struct CountNonZeroOp : public BaseElemWiseOp
+struct CountNonZeroOp : public BaseArithmOp
 {
-    CountNonZeroOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SCALAR_OUTPUT+SUPPORT_MASK, 1, 1, Scalar::all(0))
+    CountNonZeroOp() : BaseArithmOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SCALAR_OUTPUT+SUPPORT_MASK, 1, 1, Scalar::all(0))
     {}
     int getRandomType(RNG& rng)
     {
-        return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL, 1, 1);
+        return cvtest::randomType(rng, baseArithmTypeMask, 1, 1);
     }
     void op(const vector<Mat>& src, Mat& dst, const Mat& mask)
     {
@@ -1252,12 +1276,12 @@ struct CountNonZeroOp : public BaseElemWiseOp
 };
 
 
-struct MeanStdDevOp : public BaseElemWiseOp
+struct MeanStdDevOp : public BaseArithmOp
 {
     Scalar sqmeanRef;
     int cn;
 
-    MeanStdDevOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK+SCALAR_OUTPUT, 1, 1, Scalar::all(0))
+    MeanStdDevOp() : BaseArithmOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK+SCALAR_OUTPUT, 1, 1, Scalar::all(0))
     {
         cn = 0;
         context = 7;
@@ -1296,16 +1320,16 @@ struct MeanStdDevOp : public BaseElemWiseOp
 };
 
 
-struct NormOp : public BaseElemWiseOp
+struct NormOp : public BaseArithmOp
 {
-    NormOp() : BaseElemWiseOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK+SCALAR_OUTPUT, 1, 1, Scalar::all(0))
+    NormOp() : BaseArithmOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK+SCALAR_OUTPUT, 1, 1, Scalar::all(0))
     {
         context = 1;
         normType = 0;
     };
     int getRandomType(RNG& rng)
     {
-        int type = cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL_BUT_8S, 1, 4);
+        int type = cvtest::randomType(rng, baseArithmTypeMask, 1, 4);
         for(;;)
         {
             normType = rng.uniform(1, 8);
@@ -1343,15 +1367,15 @@ struct NormOp : public BaseElemWiseOp
 };
 
 
-struct MinMaxLocOp : public BaseElemWiseOp
+struct MinMaxLocOp : public BaseArithmOp
 {
-    MinMaxLocOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK+SCALAR_OUTPUT, 1, 1, Scalar::all(0))
+    MinMaxLocOp() : BaseArithmOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK+SCALAR_OUTPUT, 1, 1, Scalar::all(0))
     {
         context = ARITHM_MAX_NDIMS*2 + 2;
     };
     int getRandomType(RNG& rng)
     {
-        return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL_BUT_8S, 1, 1);
+        return cvtest::randomType(rng, baseArithmTypeMask, 1, 1);
     }
     void saveOutput(const vector<int>& minidx, const vector<int>& maxidx,
                     double minval, double maxval, Mat& dst)
@@ -1389,16 +1413,16 @@ struct MinMaxLocOp : public BaseElemWiseOp
     }
 };
 
-struct reduceArgMinMaxOp : public BaseElemWiseOp
+struct reduceArgMinMaxOp : public BaseArithmOp
 {
-    reduceArgMinMaxOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)),
+    reduceArgMinMaxOp() : BaseArithmOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)),
                           isLast(false), isMax(false), axis(0)
     {
         context = ARITHM_MAX_NDIMS*2 + 2;
     };
     int getRandomType(RNG& rng) override
     {
-        return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL_BUT_8S, 1, 1);
+        return cvtest::randomType(rng, baseArithmTypeMask, 1, 1);
     }
     void getRandomSize(RNG& rng, vector<int>& size) override
     {
@@ -1568,82 +1592,82 @@ INSTANTIATE_TEST_CASE_P(Core_CartToPolarToCart, ElemWiseTest, ::testing::Values(
 
 TEST(Core_ArithmMask, uninitialized)
 {
-            RNG& rng = theRNG();
-            const int MAX_DIM=3;
-            int sizes[MAX_DIM];
-            for( int iter = 0; iter < 100; iter++ )
-            {
-                int dims = rng.uniform(1, MAX_DIM+1);
-                int depth = rng.uniform(CV_8U, CV_64F+1);
-                int cn = rng.uniform(1, 6);
-                int type = CV_MAKETYPE(depth, cn);
-                int op = rng.uniform(0, depth < CV_32F ? 5 : 2); // don't run binary operations between floating-point values
-                int depth1 = op <= 1 ? CV_64F : depth;
-                for (int k = 0; k < MAX_DIM; k++)
-                {
-                    sizes[k] = k < dims ? rng.uniform(1, 30) : 0;
-                }
-                SCOPED_TRACE(cv::format("iter=%d dims=%d depth=%d cn=%d type=%d op=%d depth1=%d dims=[%d; %d; %d]",
-                                         iter,   dims,   depth,   cn,   type,   op,   depth1, sizes[0], sizes[1], sizes[2]));
+    RNG& rng = theRNG();
+    const int MAX_DIM=3;
+    int sizes[MAX_DIM];
+    for( int iter = 0; iter < 100; iter++ )
+    {
+        int dims = rng.uniform(1, MAX_DIM+1);
+        int depth = rng.uniform(CV_8U, CV_64F+1);
+        int cn = rng.uniform(1, 6);
+        int type = CV_MAKETYPE(depth, cn);
+        int op = rng.uniform(0, depth < CV_32F ? 5 : 2); // don't run binary operations between floating-point values
+        int depth1 = op <= 1 ? CV_64F : depth;
+        for (int k = 0; k < MAX_DIM; k++)
+        {
+            sizes[k] = k < dims ? rng.uniform(1, 30) : 0;
+        }
+        SCOPED_TRACE(cv::format("iter=%d dims=%d depth=%d cn=%d type=%d op=%d depth1=%d dims=[%d; %d; %d]",
+                                 iter,   dims,   depth,   cn,   type,   op,   depth1, sizes[0], sizes[1], sizes[2]));
 
-                Mat a(dims, sizes, type), a1;
-                Mat b(dims, sizes, type), b1;
-                Mat mask(dims, sizes, CV_8U);
-                Mat mask1;
-                Mat c, d;
+        Mat a(dims, sizes, type), a1;
+        Mat b(dims, sizes, type), b1;
+        Mat mask(dims, sizes, CV_8U);
+        Mat mask1;
+        Mat c, d;
 
-                rng.fill(a, RNG::UNIFORM, 0, 100);
-                rng.fill(b, RNG::UNIFORM, 0, 100);
+        rng.fill(a, RNG::UNIFORM, 0, 100);
+        rng.fill(b, RNG::UNIFORM, 0, 100);
 
-                // [-2,2) range means that the each generated random number
-                // will be one of -2, -1, 0, 1. Saturated to [0,255], it will become
-                // 0, 0, 0, 1 => the mask will be filled by ~25%.
-                rng.fill(mask, RNG::UNIFORM, -2, 2);
+        // [-2,2) range means that the each generated random number
+        // will be one of -2, -1, 0, 1. Saturated to [0,255], it will become
+        // 0, 0, 0, 1 => the mask will be filled by ~25%.
+        rng.fill(mask, RNG::UNIFORM, -2, 2);
 
-                a.convertTo(a1, depth1);
-                b.convertTo(b1, depth1);
-                // invert the mask
-                cv::compare(mask, 0, mask1, CMP_EQ);
-                a1.setTo(0, mask1);
-                b1.setTo(0, mask1);
+        a.convertTo(a1, depth1);
+        b.convertTo(b1, depth1);
+        // invert the mask
+        cv::compare(mask, 0, mask1, CMP_EQ);
+        a1.setTo(0, mask1);
+        b1.setTo(0, mask1);
 
-                if( op == 0 )
-                {
-                    cv::add(a, b, c, mask);
-                    cv::add(a1, b1, d);
-                }
-                else if( op == 1 )
-                {
-                    cv::subtract(a, b, c, mask);
-                    cv::subtract(a1, b1, d);
-                }
-                else if( op == 2 )
-                {
-                    cv::bitwise_and(a, b, c, mask);
-                    cv::bitwise_and(a1, b1, d);
-                }
-                else if( op == 3 )
-                {
-                    cv::bitwise_or(a, b, c, mask);
-                    cv::bitwise_or(a1, b1, d);
-                }
-                else if( op == 4 )
-                {
-                    cv::bitwise_xor(a, b, c, mask);
-                    cv::bitwise_xor(a1, b1, d);
-                }
-                Mat d1;
-                d.convertTo(d1, depth);
-                EXPECT_LE(cvtest::norm(c, d1, CV_C), DBL_EPSILON);
-            }
+        if( op == 0 )
+        {
+            cv::add(a, b, c, mask);
+            cv::add(a1, b1, d);
+        }
+        else if( op == 1 )
+        {
+            cv::subtract(a, b, c, mask);
+            cv::subtract(a1, b1, d);
+        }
+        else if( op == 2 )
+        {
+            cv::bitwise_and(a, b, c, mask);
+            cv::bitwise_and(a1, b1, d);
+        }
+        else if( op == 3 )
+        {
+            cv::bitwise_or(a, b, c, mask);
+            cv::bitwise_or(a1, b1, d);
+        }
+        else if( op == 4 )
+        {
+            cv::bitwise_xor(a, b, c, mask);
+            cv::bitwise_xor(a1, b1, d);
+        }
+        Mat d1;
+        d.convertTo(d1, depth);
+        EXPECT_LE(cvtest::norm(c, d1, CV_C), DBL_EPSILON);
+    }
 
-            Mat_<uchar> tmpSrc(100,100);
-            tmpSrc = 124;
-            Mat_<uchar> tmpMask(100,100);
-            tmpMask = 255;
-            Mat_<uchar> tmpDst(100,100);
-            tmpDst = 2;
-            tmpSrc.copyTo(tmpDst,tmpMask);
+    Mat_<uchar> tmpSrc(100,100);
+    tmpSrc = 124;
+    Mat_<uchar> tmpMask(100,100);
+    tmpMask = 255;
+    Mat_<uchar> tmpDst(100,100);
+    tmpDst = 2;
+    tmpSrc.copyTo(tmpDst,tmpMask);
 }
 
 TEST(Multiply, FloatingPointRounding)
@@ -2273,35 +2297,35 @@ TEST(Core_minMaxIdx, regression_9207_2)
     const int rows = 13;
     const int cols = 15;
     uchar mask_[rows*cols] = {
-   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 255,
-   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 255,
-   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 255,
-   0, 255, 255, 255, 255,   0,   0,   0,   0,   0,   0,   0,   0,   0, 255,
- 255,   0,   0,   0,   0, 255,   0,   0,   0,   0,   0,   0,   0,   0, 255,
- 255,   0,   0,   0,   0,   0, 255,   0,   0,   0,   0,   0,   0, 255, 255,
- 255,   0,   0,   0,   0,   0,   0, 255, 255,   0,   0, 255, 255, 255,   0,
- 255,   0,   0,   0,   0,   0,   0,   0,   0, 255, 255, 255,   0, 255,   0,
- 255,   0,   0,   0,   0,   0,   0, 255, 255,   0,   0,   0, 255, 255,   0,
- 255,   0,   0,   0,   0,   0, 255,   0,   0,   0,   0,   0,   0, 255,   0,
- 255,   0,   0,   0,   0, 255,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-   0, 255,   0,   0,   0, 255,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-   0, 255, 255, 255, 255,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0
-};
+       0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 255,
+       0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 255,
+       0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 255,
+       0, 255, 255, 255, 255,   0,   0,   0,   0,   0,   0,   0,   0,   0, 255,
+     255,   0,   0,   0,   0, 255,   0,   0,   0,   0,   0,   0,   0,   0, 255,
+     255,   0,   0,   0,   0,   0, 255,   0,   0,   0,   0,   0,   0, 255, 255,
+     255,   0,   0,   0,   0,   0,   0, 255, 255,   0,   0, 255, 255, 255,   0,
+     255,   0,   0,   0,   0,   0,   0,   0,   0, 255, 255, 255,   0, 255,   0,
+     255,   0,   0,   0,   0,   0,   0, 255, 255,   0,   0,   0, 255, 255,   0,
+     255,   0,   0,   0,   0,   0, 255,   0,   0,   0,   0,   0,   0, 255,   0,
+     255,   0,   0,   0,   0, 255,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+       0, 255,   0,   0,   0, 255,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+       0, 255, 255, 255, 255,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0
+    };
     uchar src_[15*13] = {
-   5,   5,   5,   5,   5,   6,   5,   2,   0,   4,   6,   6,   4,   1,   0,
-   6,   5,   4,   4,   5,   6,   6,   5,   2,   0,   4,   6,   5,   2,   0,
-   3,   2,   1,   1,   2,   4,   6,   6,   4,   2,   3,   4,   4,   2,   0,
-   1,   0,   0,   0,   0,   1,   4,   5,   4,   4,   4,   4,   3,   2,   0,
-   0,   0,   0,   0,   0,   0,   2,   3,   4,   4,   4,   3,   2,   1,   0,
-   0,   0,   0,   0,   0,   0,   0,   2,   3,   4,   3,   2,   1,   0,   0,
-   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   1,   0,   0,   0,   1,
-   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,
-   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   1,   1,   0,   0,   1,
-   0,   0,   0,   0,   0,   0,   0,   1,   2,   4,   3,   3,   1,   0,   1,
-   0,   0,   0,   0,   0,   0,   1,   4,   5,   6,   5,   4,   3,   2,   0,
-   1,   0,   0,   0,   0,   0,   3,   5,   5,   4,   3,   4,   4,   3,   0,
-   2,   0,   0,   0,   0,   2,   5,   6,   5,   2,   2,   5,   4,   3,   0
-};
+       5,   5,   5,   5,   5,   6,   5,   2,   0,   4,   6,   6,   4,   1,   0,
+       6,   5,   4,   4,   5,   6,   6,   5,   2,   0,   4,   6,   5,   2,   0,
+       3,   2,   1,   1,   2,   4,   6,   6,   4,   2,   3,   4,   4,   2,   0,
+       1,   0,   0,   0,   0,   1,   4,   5,   4,   4,   4,   4,   3,   2,   0,
+       0,   0,   0,   0,   0,   0,   2,   3,   4,   4,   4,   3,   2,   1,   0,
+       0,   0,   0,   0,   0,   0,   0,   2,   3,   4,   3,   2,   1,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   1,   0,   0,   0,   1,
+       0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,
+       0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   1,   1,   0,   0,   1,
+       0,   0,   0,   0,   0,   0,   0,   1,   2,   4,   3,   3,   1,   0,   1,
+       0,   0,   0,   0,   0,   0,   1,   4,   5,   6,   5,   4,   3,   2,   0,
+       1,   0,   0,   0,   0,   0,   3,   5,   5,   4,   3,   4,   4,   3,   0,
+       2,   0,   0,   0,   0,   2,   5,   6,   5,   2,   2,   5,   4,   3,   0
+    };
     Mat mask(Size(cols, rows), CV_8UC1, mask_);
     Mat src(Size(cols, rows), CV_8UC1, src_);
     double minVal = -0.0, maxVal = -0.0;
@@ -2715,7 +2739,6 @@ TEST(Core_CartPolar, inplace)
     EXPECT_THROW(cv::polarToCart(uA[0], uA[1], uA[1], uA[0]), cv::Exception);
     EXPECT_THROW(cv::cartToPolar(uA[0], uA[1], uA[0], uA[1]), cv::Exception);
     EXPECT_THROW(cv::cartToPolar(uA[0], uA[1], uA[0], uA[1]), cv::Exception);
-
 }
 
 }} // namespace
diff --git a/modules/core/test/test_dxt.cpp b/modules/core/test/test_dxt.cpp
index 05d1f3062c..a1d40e0ac9 100644
--- a/modules/core/test/test_dxt.cpp
+++ b/modules/core/test/test_dxt.cpp
@@ -589,7 +589,7 @@ void CxCore_DXTBaseTest::get_test_array_types_and_sizes( int test_case_idx,
     {
         if( cn == 1 )
         {
-            types[OUTPUT][0] = depth + 8;
+            types[OUTPUT][0] = CV_MAKETYPE(depth, 2);
             sizes[TEMP][0] = size;
         }
         sizes[INPUT][0] = sizes[INPUT][1] = size;
@@ -597,7 +597,7 @@ void CxCore_DXTBaseTest::get_test_array_types_and_sizes( int test_case_idx,
     }
     else if( /*(cn == 2 && (bits&32)) ||*/ (cn == 1 && allow_complex) )
     {
-        types[TEMP][0] = depth + 8; // CV_??FC2
+        types[TEMP][0] = CV_MAKETYPE(depth, 2); // CV_??FC2
         sizes[TEMP][0] = size;
         size = cvSize(size.width/2+1, size.height);
 
@@ -614,7 +614,7 @@ void CxCore_DXTBaseTest::get_test_array_types_and_sizes( int test_case_idx,
         else
         {
             if( allow_complex )
-                types[OUTPUT][0] = depth + 8;
+                types[OUTPUT][0] = CV_MAKETYPE(depth, 2);
 
             if( cn == 2 )
             {
diff --git a/modules/core/test/test_io.cpp b/modules/core/test/test_io.cpp
index 5e1f6d7a8e..4def1a0a0a 100644
--- a/modules/core/test/test_io.cpp
+++ b/modules/core/test/test_io.cpp
@@ -680,7 +680,9 @@ static void test_filestorage_basic(int write_flags, const char* suffix_name, boo
                 reference.read(&reference_data[0], ref_sz);
                 reference.close();
 
-                EXPECT_EQ(reference_data, test_data);
+                if (useMemory) {
+                    EXPECT_EQ(reference_data, test_data);
+                }
             }
             std::cout << "Storage size: " << sz << std::endl;
             EXPECT_LE(sz, (size_t)6000);
@@ -736,16 +738,14 @@ static void test_filestorage_basic(int write_flags, const char* suffix_name, boo
         {
             for (int j = 0; j < _2d_out.cols; ++j)
             {
-                EXPECT_EQ(_2d_in.at<cv::Vec3b>(i, j), _2d_out.at<cv::Vec3b>(i, j));
-                if (::testing::Test::HasNonfatalFailure())
-                {
+                if (_2d_in.at<cv::Vec3b>(i, j) != _2d_out.at<cv::Vec3b>(i, j)) {
+                    EXPECT_EQ(_2d_in.at<cv::Vec3b>(i, j), _2d_out.at<cv::Vec3b>(i, j));
                     printf("i = %d, j = %d\n", i, j);
-                    errors++;
-                }
-                if (errors >= 3)
-                {
-                    i = _2d_out.rows;
-                    break;
+                    if (++errors >= 3)
+                    {
+                        i = _2d_out.rows;
+                        break;
+                    }
                 }
             }
         }
@@ -760,7 +760,10 @@ static void test_filestorage_basic(int write_flags, const char* suffix_name, boo
         ASSERT_EQ(_rd_in.cols   , _rd_out.cols);
         ASSERT_EQ(_rd_in.dims   , _rd_out.dims);
         ASSERT_EQ(_rd_in.depth(), _rd_out.depth());
-        EXPECT_EQ(0, cv::norm(_rd_in, _rd_out, NORM_INF));
+
+        if (useMemory) {
+            EXPECT_EQ(0, cv::norm(_rd_in, _rd_out, NORM_INF));
+        }
     }
 }
 
@@ -1901,15 +1904,25 @@ static void test_20279(FileStorage& fs)
     EXPECT_EQ(CV_16FC3, m16fc3.type()) << typeToString(m16fc3.type());
     //std::cout << m16fc3 << std::endl;
 
+    Mat m16bfc1, m16bfc3;
+    m16fc1.convertTo(m16bfc1, CV_16BF);
+    m16fc3.convertTo(m16bfc3, CV_16BF);
+
     fs << "m16fc1" << m16fc1;
     fs << "m16fc3" << m16fc3;
+    fs << "m16bfc1" << m16bfc1;
+    fs << "m16bfc3" << m16bfc3;
 
     string content = fs.releaseAndGetString();
     if (cvtest::debugLevel > 0) std::cout << content << std::endl;
 
     FileStorage fs_read(content, FileStorage::READ + FileStorage::MEMORY);
+
     Mat m16fc1_result;
     Mat m16fc3_result;
+    Mat m16bfc1_result;
+    Mat m16bfc3_result;
+
     fs_read["m16fc1"] >> m16fc1_result;
     ASSERT_FALSE(m16fc1_result.empty());
     EXPECT_EQ(CV_16FC1, m16fc1_result.type()) << typeToString(m16fc1_result.type());
@@ -1919,6 +1932,16 @@ static void test_20279(FileStorage& fs)
     ASSERT_FALSE(m16fc3_result.empty());
     EXPECT_EQ(CV_16FC3, m16fc3_result.type()) << typeToString(m16fc3_result.type());
     EXPECT_LE(cvtest::norm(m16fc3_result, m16fc3, NORM_INF), 1e-2);
+
+    fs_read["m16bfc1"] >> m16bfc1_result;
+    ASSERT_FALSE(m16bfc1_result.empty());
+    EXPECT_EQ(CV_16BFC1, m16bfc1_result.type()) << typeToString(m16bfc1_result.type());
+    EXPECT_LE(cvtest::norm(m16bfc1_result, m16bfc1, NORM_INF), 2e-2);
+
+    fs_read["m16bfc3"] >> m16bfc3_result;
+    ASSERT_FALSE(m16bfc3_result.empty());
+    EXPECT_EQ(CV_16BFC3, m16bfc3_result.type()) << typeToString(m16bfc3_result.type());
+    EXPECT_LE(cvtest::norm(m16bfc3_result, m16bfc3, NORM_INF), 2e-2);
 }
 
 TEST(Core_InputOutput, FileStorage_16F_xml)
diff --git a/modules/core/test/test_misc.cpp b/modules/core/test/test_misc.cpp
index 8ed0afe771..f508f51ac4 100644
--- a/modules/core/test/test_misc.cpp
+++ b/modules/core/test/test_misc.cpp
@@ -31,12 +31,12 @@ TEST(Core_OutputArrayCreate, _1997)
     ASSERT_NO_THROW(local::create( mat(Rect(Point(), submatSize)), submatSize, mat.type() ));
 }
 
-TEST(Core_SaturateCast, NegativeNotClipped)
+TEST(Core_SaturateCast, NegativesAreClipped)
 {
     double d = -1.0;
     unsigned int val = cv::saturate_cast<unsigned int>(d);
 
-    ASSERT_EQ(0xffffffff, val);
+    ASSERT_EQ(0u, val);
 }
 
 template<typename T, typename U>
diff --git a/modules/imgproc/misc/java/test/ImgprocTest.java b/modules/imgproc/misc/java/test/ImgprocTest.java
index 873292bc65..5ccf0f53d5 100644
--- a/modules/imgproc/misc/java/test/ImgprocTest.java
+++ b/modules/imgproc/misc/java/test/ImgprocTest.java
@@ -216,19 +216,19 @@ public class ImgprocTest extends OpenCVTestCase {
 
     public void testBoxFilterMatMatIntSize() {
         Size size = new Size(3, 3);
-        Imgproc.boxFilter(gray0, dst, 8, size);
+        Imgproc.boxFilter(gray0, dst, 0, size);
         assertMatEqual(gray0, dst);
         // TODO_: write better test
     }
 
     public void testBoxFilterMatMatIntSizePointBoolean() {
-        Imgproc.boxFilter(gray255, dst, 8, size, anchorPoint, false);
+        Imgproc.boxFilter(gray255, dst, 0, size, anchorPoint, false);
         assertMatEqual(gray255, dst);
         // TODO_: write better test
     }
 
     public void testBoxFilterMatMatIntSizePointBooleanInt() {
-        Imgproc.boxFilter(gray255, dst, 8, size, anchorPoint, false, Core.BORDER_REFLECT);
+        Imgproc.boxFilter(gray255, dst, 0, size, anchorPoint, false, Core.BORDER_REFLECT);
         assertMatEqual(gray255, dst);
         // TODO_: write better test
     }
diff --git a/modules/imgproc/test/test_pc.cpp b/modules/imgproc/test/test_pc.cpp
index 7b06e3bd65..173866ac58 100644
--- a/modules/imgproc/test/test_pc.cpp
+++ b/modules/imgproc/test/test_pc.cpp
@@ -186,10 +186,10 @@ void CV_DivSpectrumsTest::get_test_array_types_and_sizes( int test_case_idx, vec
     // Inputs are CCS-packed arrays.  Prepare outputs and temporary inputs as complex matrices.
     if( type == CV_32FC1 || type == CV_64FC1 )
     {
-        types[OUTPUT][0] += 8;
-        types[REF_OUTPUT][0] += 8;
-        types[TEMP][0] += 8;
-        types[TEMP][1] += 8;
+        types[OUTPUT][0] += CV_DEPTH_MAX;
+        types[REF_OUTPUT][0] += CV_DEPTH_MAX;
+        types[TEMP][0] += CV_DEPTH_MAX;
+        types[TEMP][1] += CV_DEPTH_MAX;
     }
 }
 
diff --git a/modules/stitching/src/exposure_compensate.cpp b/modules/stitching/src/exposure_compensate.cpp
index 59542d95ba..40cb58fd15 100644
--- a/modules/stitching/src/exposure_compensate.cpp
+++ b/modules/stitching/src/exposure_compensate.cpp
@@ -129,7 +129,7 @@ void GainCompensator::singleFeed(const std::vector<Point> &corners, const std::v
     const int num_images = static_cast<int>(images.size());
     Mat_<int> N(num_images, num_images); N.setTo(0);
     Mat_<double> I(num_images, num_images); I.setTo(0);
-    Mat_<bool> skip(num_images, 1); skip.setTo(true);
+    Mat_<uchar> skip(num_images, 1); skip.setTo(1);
 
     Mat subimg1, subimg2;
     Mat_<uchar> submask1, submask2, intersect;
diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp
index cd02766148..4c6cf95858 100644
--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
@@ -72,10 +72,10 @@ int randomType(RNG& rng, _OutputArray::DepthMask typeMask, int minChannels, int
 {
     int channels = rng.uniform(minChannels, maxChannels+1);
     int depth = 0;
-    CV_Assert((typeMask & _OutputArray::DEPTH_MASK_ALL_16F) != 0);
+    CV_Assert((typeMask & _OutputArray::DEPTH_MASK_ALL) != 0);
     for(;;)
     {
-        depth = rng.uniform(CV_8U, CV_16F+1);
+        depth = rng.uniform(CV_8U, CV_DEPTH_CURR_MAX);
         if( ((1 << depth) & typeMask) != 0 )
             break;
     }
@@ -246,8 +246,43 @@ convert_(const _Tp1* src, _Tp2* dst, size_t total, double alpha, double beta)
             dst[i] = saturate_cast<_Tp2>(src[i]*alpha + beta);
 }
 
+template<typename _Tp1> inline void
+convert_to_bool(const _Tp1* src, bool* dst,
+                size_t total, double alpha, double beta)
+{
+    size_t i;
+    if( alpha == 1 && beta == 0 )
+        for( i = 0; i < total; i++ )
+            dst[i] = src[i] != 0;
+    else if( beta == 0 )
+        for( i = 0; i < total; i++ )
+            dst[i] = src[i]*alpha != 0;
+    else
+        for( i = 0; i < total; i++ )
+            dst[i] = src[i]*alpha + beta != 0;
+}
+
+template<typename _Tp2>
+inline void
+convert_(const bool* src_, _Tp2* dst,
+         size_t total, double alpha, double beta)
+{
+    size_t i;
+    const uint8_t* src = (const uint8_t*)src_;
+    if( alpha == 1 && beta == 0 )
+        for( i = 0; i < total; i++ )
+            dst[i] = saturate_cast<_Tp2>(src[i] != 0);
+    else if( beta == 0 )
+        for( i = 0; i < total; i++ )
+            dst[i] = saturate_cast<_Tp2>((src[i] != 0)*alpha);
+    else
+        for( i = 0; i < total; i++ )
+            dst[i] = saturate_cast<_Tp2>((src[i] != 0)*alpha + beta);
+}
+
 template<typename _Tp> inline void
-convertTo(const _Tp* src, void* dst, int dtype, size_t total, double alpha, double beta)
+convertTo(const _Tp* src, void* dst, int dtype,
+          size_t total, double alpha, double beta)
 {
     switch( CV_MAT_DEPTH(dtype) )
     {
@@ -263,6 +298,9 @@ convertTo(const _Tp* src, void* dst, int dtype, size_t total, double alpha, doub
     case CV_16S:
         convert_(src, (short*)dst, total, alpha, beta);
         break;
+    case CV_32U:
+        convert_(src, (unsigned*)dst, total, alpha, beta);
+        break;
     case CV_32S:
         convert_(src, (int*)dst, total, alpha, beta);
         break;
@@ -272,16 +310,35 @@ convertTo(const _Tp* src, void* dst, int dtype, size_t total, double alpha, doub
     case CV_64F:
         convert_(src, (double*)dst, total, alpha, beta);
         break;
+    case CV_64U:
+        convert_(src, (uint64_t*)dst, total, alpha, beta);
+        break;
+    case CV_64S:
+        convert_(src, (int64_t*)dst, total, alpha, beta);
+        break;
+    case CV_16F:
+        convert_(src, (cv::float16_t*)dst, total, alpha, beta);
+        break;
+    case CV_16BF:
+        convert_(src, (cv::bfloat16_t*)dst, total, alpha, beta);
+        break;
+    case CV_Bool:
+        convert_to_bool(src, (bool*)dst, total, alpha, beta);
+        break;
     default:
         CV_Assert(0);
     }
 }
 
-void convert(const Mat& src, cv::OutputArray _dst, int dtype, double alpha, double beta)
+void convert(const Mat& src, cv::OutputArray _dst,
+             int dtype, double alpha, double beta)
 {
     if (dtype < 0) dtype = _dst.depth();
 
-    dtype = CV_MAKETYPE(CV_MAT_DEPTH(dtype), src.channels());
+    int sdepth = src.depth();
+    int ddepth = CV_MAT_DEPTH(dtype);
+
+    dtype = CV_MAKETYPE(ddepth, src.channels());
     _dst.create(src.dims, &src.size[0], dtype);
     Mat dst = _dst.getMat();
     if( alpha == 0 )
@@ -307,7 +364,7 @@ void convert(const Mat& src, cv::OutputArray _dst, int dtype, double alpha, doub
         const uchar* sptr = planes[0].ptr();
         uchar* dptr = planes[1].ptr();
 
-        switch( src.depth() )
+        switch( sdepth )
         {
         case CV_8U:
             convertTo((const uchar*)sptr, dptr, dtype, total, alpha, beta);
@@ -315,12 +372,18 @@ void convert(const Mat& src, cv::OutputArray _dst, int dtype, double alpha, doub
         case CV_8S:
             convertTo((const schar*)sptr, dptr, dtype, total, alpha, beta);
             break;
+        case CV_Bool:
+            convertTo((const bool*)sptr, dptr, dtype, total, alpha, beta);
+            break;
         case CV_16U:
             convertTo((const ushort*)sptr, dptr, dtype, total, alpha, beta);
             break;
         case CV_16S:
             convertTo((const short*)sptr, dptr, dtype, total, alpha, beta);
             break;
+        case CV_32U:
+            convertTo((const unsigned*)sptr, dptr, dtype, total, alpha, beta);
+            break;
         case CV_32S:
             convertTo((const int*)sptr, dptr, dtype, total, alpha, beta);
             break;
@@ -330,6 +393,20 @@ void convert(const Mat& src, cv::OutputArray _dst, int dtype, double alpha, doub
         case CV_64F:
             convertTo((const double*)sptr, dptr, dtype, total, alpha, beta);
             break;
+        case CV_64U:
+            convertTo((const uint64_t*)sptr, dptr, dtype, total, alpha, beta);
+            break;
+        case CV_64S:
+            convertTo((const int64_t*)sptr, dptr, dtype, total, alpha, beta);
+            break;
+        case CV_16F:
+            convertTo((const cv::float16_t*)sptr, dptr, dtype, total, alpha, beta);
+            break;
+        case CV_16BF:
+            convertTo((const cv::bfloat16_t*)sptr, dptr, dtype, total, alpha, beta);
+            break;
+        default:
+            CV_Error(CV_StsNotImplemented, "unknown/unsupported depth");
         }
     }
 }
@@ -1351,7 +1428,7 @@ double norm(InputArray _src, int normType, InputArray _mask)
 double norm(InputArray _src1, InputArray _src2, int normType, InputArray _mask)
 {
     Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
-    if( src1.depth() == CV_16F )
+    if( src1.depth() == CV_16F || src1.depth() == CV_16BF )
     {
         Mat src1_32f, src2_32f;
         src1.convertTo(src1_32f, CV_32F);
@@ -1769,10 +1846,10 @@ cmpUlpsInt_(const _Tp* src1, const _Tp* src2, size_t total, int imaxdiff,
            size_t startidx, size_t& idx)
 {
     size_t i;
-    int realmaxdiff = 0;
+    int64_t realmaxdiff = 0;
     for( i = 0; i < total; i++ )
     {
-        int diff = std::abs(src1[i] - src2[i]);
+        int64_t diff = (int64_t)std::abs((int64_t)src1[i] - (int64_t)src2[i]);
         if( realmaxdiff < diff )
         {
             realmaxdiff = diff;
@@ -1780,7 +1857,7 @@ cmpUlpsInt_(const _Tp* src1, const _Tp* src2, size_t total, int imaxdiff,
                 idx = i + startidx;
         }
     }
-    return realmaxdiff;
+    return (double)realmaxdiff;
 }
 
 
@@ -2008,7 +2085,7 @@ int cmpEps( const Mat& arr_, const Mat& refarr_, double* _realmaxdiff,
 {
     Mat arr = arr_, refarr = refarr_;
     CV_Assert( arr.type() == refarr.type() && arr.size == refarr.size );
-    if( arr.depth() == CV_16F )
+    if( arr.depth() == CV_16F || arr.depth() == CV_16BF )
     {
         Mat arr32f, refarr32f;
         arr.convertTo(arr32f, CV_32F);
@@ -2017,7 +2094,8 @@ int cmpEps( const Mat& arr_, const Mat& refarr_, double* _realmaxdiff,
         refarr = refarr32f;
     }
 
-    int ilevel = refarr.depth() <= CV_32S ? cvFloor(success_err_level) : 0;
+    int depth = refarr.depth();
+    int ilevel = depth <= CV_32S || depth == CV_32U || depth == CV_64U || depth == CV_64S ? cvFloor(success_err_level) : 0;
     int result = CMP_EPS_OK;
 
     const Mat *arrays[]={&arr, &refarr, 0};
@@ -2025,14 +2103,13 @@ int cmpEps( const Mat& arr_, const Mat& refarr_, double* _realmaxdiff,
     NAryMatIterator it(arrays, planes);
     size_t total = planes[0].total()*planes[0].channels(), j = total;
     size_t i, nplanes = it.nplanes;
-    int depth = arr.depth();
     size_t startidx = 1, idx = 0;
     double realmaxdiff = 0, maxval = 0;
 
     if(_realmaxdiff)
         *_realmaxdiff = 0;
 
-    if( refarr.depth() >= CV_32F && !element_wise_relative_error )
+    if( !CV_IS_INT_TYPE(depth) && !element_wise_relative_error )
     {
         maxval = cvtest::norm( refarr, NORM_INF );
         maxval = MAX(maxval, 1.);
@@ -2048,6 +2125,9 @@ int cmpEps( const Mat& arr_, const Mat& refarr_, double* _realmaxdiff,
         case CV_8U:
             realmaxdiff = cmpUlpsInt_((const uchar*)sptr1, (const uchar*)sptr2, total, ilevel, startidx, idx);
             break;
+        case CV_Bool:
+            realmaxdiff = cmpUlpsInt_((const uchar*)sptr1, (const uchar*)sptr2, total, ilevel, startidx, idx);
+            break;
         case CV_8S:
             realmaxdiff = cmpUlpsInt_((const schar*)sptr1, (const schar*)sptr2, total, ilevel, startidx, idx);
             break;
@@ -2060,6 +2140,15 @@ int cmpEps( const Mat& arr_, const Mat& refarr_, double* _realmaxdiff,
         case CV_32S:
             realmaxdiff = cmpUlpsInt_((const int*)sptr1, (const int*)sptr2, total, ilevel, startidx, idx);
             break;
+        case CV_32U:
+            realmaxdiff = cmpUlpsInt_((const unsigned*)sptr1, (const unsigned*)sptr2, total, ilevel, startidx, idx);
+            break;
+        case CV_64S:
+            realmaxdiff = cmpUlpsInt_((const int64_t*)sptr1, (const int64_t*)sptr2, total, ilevel, startidx, idx);
+            break;
+        case CV_64U:
+            realmaxdiff = cmpUlpsInt_((const uint64_t*)sptr1, (const uint64_t*)sptr2, total, ilevel, startidx, idx);
+            break;
         case CV_32F:
             for( j = 0; j < total; j++ )
             {
@@ -2887,7 +2976,7 @@ std::ostream& operator << (std::ostream& out, const MatInfo& m)
         out << "<Empty>";
     else
     {
-        static const char* depthstr[] = {"8u", "8s", "16u", "16s", "32s", "32f", "64f", "?"};
+        static const char* depthstr[] = {"8u", "8s", "16u", "16s", "32s", "32f", "64f", "16f", "16bf", "Bool", "64u", "64s", "32u", "?", "?", "?"};
         out << depthstr[m.m->depth()] << "C" << m.m->channels() << " " << m.m->dims << "-dim (";
         for( int i = 0; i < m.m->dims; i++ )
             out << m.m->size[i] << (i < m.m->dims-1 ? " x " : ")");
@@ -2930,7 +3019,6 @@ writeElems(std::ostream& out, const void* data, int nelems, int starpos)
     }
 }
 
-
 static void writeElems(std::ostream& out, const void* data, int nelems, int depth, int starpos)
 {
     if(depth == CV_8U)
@@ -2943,6 +3031,28 @@ static void writeElems(std::ostream& out, const void* data, int nelems, int dept
         writeElems<short, int>(out, data, nelems, starpos);
     else if(depth == CV_32S)
         writeElems<int, int>(out, data, nelems, starpos);
+    else if(depth == CV_32U)
+        writeElems<unsigned, unsigned>(out, data, nelems, starpos);
+    else if(depth == CV_64U)
+        writeElems<uint64_t, uint64_t>(out, data, nelems, starpos);
+    else if(depth == CV_64S)
+        writeElems<int64_t, int64_t>(out, data, nelems, starpos);
+    else if(depth == CV_Bool)
+        writeElems<bool, int>(out, data, nelems, starpos);
+    else if(depth == CV_16F)
+    {
+        std::streamsize pp = out.precision();
+        out.precision(4);
+        writeElems<cv::float16_t, float>(out, data, nelems, starpos);
+        out.precision(pp);
+    }
+    else if(depth == CV_16BF)
+    {
+        std::streamsize pp = out.precision();
+        out.precision(4);
+        writeElems<cv::bfloat16_t, float>(out, data, nelems, starpos);
+        out.precision(pp);
+    }
     else if(depth == CV_32F)
     {
         std::streamsize pp = out.precision();
diff --git a/modules/ts/src/ts_perf.cpp b/modules/ts/src/ts_perf.cpp
index 39147228b8..09b9ac1a9b 100644
--- a/modules/ts/src/ts_perf.cpp
+++ b/modules/ts/src/ts_perf.cpp
@@ -465,6 +465,15 @@ void Regression::verify(cv::FileNode node, cv::InputArray array, double eps, ERR
 {
     int expected_kind = (int)node["kind"];
     int expected_type = (int)node["type"];
+    int array_type = array.type();
+    if (array_type != expected_type) {
+        // temporary hack; we optimistically assume that type in the computed and expected array should be the same.
+        // if they are different, it must be because of the change in type representation between OpenCV 5.x and OpenCV 2.x,3.x,4.x.
+        // need to add "type5" or something like that and use it in the newer files. Then type will always mean 'earlier than 5.x type'.
+        int depth = expected_type & 7;
+        int channels = ((expected_type >> 3) & 127) + 1;
+        expected_type = CV_MAKETYPE(depth, channels);
+    }
     ASSERT_EQ(expected_kind, array.kind()) << "  Argument \"" << node.name() << "\" has unexpected kind";
     ASSERT_EQ(expected_type, array.type()) << "  Argument \"" << node.name() << "\" has unexpected type";
 
diff --git a/modules/videoio/src/backend_plugin.cpp b/modules/videoio/src/backend_plugin.cpp
index 71756ac158..5e65137cd4 100644
--- a/modules/videoio/src/backend_plugin.cpp
+++ b/modules/videoio/src/backend_plugin.cpp
@@ -535,6 +535,12 @@ public:
         cv::_OutputArray* dst = static_cast<cv::_OutputArray*>(userdata);
         if (!dst)
             return CV_ERROR_FAIL;
+        int depth = CV_MAT_DEPTH(type);
+        // [TODO] Remove this condition after rebuilding plugins or add a new
+        // version of plugins. Convert type from the old one to the new one (5 bits)
+        if (depth > 7) {
+            type = CV_MAKETYPE((type & 7), (type >> 3) + 1);
+        }
         cv::Mat(cv::Size(width, height), type, (void*)data, step).copyTo(*dst);
         return CV_ERROR_OK;
     }
diff --git a/modules/videoio/test/test_precomp.hpp b/modules/videoio/test/test_precomp.hpp
index b4f340897e..9bd613d8f0 100644
--- a/modules/videoio/test/test_precomp.hpp
+++ b/modules/videoio/test/test_precomp.hpp
@@ -54,7 +54,11 @@ static inline void PrintTo(const cv::VideoCaptureAPIs& api, std::ostream* os)
 
 inline std::string fourccToString(int fourcc)
 {
-    return cv::format("%c%c%c%c", fourcc & 255, (fourcc >> 8) & 255, (fourcc >> 16) & 255, (fourcc >> 24) & 255);
+    return cv::format("%c%c%c%c",
+        (char)(fourcc & 255),
+        (char)((fourcc >> 8) & 255),
+        (char)((fourcc >> 16) & 255),
+        (char)((fourcc >> 24) & 255));
 }
 
 inline std::string fourccToStringSafe(int fourcc)
@@ -71,19 +75,19 @@ inline int fourccFromString(const std::string &fourcc)
     return cv::VideoWriter::fourcc(fourcc[0], fourcc[1], fourcc[2], fourcc[3]);
 }
 
-inline void generateFrame(int i, int FRAME_COUNT, cv::Mat & frame)
+inline void generateFrame(int i, int frame_count, cv::Mat & frame)
 {
     using namespace cv;
     using namespace std;
-    int offset = (((i * 5) % FRAME_COUNT) - FRAME_COUNT / 2) * (frame.cols / 2) / FRAME_COUNT;
+    int offset = (((i * 5) % frame_count) - frame_count / 2) * (frame.cols / 2) / frame_count;
     frame(cv::Rect(0, 0, frame.cols / 2 + offset, frame.rows)) = Scalar(255, 255, 255);
     frame(cv::Rect(frame.cols / 2 + offset, 0, frame.cols - frame.cols / 2 - offset, frame.rows)) = Scalar(0, 0, 0);
-    ostringstream buf; buf << "Frame " << setw(2) << setfill('0') << i + 1;
+    std::string str = cv::format("%02d", i+1);
     int baseLine = 0;
-    Size box = getTextSize(buf.str(), FONT_HERSHEY_COMPLEX, 2, 5, &baseLine);
-    putText(frame, buf.str(), Point((frame.cols - box.width) / 2, (frame.rows - box.height) / 2 + baseLine),
+    Size box = getTextSize(str, FONT_HERSHEY_COMPLEX, 2, 5, &baseLine);
+    putText(frame, str, Point((frame.cols - box.width) / 2, (frame.rows - box.height) / 2 + baseLine),
             FONT_HERSHEY_COMPLEX, 2, Scalar(0, 0, 255), 5, LINE_AA);
-    Point p(i * frame.cols / (FRAME_COUNT - 1), i * frame.rows / (FRAME_COUNT - 1));
+    Point p(i * frame.cols / (frame_count - 1), i * frame.rows / (frame_count - 1));
     circle(frame, p, 50, Scalar(200, 25, 55), 8, LINE_AA);
 #if 0
     imshow("frame", frame);