Merge branch 4.x

2025-07-25 14:47:07 +08:00 · 2024-02-12 14:20:35 +03:00 · 2024-02-12 14:20:35 +03:00 · 3a55f50133
commit 3a55f50133
parent 1d18aba587 76548e29bd
131 changed files with 3446 additions and 864 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -65,6 +65,10 @@ if(POLICY CMP0068)
  cmake_policy(SET CMP0068 NEW)  # CMake 3.9+: `RPATH` settings on macOS do not affect `install_name`.
 endif()

+if(POLICY CMP0071)
+  cmake_policy(SET CMP0071 NEW)  # CMake 3.10+: Let `AUTOMOC` and `AUTOUIC` process `GENERATED` files.
+endif()
+
 if(POLICY CMP0075)
  cmake_policy(SET CMP0075 NEW)  # CMake 3.12+: Include file check macros honor `CMAKE_REQUIRED_LIBRARIES`
 endif()
@ -1870,7 +1874,7 @@ if(BUILD_JAVA)
    status("    JNI:"         JNI_INCLUDE_DIRS    THEN "${JNI_INCLUDE_DIRS}"                                       ELSE NO)
  endif()
  status("    Java wrappers:" HAVE_opencv_java                                                            THEN "YES (${OPENCV_JAVA_SDK_BUILD_TYPE})" ELSE NO)
-  status("    Java tests:"    BUILD_TESTS AND opencv_test_java_BINARY_DIR                                 THEN YES ELSE NO)
+  status("    Java tests:"    BUILD_TESTS AND (opencv_test_java_BINARY_DIR OR opencv_test_android_BINARY_DIR) THEN YES ELSE NO)
 endif()

 # ========================== Objective-C =======================
--- a/cmake/android/android_gradle_projects.cmake
+++ b/cmake/android/android_gradle_projects.cmake
@ -141,6 +141,8 @@ if (gradle.opencv_source == 'sdk_path') {
 ")

 ocv_check_environment_variables(OPENCV_GRADLE_VERBOSE_OPTIONS)
+ocv_update(OPENCV_GRADLE_VERBOSE_OPTIONS "-i")
+separate_arguments(OPENCV_GRADLE_VERBOSE_OPTIONS UNIX_COMMAND "${OPENCV_GRADLE_VERBOSE_OPTIONS}")

 macro(add_android_project target path)
  get_filename_component(__dir "${path}" NAME)
@ -175,7 +177,6 @@ include ':${__dir}'
  if (BUILD_ANDROID_EXAMPLES)
    # build apk
    set(APK_FILE "${ANDROID_BUILD_BASE_DIR}/${__dir}/build/outputs/apk/release/${__dir}-${ANDROID_ABI}-release-unsigned.apk")
-    ocv_update(OPENCV_GRADLE_VERBOSE_OPTIONS "-i")
    add_custom_command(
        OUTPUT "${APK_FILE}" "${OPENCV_DEPHELPER}/android_sample_${__dir}"
        COMMAND ./gradlew ${OPENCV_GRADLE_VERBOSE_OPTIONS} "${__dir}:assemble"
--- a/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown
+++ b/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown
@ -9,46 +9,224 @@ YOLO DNNs  {#tutorial_dnn_yolo}
 |    |    |
 | -: | :- |
 | Original author | Alessandro de Oliveira Faria |
-| Compatibility | OpenCV >= 3.3.1 |
+| Extended by     | Abduragim Shtanchaev |
+| Compatibility   | OpenCV >= 4.9.0 |

-Introduction
------------

-In this text you will learn how to use opencv_dnn module using yolo_object_detection (Sample of using OpenCV dnn module in real time with device capture, video and image).
+Running pre-trained YOLO model in OpenCV
+----------------------------------------

-We will demonstrate results of this example on the following picture.
-![Picture example](images/yolo.jpg)
+Deploying pre-trained models is a common task in machine learning, particularly when working with
+hardware that does not support certain frameworks like PyTorch. This guide provides a comprehensive
+overview of exporting pre-trained YOLO family models from PyTorch and deploying them using OpenCV's
+DNN framework. For demonstration purposes, we will focus on the [YOLOX](https://github.com/Megvii-BaseDetection/YOLOX/blob/main)
+model, but the methodology applies to other supported models.

-Examples
--------
+@note Currently, OpenCV supports the following YOLO models:
+- [YOLOX](https://github.com/Megvii-BaseDetection/YOLOX/blob/main),
+- [YoloNas](https://github.com/Deci-AI/super-gradients/tree/master),
+- [YOLOv8](https://github.com/ultralytics/ultralytics/tree/main),
+- [YOLOv7](https://github.com/WongKinYiu/yolov7/tree/main),
+- [YOLOv6](https://github.com/meituan/YOLOv6/blob/main),
+- [YOLOv5](https://github.com/ultralytics/yolov5),
+- [YOLOv4](https://github.com/Tianxiaomo/pytorch-YOLOv4).
+
+This support includes pre and post-processing routines specific to these models. While other older
+version of YOLO are also supported by OpenCV in Darknet format, they are out of the scope of this tutorial.
+
+
+Assuming that we have successfully trained YOLOX model, the subsequent step involves exporting and
+running this model with OpenCV. There are several critical considerations to address before
+proceeding with this process. Let's delve into these aspects.
+
+### YOLO's Pre-proccessing & Output
+
+Understanding the nature of inputs and outputs associated with YOLO family detectors is pivotal.
+These detectors, akin to most Deep Neural Networks (DNN), typically exhibit variation in input
+sizes contingent upon the model's scale.
+
+| Model Scale  | Input Size   |
+|--------------|--------------|
+| Small Models <sup>[1](https://github.com/Megvii-BaseDetection/YOLOX/tree/main#standard-models)</sup>| 416x416      |
+| Midsize Models <sup>[2](https://github.com/Megvii-BaseDetection/YOLOX/tree/main#standard-models)</sup>| 640x640    |
+| Large Models <sup>[3](https://github.com/meituan/YOLOv6/tree/main#benchmark)</sup>| 1280x1280    |
+
+This table provides a quick reference to understand the different input dimensions commonly used in
+various YOLO models inputs. These are standard input shapes. Make sure you use input size that you
+trained model with, if it is differed from from the size mentioned in the table.
+
+The next critical element in the process involves understanding the specifics of image pre-processing
+for YOLO detectors. While the fundamental pre-processing approach remains consistent across the YOLO
+family, there are subtle yet crucial differences that must be accounted for to avoid any degradation
+in performance. Key among these are the `resize type` and the `padding value` applied post-resize.
+For instance, the [YOLOX model](https://github.com/Megvii-BaseDetection/YOLOX/blob/ac58e0a5e68e57454b7b9ac822aced493b553c53/yolox/data/data_augment.py#L142)
+utilizes a `LetterBox` resize method and a padding value of `114.0`. It is imperative to ensure that
+these parameters, along with the normalization constants, are appropriately matched to the model being
+exported.
+
+Regarding the model's output, it typically takes the form of a tensor with dimensions [BxNxC+5] or
+[BxNxC+4], where 'B' represents the batch size, 'N' denotes the number of anchors, and 'C' signifies
+the number of classes (for instance, 80 classes if the model is trained on the COCO dataset).
+The additional 5 in the former tensor structure corresponds to the objectness score (obj), confidence
+score (conf), and the bounding box coordinates (cx, cy, w, h). Notably, the YOLOv8 model's output
+is shaped as [BxNxC+4], where there is no explicit objectness score, and the object score is directly
+inferred from the class score. For the YOLOX model, specifically, it is also necessary to incorporate
+anchor points to rescale predictions back to the image domain. This step will be integrated into
+the ONNX graph, a process that we will detail further in the subsequent sections.
+
+
+### PyTorch Model Export
+
+Now that we know know the parameters of the pre-precessing we can go on and export the model from
+Pytorch to ONNX graph. Since in this tutorial we are using YOLOX as our sample model, lets use its
+export for demonstration purposes (the process is  identical for the rest of the YOLO detectors).
+To exporting YOLOX we can just use [export script](https://github.com/Megvii-BaseDetection/YOLOX/blob/ac58e0a5e68e57454b7b9ac822aced493b553c53/tools/export_onnx.py). Particularly we need following commands:
+
+@code{.bash}
+git clone https://github.com/Megvii-BaseDetection/YOLOX.git
+cd YOLOX
+wget https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_s.pth # download pre-trained weights
+python3 -m tools.export_onnx --output-name yolox_s.onnx -n yolox-s -c yolox_s.pth --decode_in_inference
+@endcode
+
+**NOTE:** Here `--decode_in_inference` is to include anchor box creation in the ONNX graph itself.
+It sets [this value](https://github.com/Megvii-BaseDetection/YOLOX/blob/ac58e0a5e68e57454b7b9ac822aced493b553c53/yolox/models/yolo_head.py#L210C16-L210C39)
+to `True`, which subsequently includes anchor generation function.
+
+Below we demonstrated the minimal version of the export script (which could be used for models other
+than YOLOX) in case it is needed. However, usually each YOLO repository has predefined export script.
+
+@code{.py}
+    import onnx
+    import torch
+    from onnxsim import simplify
+
+    # load the model state dict
+    ckpt = torch.load(ckpt_file, map_location="cpu")
+    model.load_state_dict(ckpt)
+
+    # prepare dummy input
+    dummy_input = torch.randn(args.batch_size, 3, exp.test_size[0], exp.test_size[1])
+
+    #export the model
+    torch.onnx._export(
+        model,
+        dummy_input,
+        "yolox.onnx",
+        input_names=["input"],
+        output_names=["output"],
+        dynamic_axes={"input": {0: 'batch'},
+                      "output": {0: 'batch'}})
+
+    # use onnx-simplifier to reduce reduent model.
+    onnx_model = onnx.load(args.output_name)
+    model_simp, check = simplify(onnx_model)
+    assert check, "Simplified ONNX model could not be validated"
+    onnx.save(model_simp, args.output_name)
+@endcode
+
+### Running Yolo ONNX detector with OpenCV Sample
+
+Once we have our ONNX graph of the model, we just simply can run with OpenCV's sample. To that we need to make sure:
+
+1. OpenCV is build with -DBUILD_EXAMLES=ON flag.
+2. Navigate to the OpenCV's `build` directory
+3. Run the following command:
+
+@code{.cpp}
+./bin/example_dnn_yolo_detector --input=<path_to_your_input_file> \
+                                --classes=<path_to_class_names_file> \
+                                --thr=<confidence_threshold> \
+                                --nms=<non_maximum_suppression_threshold> \
+                                --mean=<mean_normalization_value> \
+                                --scale=<scale_factor> \
+                                --yolo=<yolo_model_version> \
+                                --padvalue=<padding_value> \
+                                --paddingmode=<padding_mode> \
+                                --backend=<computation_backend> \
+                                --target=<target_computation_device>
+@endcode

 VIDEO DEMO:
@youtube{NHtRlndE2cg}

-Source Code
-----------
+- --input: File path to your input image or video. If omitted, it will capture frames from a camera.
+- --classes: File path to a text file containing class names for object detection.
+- --thr: Confidence threshold for detection (e.g., 0.5).
+- --nms: Non-maximum suppression threshold (e.g., 0.4).
+- --mean: Mean normalization value (e.g., 0.0 for no mean normalization).
+- --scale: Scale factor for input normalization (e.g., 1.0).
+- --yolo: YOLO model version (e.g., YOLOv3, YOLOv4, etc.).
+- --padvalue: Padding value used in pre-processing (e.g., 114.0).
+- --paddingmode: Method for handling image resizing and padding. Options: 0 (resize without extra processing), 1 (crop after resize), 2 (resize with aspect ratio preservation).
+- --backend: Selection of computation backend (0 for automatic, 1 for Halide, 2 for OpenVINO, etc.).
+- --target: Selection of target computation device (0 for CPU, 1 for OpenCL, etc.).
+- --device: Camera device number (0 for default camera). If `--input` is not provided camera with index 0 will used by default.

-Use a universal sample for object detection models written
-[in C++](https://github.com/opencv/opencv/blob/5.x/samples/dnn/object_detection.cpp) and
-[in Python](https://github.com/opencv/opencv/blob/5.x/samples/dnn/object_detection.py) languages
+Here `mean`, `scale`, `padvalue`, `paddingmode` should exactly match those that we discussed
+in pre-processing section in order for the model to match result in PyTorch

-Usage examples
--------------
+To demonstrate how to run OpenCV YOLO samples without your own pretrained model, follow these instructions:

-Execute in webcam:
+1. Ensure Python is installed on your platform.
+2. Confirm that OpenCV is built with the `-DBUILD_EXAMPLES=ON` flag.

-@code{.bash}
-
-$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392 --rgb
+Run the YOLOX detector(with default values):

+@code{.sh}
+git clone https://github.com/opencv/opencv_extra.git
+cd opencv_extra/testdata/dnn
+python download_models.py yolox_s_inf_decoder
+cd ..
+export OPENCV_TEST_DATA_PATH=$(pwd)
+cd <build directory of OpenCV>
+./bin/example_dnn_yolo_detector
@endcode

-Execute with image or video file:
+This will execute the YOLOX detector with your camera. For YOLOv8 (for instance), follow these additional steps:

-@code{.bash}
-
-$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392 --input=[PATH-TO-IMAGE-OR-VIDEO-FILE] --rgb
+@code{.sh}
+cd opencv_extra/testdata/dnn
+python download_models.py yolov8
+cd ..
+export OPENCV_TEST_DATA_PATH=$(pwd)
+cd <build directory of OpenCV>

+./bin/example_dnn_yolo_detector --model=onnx/models/yolov8n.onnx --yolo=yolov8 --mean=0.0 --scale=0.003921568627 --paddingmode=2 --padvalue=144.0 --thr=0.5 --nms=0.4 --rgb=0
@endcode

-Questions and suggestions email to: Alessandro de Oliveira Faria cabelo@opensuse.org or OpenCV Team.
+
+### Building a Custom Pipeline
+
+Sometimes there is a need to make some custom adjustments in the inference pipeline. With OpenCV DNN
+module this is also quite easy to achieve. Below we will outline the sample implementation details:
+
+- Import required libraries
+
+@snippet samples/dnn/yolo_detector.cpp includes
+
+- Read ONNX graph and create neural network model:
+
+@snippet samples/dnn/yolo_detector.cpp read_net
+
+- Read image and pre-process it:
+
+@snippet samples/dnn/yolo_detector.cpp preprocess_params
+@snippet samples/dnn/yolo_detector.cpp preprocess_call
+@snippet samples/dnn/yolo_detector.cpp preprocess_call_func
+
+- Inference:
+
+@snippet samples/dnn/yolo_detector.cpp forward_buffers
+@snippet samples/dnn/yolo_detector.cpp forward
+
+- Post-Processing
+
+All post-processing steps are implemented in function `yoloPostProcess`. Please pay attention,
+that NMS step is not included into onnx graph. Sample uses OpenCV function for it.
+
+@snippet samples/dnn/yolo_detector.cpp postprocess
+
+- Draw predicted boxes
+
+@snippet samples/dnn/yolo_detector.cpp draw_boxes
--- a/modules/calib/src/calibinit.cpp
+++ b/modules/calib/src/calibinit.cpp
@ -156,7 +156,8 @@ struct ChessBoardQuad
    float edge_len; // quad edge len, in pix^2
    // neighbors and corners are synced, i.e., neighbor 0 shares corner 0
    ChessBoardCorner *corners[4]; // Coordinates of quad corners
-    struct ChessBoardQuad *neighbors[4]; // Pointers of quad neighbors
+    struct ChessBoardQuad *neighbors[4]; // Pointers of quad neighbors. M.b. sparse.
+    // Each neighbors element corresponds to quad corner, but not just sequential index.

    ChessBoardQuad(int group_idx_ = -1) :
        count(0),
@ -1701,12 +1702,12 @@ void ChessBoardDetector::findQuadNeighbors()
                    continue;

                // Check that each corner is a neighbor of different quads
-                for(j = 0; j < closest_quad->count; j++ )
+                for(j = 0; j < 4; j++ )
                {
                    if (closest_quad->neighbors[j] == &cur_quad)
                        break;
                }
-                if (j < closest_quad->count)
+                if (j < 4)
                    continue;

                // check whether the closest corner to closest_corner
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@ -556,6 +556,8 @@ The format of half precision floating point is defined in IEEE 754-2008.

@param src input array.
@param dst output array.
+
+@deprecated Use Mat::convertTo with CV_16F instead.
 */
 CV_EXPORTS_W void convertFp16(InputArray src, OutputArray dst);

--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@ -271,11 +271,11 @@ enum BorderTypes {
    BORDER_REFLECT     = 2, //!< `fedcba|abcdefgh|hgfedcb`
    BORDER_WRAP        = 3, //!< `cdefgh|abcdefgh|abcdefg`
    BORDER_REFLECT_101 = 4, //!< `gfedcb|abcdefgh|gfedcba`
-    BORDER_TRANSPARENT = 5, //!< `uvwxyz|abcdefgh|ijklmno`
+    BORDER_TRANSPARENT = 5, //!< `uvwxyz|abcdefgh|ijklmno` - Treats outliers as transparent.

    BORDER_REFLECT101  = BORDER_REFLECT_101, //!< same as BORDER_REFLECT_101
    BORDER_DEFAULT     = BORDER_REFLECT_101, //!< same as BORDER_REFLECT_101
-    BORDER_ISOLATED    = 16 //!< do not look outside of ROI
+    BORDER_ISOLATED    = 16 //!< Interpolation restricted within the ROI boundaries.
 };

 //! @} core_array
--- a/modules/core/include/opencv2/core/hal/intrin_lsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_lsx.hpp
@ -860,6 +860,7 @@ class v_lsx_palignr_u8_class<imm, false, true, false, false, false>
 public:
    inline __m128i operator()(const __m128i& a, const __m128i& b) const
    {
+        CV_UNUSED(b);
        return a;
    }
 };
@ -880,6 +881,7 @@ class v_lsx_palignr_u8_class<imm, false, false, false, true, false>
 public:
    inline __m128i operator()(const __m128i& a, const __m128i& b) const
    {
+        CV_UNUSED(a);
        return b;
    }
 };
--- a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
@ -490,12 +490,12 @@ inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)

 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
 {
-    return v_float32x4(vfmacc_vv_f32m1(c.val, a.val, b.val, 4));
+    return v_float32x4(vfmadd_vv_f32m1(a.val, b.val, c.val, 4));
 }

 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
 {
-    return v_int32x4(vmacc_vv_i32m1(c.val, a.val, b.val, 4));
+    return v_int32x4(vmadd_vv_i32m1(a.val, b.val, c.val, 4));
 }

 inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
@ -553,7 +553,7 @@ inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)

 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
 {
-    return v_float64x2(vfmacc_vv_f64m1(c.val, a.val, b.val, 2));
+    return v_float64x2(vfmadd_vv_f64m1(a.val, b.val, c.val, 2));
 }

 inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
@ -1429,7 +1429,7 @@ inline _Tpvec v_load_low(const _Tp* ptr) \
 inline _Tpvec v_load_aligned(const _Tp* ptr) \
 { return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 16))); } \
 inline _Tpvec v_load(const _Tp* ptr) \
-{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 16))); } \
+{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
 { vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 8);}\
 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
@ -1438,7 +1438,7 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
  a0 = vslidedown_vx_##len(a0, a.val, hnum, num);    \
  vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a0), 8);}\
 inline void v_store(_Tp* ptr, const _Tpvec& a) \
-{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \
+{ vse##elemsize##_v_##len(ptr, a.val, num); } \
 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
 { vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \
 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
@ -1469,7 +1469,7 @@ inline _Tpvec v_load_low(const _Tp* ptr) \
 inline _Tpvec v_load_aligned(const _Tp* ptr) \
 { return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 16)))); } \
 inline _Tpvec v_load(const _Tp* ptr) \
-{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 16)))); } \
+{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
 { vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 8);}\
 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
@ -1478,7 +1478,7 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
  a0 = vslidedown_vx_##len(a0, a.val, hnum, num);    \
  vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a0)), 8);}\
 inline void v_store(_Tp* ptr, const _Tpvec& a) \
-{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \
+{ vse##elemsize##_v_##len(ptr, a.val, num); } \
 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
 { vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \
 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
@ -2034,30 +2034,23 @@ void v_rshr_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
 OPENCV_HAL_IMPL_RISCVV_PACK_U(8, 16, 16, 8, unsigned char )
 OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short)

-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#endif

 // saturating multiply 8-bit, 16-bit
-#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, _Tpwvec)            \
-    inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b)  \
-    {                                                            \
-        _Tpwvec c, d;                                            \
-        v_mul_expand(a, b, c, d);                                \
-        return v_pack(c, d);                                     \
-    }                                                            \
-    inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)      \
+#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, num, mul, cvt)   \
+    inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
+    {                                                           \
+        auto res = mul(a.val, b.val, num);                      \
+        return _Tpvec(cvt(res, 0, num));                        \
+    }                                                           \
+    inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)     \
    { a = a * b; return a; }

-OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16,  v_int16x8)
-OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, v_uint16x8)
-OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int16x8,  v_int32x4)
-OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint16x8, v_uint32x4)
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16,  16, vwmul_vv_i16m2, vnclip_wx_i8m1)
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, 16, vwmulu_vv_u16m2, vnclipu_wx_u8m1)
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int16x8,  32, vwmul_vv_i32m2, vnclip_wx_i16m1)
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint16x8, 32, vwmulu_vv_u32m2, vnclipu_wx_u16m1)
+

-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
 static const signed char popCountTable[256] =
 {
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@ -127,6 +127,11 @@ public:
    CV_WRAP int singleFPConfig() const;
    CV_WRAP int halfFPConfig() const;

+    /// true if 'cl_khr_fp64' extension is available
+    CV_WRAP bool hasFP64() const;
+    /// true if 'cl_khr_fp16' extension is available
+    CV_WRAP bool hasFP16() const;
+
    CV_WRAP bool endianLittle() const;
    CV_WRAP bool errorCorrectionSupport() const;

--- a/modules/core/include/opencv2/core/opencl/opencl_info.hpp
+++ b/modules/core/include/opencv2/core/opencl/opencl_info.hpp
@ -141,13 +141,13 @@ static void dumpOpenCLInformation()
        DUMP_MESSAGE_STDOUT("    Max memory allocation size = " << maxMemAllocSizeStr);
        DUMP_CONFIG_PROPERTY("cv_ocl_current_maxMemAllocSize", device.maxMemAllocSize());

-        const char* doubleSupportStr = device.doubleFPConfig() > 0 ? "Yes" : "No";
+        const char* doubleSupportStr = device.hasFP64() ? "Yes" : "No";
        DUMP_MESSAGE_STDOUT("    Double support = " << doubleSupportStr);
-        DUMP_CONFIG_PROPERTY("cv_ocl_current_haveDoubleSupport", device.doubleFPConfig() > 0);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_haveDoubleSupport", device.hasFP64());

-        const char* halfSupportStr = device.halfFPConfig() > 0 ? "Yes" : "No";
+        const char* halfSupportStr = device.hasFP16() ? "Yes" : "No";
        DUMP_MESSAGE_STDOUT("    Half support = " << halfSupportStr);
-        DUMP_CONFIG_PROPERTY("cv_ocl_current_haveHalfSupport", device.halfFPConfig() > 0);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_haveHalfSupport", device.hasFP16());

        const char* isUnifiedMemoryStr = device.hostUnifiedMemory() ? "Yes" : "No";
        DUMP_MESSAGE_STDOUT("    Host unified memory = " << isUnifiedMemoryStr);
--- a/modules/core/include/opencv2/core/types.hpp
+++ b/modules/core/include/opencv2/core/types.hpp
@ -558,7 +558,7 @@ public:
    //! returns the minimal up-right integer rectangle containing the rotated rectangle
    CV_WRAP Rect boundingRect() const;
    //! returns the minimal (exact) floating point rectangle containing the rotated rectangle, not intended for use with images
-    Rect_<float> boundingRect2f() const;
+    CV_WRAP Rect2f boundingRect2f() const;
    //! returns the rectangle mass center
    CV_PROP_RW Point2f center;
    //! returns width and height of the rectangle
--- a/modules/core/include/opencv2/core/utils/plugin_loader.private.hpp
+++ b/modules/core/include/opencv2/core/utils/plugin_loader.private.hpp
@ -12,7 +12,7 @@

 #if defined(_WIN32)
 #include <windows.h>
-#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__)
+#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__) || defined(__EMSCRIPTEN__)
 #include <dlfcn.h>
 #endif

@ -65,7 +65,7 @@ void* getSymbol_(LibHandle_t h, const char* symbolName)
 {
 #if defined(_WIN32)
    return (void*)GetProcAddress(h, symbolName);
-#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__)
+#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__) || defined(__EMSCRIPTEN__)
    return dlsym(h, symbolName);
 #endif
 }
@ -79,7 +79,7 @@ LibHandle_t libraryLoad_(const FileSystemPath_t& filename)
 # else
    return LoadLibraryW(filename.c_str());
 #endif
-#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__)
+#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__) || defined(__EMSCRIPTEN__)
    void* handle = dlopen(filename.c_str(), RTLD_NOW);
    CV_LOG_IF_DEBUG(NULL, !handle, "dlopen() error: " << dlerror());
    return handle;
@ -91,7 +91,7 @@ void libraryRelease_(LibHandle_t h)
 {
 #if defined(_WIN32)
    FreeLibrary(h);
-#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__)
+#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__) || defined(__EMSCRIPTEN__)
    dlclose(h);
 #endif
 }
--- a/modules/core/perf/opencl/perf_matop.cpp
+++ b/modules/core/perf/opencl/perf_matop.cpp
@ -80,6 +80,187 @@ OCL_PERF_TEST_P(ConvertToFixture, ConvertTo,
    SANITY_CHECK(dst);
 }

+
+//#define RUN_CONVERTFP16
+static Size convertFP16_srcSize(4000, 4000);
+
+OCL_PERF_TEST(Core, ConvertFP32FP16MatMat)
+{
+    const Size srcSize = convertFP16_srcSize;
+    const int type = CV_32F;
+    const int dtype = CV_16F;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(srcSize, dtype);
+
+    Mat src(srcSize, type);
+    Mat dst(srcSize, dtype);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+#ifdef RUN_CONVERTFP16
+    OCL_TEST_CYCLE() convertFp16(src, dst);
+#else
+    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
+#endif
+
+    SANITY_CHECK_NOTHING();
+}
+
+OCL_PERF_TEST(Core, ConvertFP32FP16MatUMat)
+{
+    const Size srcSize = convertFP16_srcSize;
+    const int type = CV_32F;
+    const int dtype = CV_16F;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(srcSize, dtype);
+
+    Mat src(srcSize, type);
+    UMat dst(srcSize, dtype);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+#ifdef RUN_CONVERTFP16
+    OCL_TEST_CYCLE() convertFp16(src, dst);
+#else
+    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
+#endif
+
+    SANITY_CHECK_NOTHING();
+}
+
+OCL_PERF_TEST(Core, ConvertFP32FP16UMatMat)
+{
+    const Size srcSize = convertFP16_srcSize;
+    const int type = CV_32F;
+    const int dtype = CV_16F;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(srcSize, dtype);
+
+    UMat src(srcSize, type);
+    Mat dst(srcSize, dtype);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+#ifdef RUN_CONVERTFP16
+    OCL_TEST_CYCLE() convertFp16(src, dst);
+#else
+    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
+#endif
+
+    SANITY_CHECK_NOTHING();
+}
+
+OCL_PERF_TEST(Core, ConvertFP32FP16UMatUMat)
+{
+    const Size srcSize = convertFP16_srcSize;
+    const int type = CV_32F;
+    const int dtype = CV_16F;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(srcSize, dtype);
+
+    UMat src(srcSize, type);
+    UMat dst(srcSize, dtype);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+#ifdef RUN_CONVERTFP16
+    OCL_TEST_CYCLE() convertFp16(src, dst);
+#else
+    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
+#endif
+
+    SANITY_CHECK_NOTHING();
+}
+
+OCL_PERF_TEST(Core, ConvertFP16FP32MatMat)
+{
+    const Size srcSize = convertFP16_srcSize;
+    const int type = CV_16F;
+    const int dtype = CV_32F;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(srcSize, dtype);
+
+    Mat src(srcSize, type);
+    Mat dst(srcSize, dtype);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+#ifdef RUN_CONVERTFP16
+    OCL_TEST_CYCLE() convertFp16(src, dst);
+#else
+    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
+#endif
+
+    SANITY_CHECK_NOTHING();
+}
+
+OCL_PERF_TEST(Core, ConvertFP16FP32MatUMat)
+{
+    const Size srcSize = convertFP16_srcSize;
+    const int type = CV_16F;
+    const int dtype = CV_32F;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(srcSize, dtype);
+
+    Mat src(srcSize, type);
+    UMat dst(srcSize, dtype);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+#ifdef RUN_CONVERTFP16
+    OCL_TEST_CYCLE() convertFp16(src, dst);
+#else
+    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
+#endif
+
+    SANITY_CHECK_NOTHING();
+}
+
+OCL_PERF_TEST(Core, ConvertFP16FP32UMatMat)
+{
+    const Size srcSize = convertFP16_srcSize;
+    const int type = CV_16F;
+    const int dtype = CV_32F;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(srcSize, dtype);
+
+    UMat src(srcSize, type);
+    Mat dst(srcSize, dtype);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+#ifdef RUN_CONVERTFP16
+    OCL_TEST_CYCLE() convertFp16(src, dst);
+#else
+    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
+#endif
+
+    SANITY_CHECK_NOTHING();
+}
+
+OCL_PERF_TEST(Core, ConvertFP16FP32UMatUMat)
+{
+    const Size srcSize = convertFP16_srcSize;
+    const int type = CV_16F;
+    const int dtype = CV_32F;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(srcSize, dtype);
+
+    UMat src(srcSize, type);
+    UMat dst(srcSize, dtype);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+#ifdef RUN_CONVERTFP16
+    OCL_TEST_CYCLE() convertFp16(src, dst);
+#else
+    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
+#endif
+
+    SANITY_CHECK_NOTHING();
+}
+
+
 ///////////// CopyTo ////////////////////////

 typedef Size_MatType CopyToFixture;
--- a/modules/core/src/convert.dispatch.cpp
+++ b/modules/core/src/convert.dispatch.cpp
@ -80,52 +80,133 @@ static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int sdepth, int
    size_t globalsize[2] = { (size_t)src.cols * cn / kercn, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
 }
-#endif

-void Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const
+static bool ocl_convertTo(InputArray src_, OutputArray dst_, int ddepth, bool noScale, double alpha, double beta)
 {
    CV_INSTRUMENT_REGION();

-    if( empty() )
+    CV_Assert(ddepth >= 0);
+
+    int stype = src_.type();
+    int sdepth = CV_MAT_DEPTH(stype);
+    int cn = CV_MAT_CN(stype);
+
+    int dtype = CV_MAKETYPE(ddepth, cn);
+
+    int wdepth = (sdepth == CV_64F) ? CV_64F : CV_32F;
+
+    bool needDouble = sdepth == CV_64F || ddepth == CV_64F;
+    bool doubleCheck = true;
+    if (needDouble)
    {
-        _dst.release();
-        return;
+        doubleCheck = ocl::Device::getDefault().hasFP64();
+    }
+    bool halfCheck = true;
+    bool needHalf = sdepth == CV_16F || ddepth == CV_16F;
+    if (needHalf)
+    {
+        halfCheck = ocl::Device::getDefault().hasFP16();
    }

-    bool noScale = fabs(alpha-1) < DBL_EPSILON && fabs(beta) < DBL_EPSILON;
+    if (!doubleCheck)
+        return false;
+    if (!halfCheck)
+        return false;

-    if( _type < 0 )
-        _type = _dst.fixedType() ? _dst.type() : type();
+    const int rowsPerWI = 4;
+
+    char cvt[2][50];
+    ocl::Kernel k("convertTo", ocl::core::convert_oclsrc,
+                  format("-D srcT=%s -D WT=%s -D dstT=%s -D convertToWT=%s -D convertToDT=%s -D rowsPerWI=%d%s%s%s",
+                         ocl::typeToStr(sdepth), ocl::typeToStr(wdepth), ocl::typeToStr(ddepth),
+                         ocl::convertTypeStr(sdepth, wdepth, 1, cvt[0], sizeof(cvt[0])),
+                         ocl::convertTypeStr(wdepth, ddepth, 1, cvt[1], sizeof(cvt[1])),
+                         rowsPerWI,
+                         needDouble ? " -D DOUBLE_SUPPORT" : "",
+                         needHalf ? " -D HALF_SUPPORT" : "",
+                         noScale ? " -D NO_SCALE" : ""
+                  )
+    );
+
+    if (k.empty())
+        return false;
+
+    UMat src = src_.getUMat();
+    dst_.createSameSize(src_, dtype);
+    UMat dst = dst_.getUMat();
+
+    float alphaf = (float)alpha, betaf = (float)beta;
+
+    if (noScale)
+        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn));
+    else if (wdepth == CV_32F)
+        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn), alphaf, betaf);
    else
-        _type = CV_MAKETYPE(CV_MAT_DEPTH(_type), channels());
+        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn), alpha, beta);

-    int sdepth = depth(), ddepth = CV_MAT_DEPTH(_type);
-    if( sdepth == ddepth && noScale )
+    size_t globalsize[2] = {
+        (size_t)dst.cols * cn,
+        divUp((size_t)dst.rows, rowsPerWI)
+    };
+    if (!k.run(2, globalsize, NULL, false))
+        return false;
+
+    CV_IMPL_ADD(CV_IMPL_OCL);
+    return true;
+}
+#endif
+
+void Mat::convertTo(OutputArray dst, int type_, double alpha, double beta) const
+{
+    CV_INSTRUMENT_REGION();
+
+    if (empty())
    {
-        copyTo(_dst);
+        dst.release();
        return;
    }

+    int stype = type();
+    int sdepth = CV_MAT_DEPTH(stype);
+
+    int ddepth = sdepth;
+    if (type_ >= 0)
+        ddepth = CV_MAT_DEPTH(type_);
+    else
+        ddepth = dst.fixedType() ? dst.depth() : sdepth;
+
+    bool noScale = std::fabs(alpha - 1) < DBL_EPSILON && std::fabs(beta) < DBL_EPSILON;
+    if (sdepth == ddepth && noScale)
+    {
+        copyTo(dst);
+        return;
+    }
+
+    CV_OCL_RUN(dims <= 2 && dst.isUMat(),
+               ocl_convertTo(*this, dst, ddepth, noScale, alpha, beta))
+
+    int cn = channels();
+    int dtype = CV_MAKETYPE(ddepth, cn);
+
    Mat src = *this;
    bool allowTransposed = dims == 1 ||
-        _dst.kind() == _InputArray::STD_VECTOR ||
-        (_dst.fixedSize() && _dst.dims() == 1);
-    _dst.create( dims, size, _type, -1, allowTransposed );
-    Mat dst = _dst.getMat();
+        dst.kind() == _InputArray::STD_VECTOR ||
+        (dst.fixedSize() && dst.dims() == 1);
+    dst.create( dims, size, dtype, -1, allowTransposed );
+    Mat dstMat = dst.getMat();

    BinaryFunc func = noScale ? getConvertFunc(sdepth, ddepth) : getConvertScaleFunc(sdepth, ddepth);
    double scale[] = {alpha, beta};
-    int cn = channels();
    CV_Assert( func != 0 );

    if( dims <= 2 )
    {
-        Size sz = getContinuousSize2D(src, dst, cn);
-        func( src.data, src.step, 0, 0, dst.data, dst.step, sz, scale );
+        Size sz = getContinuousSize2D(src, dstMat, cn);
+        func(src.data, src.step, 0, 0, dstMat.data, dstMat.step, sz, scale);
    }
    else
    {
-        const Mat* arrays[] = {&src, &dst, 0};
+        const Mat* arrays[] = {&src, &dstMat, 0};
        uchar* ptrs[2] = {};
        NAryMatIterator it(arrays, ptrs);
        Size sz((int)(it.size*cn), 1);
@ -135,6 +216,44 @@ void Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) cons
    }
 }

+void UMat::convertTo(OutputArray dst, int type_, double alpha, double beta) const
+{
+    CV_INSTRUMENT_REGION();
+
+    if (empty())
+    {
+        dst.release();
+        return;
+    }
+
+#ifdef HAVE_OPENCL
+    int stype = type();
+    int sdepth = CV_MAT_DEPTH(stype);
+
+    int ddepth = sdepth;
+    if (type_ >= 0)
+        ddepth = CV_MAT_DEPTH(type_);
+    else
+        ddepth = dst.fixedType() ? dst.depth() : sdepth;
+
+    bool noScale = std::fabs(alpha - 1) < DBL_EPSILON && std::fabs(beta) < DBL_EPSILON;
+    if (sdepth == ddepth && noScale)
+    {
+        copyTo(dst);
+        return;
+    }
+
+    CV_OCL_RUN(dims <= 2,
+               ocl_convertTo(*this, dst, ddepth, noScale, alpha, beta))
+#endif // HAVE_OPENCL
+
+    UMat src = *this;  // Fake reference to itself.
+                       // Resolves issue 8693 in case of src == dst.
+    Mat m = getMat(ACCESS_READ);
+    m.convertTo(dst, type_, alpha, beta);
+    (void)src;
+}
+
 //==================================================================================================

 void convertFp16(InputArray _src, OutputArray _dst)
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@ -1604,6 +1604,9 @@ struct Device::Impl
            pos = pos2 + 1;
        }

+        khr_fp64_support_ = isExtensionSupported("cl_khr_fp64");
+        khr_fp16_support_ = isExtensionSupported("cl_khr_fp16");
+
        intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups");

        vendorName_ = getStrProp(CL_DEVICE_VENDOR);
@ -1692,7 +1695,9 @@ struct Device::Impl
    String version_;
    std::string extensions_;
    int doubleFPConfig_;
+    bool khr_fp64_support_;
    int halfFPConfig_;
+    bool khr_fp16_support_;
    bool hostUnifiedMemory_;
    int maxComputeUnits_;
    size_t maxWorkGroupSize_;
@ -1844,6 +1849,11 @@ int Device::singleFPConfig() const
 int Device::halfFPConfig() const
 { return p ? p->halfFPConfig_ : 0; }

+bool Device::hasFP64() const
+{ return p ? p->khr_fp64_support_ : false; }
+bool Device::hasFP16() const
+{ return p ? p->khr_fp16_support_ : false; }
+
 bool Device::endianLittle() const
 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }

--- a/modules/core/src/ocl_disabled.impl.hpp
+++ b/modules/core/src/ocl_disabled.impl.hpp
@ -67,6 +67,9 @@ int Device::doubleFPConfig() const { OCL_NOT_AVAILABLE(); }
 int Device::singleFPConfig() const { OCL_NOT_AVAILABLE(); }
 int Device::halfFPConfig() const { OCL_NOT_AVAILABLE(); }

+bool Device::hasFP64() const { OCL_NOT_AVAILABLE(); }
+bool Device::hasFP16() const { OCL_NOT_AVAILABLE(); }
+
 bool Device::endianLittle() const { OCL_NOT_AVAILABLE(); }
 bool Device::errorCorrectionSupport() const { OCL_NOT_AVAILABLE(); }

--- a/modules/core/src/opencl/convert.cl
+++ b/modules/core/src/opencl/convert.cl
@ -49,14 +49,21 @@
 #endif
 #endif

+#ifdef HALF_SUPPORT
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16:enable
+#endif
+#endif
+
+
 #define noconvert

 __kernel void convertTo(__global const uchar * srcptr, int src_step, int src_offset,
-                        __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                        __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols
 #ifndef NO_SCALE
-                        WT alpha, WT beta,
+                        , WT alpha, WT beta
 #endif
-                        int rowsPerWI)
+)
 {
    int x = get_global_id(0);
    int y0 = get_global_id(1) * rowsPerWI;
--- a/modules/core/src/umatrix.cpp
+++ b/modules/core/src/umatrix.cpp
@ -1268,70 +1268,10 @@ void UMat::copyTo(OutputArray _dst, InputArray _mask) const
    src.copyTo(_dst, _mask);
 }

-void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const
-{
-    CV_INSTRUMENT_REGION();

-    bool noScale = std::fabs(alpha - 1) < DBL_EPSILON && std::fabs(beta) < DBL_EPSILON;
-    int stype = type(), cn = CV_MAT_CN(stype);
-
-    if( _type < 0 )
-        _type = _dst.fixedType() ? _dst.type() : stype;
-    else
-        _type = CV_MAKETYPE(CV_MAT_DEPTH(_type), cn);
-
-    int sdepth = CV_MAT_DEPTH(stype), ddepth = CV_MAT_DEPTH(_type);
-    if( sdepth == ddepth && noScale )
-    {
-        copyTo(_dst);
-        return;
-    }
-#ifdef HAVE_OPENCL
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
-    bool needDouble = sdepth == CV_64F || ddepth == CV_64F;
-    if( dims <= 2 && cn && _dst.isUMat() && ocl::useOpenCL() &&
-            ((needDouble && doubleSupport) || !needDouble) )
-    {
-        int wdepth = std::max(CV_32F, sdepth), rowsPerWI = 4;
-
-        char cvt[2][50];
-        ocl::Kernel k("convertTo", ocl::core::convert_oclsrc,
-                      format("-D srcT=%s -D WT=%s -D dstT=%s -D convertToWT=%s -D convertToDT=%s%s%s",
-                             ocl::typeToStr(sdepth), ocl::typeToStr(wdepth), ocl::typeToStr(ddepth),
-                             ocl::convertTypeStr(sdepth, wdepth, 1, cvt[0], sizeof(cvt[0])),
-                             ocl::convertTypeStr(wdepth, ddepth, 1, cvt[1], sizeof(cvt[1])),
-                             doubleSupport ? " -D DOUBLE_SUPPORT" : "", noScale ? " -D NO_SCALE" : ""));
-        if (!k.empty())
-        {
-            UMat src = *this;
-            _dst.create( size(), _type );
-            UMat dst = _dst.getUMat();
-
-            float alphaf = (float)alpha, betaf = (float)beta;
-            ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
-                    dstarg = ocl::KernelArg::WriteOnly(dst, cn);
-
-            if (noScale)
-                k.args(srcarg, dstarg, rowsPerWI);
-            else if (wdepth == CV_32F)
-                k.args(srcarg, dstarg, alphaf, betaf, rowsPerWI);
-            else
-                k.args(srcarg, dstarg, alpha, beta, rowsPerWI);
-
-            size_t globalsize[2] = { (size_t)dst.cols * cn, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
-            if (k.run(2, globalsize, NULL, false))
-            {
-                CV_IMPL_ADD(CV_IMPL_OCL);
-                return;
-            }
-        }
-    }
-#endif
-    UMat src = *this;  // Fake reference to itself.
-                       // Resolves issue 8693 in case of src == dst.
-    Mat m = getMat(ACCESS_READ);
-    m.convertTo(_dst, _type, alpha, beta);
-}
+//
+// void UMat::convertTo moved to convert.dispatch.cpp
+//

 UMat& UMat::setTo(InputArray _value, InputArray _mask)
 {
--- a/modules/core/src/utils/filesystem.cpp
+++ b/modules/core/src/utils/filesystem.cpp
@ -34,7 +34,7 @@
 #include <errno.h>
 #include <io.h>
 #include <stdio.h>
-#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __GNU__
+#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __GNU__ || defined __EMSCRIPTEN__
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
@ -194,7 +194,7 @@ cv::String getcwd()
    sz = GetCurrentDirectoryA((DWORD)buf.size(), buf.data());
    return cv::String(buf.data(), (size_t)sz);
 #endif
-#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__
+#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __EMSCRIPTEN__
    for(;;)
    {
        char* p = ::getcwd(buf.data(), buf.size());
@ -228,7 +228,7 @@ bool createDirectory(const cv::String& path)
 #else
    int result = _mkdir(path.c_str());
 #endif
-#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__
+#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __EMSCRIPTEN__
    int result = mkdir(path.c_str(), 0777);
 #else
    int result = -1;
@ -343,7 +343,7 @@ private:
    Impl& operator=(const Impl&); // disabled
 };

-#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __GNU__
+#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __GNU__ || defined __EMSCRIPTEN__

 struct FileLock::Impl
 {
--- a/modules/dnn/perf/perf_layer.cpp
+++ b/modules/dnn/perf/perf_layer.cpp
@ -848,6 +848,9 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_NaryEltwise, testing::Values(std::make_tuple
 #ifdef HAVE_CUDA
 INSTANTIATE_TEST_CASE_P(CUDA, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_CUDA, DNN_TARGET_CUDA)));
 #endif
+#ifdef HAVE_VULKAN
+INSTANTIATE_TEST_CASE_P(VULKAN, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_VKCOM, DNN_TARGET_VULKAN)));
+#endif
 INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
 INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNormExpanded, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
 INSTANTIATE_TEST_CASE_P(/**/, Layer_GatherElements, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
--- a/modules/dnn/perf/perf_net.cpp
+++ b/modules/dnn/perf/perf_net.cpp
@ -33,7 +33,7 @@ public:
        weights = findDataFile(weights, false);
        if (!proto.empty())
            proto = findDataFile(proto);
-        net = readNet(proto, weights);
+        net = readNet(weights, proto);
        // Set multiple inputs
        for(auto &inp: inputs){
            net.setInput(std::get<0>(inp), std::get<1>(inp));
@ -214,7 +214,7 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv5) {
    applyTestTag(CV_TEST_TAG_MEMORY_512MB);
    Mat sample = imread(findDataFile("dnn/dog416.png"));
    Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(640, 640), Scalar(), true);
-    processNet("", "dnn/yolov5n.onnx", inp);
+    processNet("dnn/yolov5n.onnx", "", inp);
 }

 PERF_TEST_P_(DNNTestNetwork, YOLOv8)
@ -226,7 +226,7 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv8)

    Mat sample = imread(findDataFile("dnn/dog416.png"));
    Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(640, 640), Scalar(), true);
-    processNet("", "dnn/yolov8n.onnx", inp);
+    processNet("dnn/yolov8n.onnx", "", inp);
 }

 PERF_TEST_P_(DNNTestNetwork, YOLOX) {
@ -236,7 +236,7 @@ PERF_TEST_P_(DNNTestNetwork, YOLOX) {
    );
    Mat sample = imread(findDataFile("dnn/dog416.png"));
    Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(640, 640), Scalar(), true);
-    processNet("", "dnn/yolox_s.onnx", inp);
+    processNet("dnn/yolox_s.onnx", "", inp);
 }

 PERF_TEST_P_(DNNTestNetwork, EAST_text_detection)
@ -250,7 +250,7 @@ PERF_TEST_P_(DNNTestNetwork, FastNeuralStyle_eccv16)
 {
    applyTestTag(CV_TEST_TAG_DEBUG_VERYLONG);

-    processNet("", "dnn/mosaic-9.onnx", cv::Size(224, 224));
+    processNet("dnn/mosaic-9.onnx", "", cv::Size(224, 224));
 }

 PERF_TEST_P_(DNNTestNetwork, Inception_v2_Faster_RCNN)
@ -291,15 +291,15 @@ PERF_TEST_P_(DNNTestNetwork, EfficientNet)
    Mat sample = imread(findDataFile("dnn/dog416.png"));
    Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(224, 224), Scalar(), true);
    transposeND(inp, {0, 2, 3, 1}, inp);
-    processNet("", "dnn/efficientnet-lite4.onnx", inp);
+    processNet("dnn/efficientnet-lite4.onnx", "", inp);
 }

 PERF_TEST_P_(DNNTestNetwork, YuNet) {
-    processNet("", "dnn/onnx/models/yunet-202303.onnx", cv::Size(640, 640));
+    processNet("dnn/onnx/models/yunet-202303.onnx", "", cv::Size(640, 640));
 }

 PERF_TEST_P_(DNNTestNetwork, SFace) {
-    processNet("", "dnn/face_recognition_sface_2021dec.onnx", cv::Size(112, 112));
+    processNet("dnn/face_recognition_sface_2021dec.onnx", "", cv::Size(112, 112));
 }

 PERF_TEST_P_(DNNTestNetwork, MPPalm) {
@ -307,7 +307,7 @@ PERF_TEST_P_(DNNTestNetwork, MPPalm) {
    randu(inp, 0.0f, 1.0f);
    inp = blobFromImage(inp, 1.0, Size(), Scalar(), false);
    transposeND(inp, {0, 2, 3, 1}, inp);
-    processNet("", "dnn/palm_detection_mediapipe_2023feb.onnx", inp);
+    processNet("dnn/palm_detection_mediapipe_2023feb.onnx", "", inp);
 }

 PERF_TEST_P_(DNNTestNetwork, MPHand) {
@ -315,7 +315,7 @@ PERF_TEST_P_(DNNTestNetwork, MPHand) {
    randu(inp, 0.0f, 1.0f);
    inp = blobFromImage(inp, 1.0, Size(), Scalar(), false);
    transposeND(inp, {0, 2, 3, 1}, inp);
-    processNet("", "dnn/handpose_estimation_mediapipe_2023feb.onnx", inp);
+    processNet("dnn/handpose_estimation_mediapipe_2023feb.onnx", "", inp);
 }

 PERF_TEST_P_(DNNTestNetwork, MPPose) {
@ -323,23 +323,23 @@ PERF_TEST_P_(DNNTestNetwork, MPPose) {
    randu(inp, 0.0f, 1.0f);
    inp = blobFromImage(inp, 1.0, Size(), Scalar(), false);
    transposeND(inp, {0, 2, 3, 1}, inp);
-    processNet("", "dnn/pose_estimation_mediapipe_2023mar.onnx", inp);
+    processNet("dnn/pose_estimation_mediapipe_2023mar.onnx", "", inp);
 }

 PERF_TEST_P_(DNNTestNetwork, PPOCRv3) {
    applyTestTag(CV_TEST_TAG_MEMORY_512MB);
-    processNet("", "dnn/onnx/models/PP_OCRv3_DB_text_det.onnx", cv::Size(736, 736));
+    processNet("dnn/onnx/models/PP_OCRv3_DB_text_det.onnx", "", cv::Size(736, 736));
 }

 PERF_TEST_P_(DNNTestNetwork, PPHumanSeg) {
-    processNet("", "dnn/human_segmentation_pphumanseg_2023mar.onnx", cv::Size(192, 192));
+    processNet("dnn/human_segmentation_pphumanseg_2023mar.onnx", "", cv::Size(192, 192));
 }

 PERF_TEST_P_(DNNTestNetwork, CRNN) {
    Mat inp(cv::Size(100, 32), CV_32FC1);
    randu(inp, 0.0f, 1.0f);
    inp = blobFromImage(inp, 1.0, Size(), Scalar(), false);
-    processNet("", "dnn/text_recognition_CRNN_EN_2021sep.onnx", inp);
+    processNet("dnn/text_recognition_CRNN_EN_2021sep.onnx", "", inp);
 }

 PERF_TEST_P_(DNNTestNetwork, VitTrack) {
@ -349,7 +349,7 @@ PERF_TEST_P_(DNNTestNetwork, VitTrack) {
    randu(inp2, 0.0f, 1.0f);
    inp1 = blobFromImage(inp1, 1.0, Size(), Scalar(), false);
    inp2 = blobFromImage(inp2, 1.0, Size(), Scalar(), false);
-    processNet("", "dnn/onnx/models/object_tracking_vittrack_2023sep.onnx", {std::make_tuple(inp1, "template"), std::make_tuple(inp2, "search")});
+    processNet("dnn/onnx/models/object_tracking_vittrack_2023sep.onnx", "", {std::make_tuple(inp1, "template"), std::make_tuple(inp2, "search")});
 }

 PERF_TEST_P_(DNNTestNetwork, EfficientDet_int8)
@ -360,14 +360,14 @@ PERF_TEST_P_(DNNTestNetwork, EfficientDet_int8)
    }
    Mat inp = imread(findDataFile("dnn/dog416.png"));
    inp = blobFromImage(inp, 1.0 / 255.0, Size(320, 320), Scalar(), true);
-    processNet("", "dnn/tflite/coco_efficientdet_lite0_v1_1.0_quant_2021_09_06.tflite", inp);
+    processNet("dnn/tflite/coco_efficientdet_lite0_v1_1.0_quant_2021_09_06.tflite", "", inp);
 }

 PERF_TEST_P_(DNNTestNetwork, VIT_B_32)
 {
    applyTestTag(CV_TEST_TAG_DEBUG_VERYLONG);

-    processNet("", "dnn/onnx/models/vit_b_32.onnx", cv::Size(224, 224));
+    processNet("dnn/onnx/models/vit_b_32.onnx", "", cv::Size(224, 224));
 }

 INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, dnnBackendsAndTargets());
--- a/modules/dnn/src/caffe/caffe_importer.cpp
+++ b/modules/dnn/src/caffe/caffe_importer.cpp
@ -279,8 +279,8 @@ public:
                // Half precision floats.
                CV_Assert(raw_data.size() / 2 == (int)dstBlob.total());

-                Mat halfs((int)shape.size(), &shape[0], CV_16SC1, (void*)raw_data.c_str());
-                convertFp16(halfs, dstBlob);
+                Mat halfs((int)shape.size(), &shape[0], CV_16FC1, (void*)raw_data.c_str());
+                halfs.convertTo(dstBlob, CV_32F);
            }
            else if (pbBlob.raw_data_type() == caffe::FLOAT)
            {
--- a/modules/dnn/src/caffe/caffe_shrinker.cpp
+++ b/modules/dnn/src/caffe/caffe_shrinker.cpp
@ -44,8 +44,8 @@ void shrinkCaffeModel(const String& src, const String& dst, const std::vector<St
            CV_Assert(blob->data_size() != 0);  // float32 array.

            Mat floats(1, blob->data_size(), CV_32FC1, (void*)blob->data().data());
-            Mat halfs(1, blob->data_size(), CV_16SC1);
-            convertFp16(floats, halfs);  // Convert to float16.
+            Mat halfs(1, blob->data_size(), CV_16FC1);
+            floats.convertTo(halfs, CV_16F);  // Convert to float16.

            blob->clear_data();  // Clear float32 data.

--- a/modules/dnn/src/ie_ngraph.cpp
+++ b/modules/dnn/src/ie_ngraph.cpp
@ -502,7 +502,7 @@ void InfEngineNgraphNet::init(Target targetId)
                    size_t total = ngraph::shape_size(constant->get_shape());
                    Mat floats(1, total, CV_32F, (void*)floatsData);
                    Mat halfs;
-                    cv::convertFp16(floats, halfs);
+                    floats.convertTo(halfs, CV_16F);

                    auto new_const = std::make_shared<ngraph::op::Constant>(ngraph::element::f16, constant->get_shape(), halfs.data);
                    new_const->set_friendly_name(constant->get_friendly_name());
--- a/modules/dnn/src/int8layers/quantization_utils.cpp
+++ b/modules/dnn/src/int8layers/quantization_utils.cpp
@ -135,10 +135,10 @@ public:
        inputs_.getUMatVector(inputs);
        outputs_.getUMatVector(outputs);

-        if (inputs_.depth() == CV_16S)
+        if (inputs_.depth() == CV_16F)
        {
            UMat inputFp32;
-            convertFp16(inputs[0], inputFp32);
+            inputs[0].convertTo(inputFp32, CV_32F);
            inputs[0] = inputFp32;  // replace
        }

@ -264,10 +264,7 @@ public:
        UMat outputFp32;
        inputs[0].convertTo(outputFp32, CV_32F, scales[0], -(scales[0]*zeropoints[0]));

-        if (outputs_.depth() == CV_16S)
-            convertFp16(outputFp32, outputs[0]);
-        else
-            outputFp32.copyTo(outputs[0]);
+        outputFp32.convertTo(outputs[0], outputs_.depth());
        return true;
    }
 #endif
--- a/modules/dnn/src/layer.cpp
+++ b/modules/dnn/src/layer.cpp
@ -165,7 +165,7 @@ void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-    if (preferableTarget == DNN_TARGET_OPENCL_FP16 && inputs_arr.depth() == CV_16S)
+    if (preferableTarget == DNN_TARGET_OPENCL_FP16 && inputs_arr.depth() == CV_16F)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;
@ -181,7 +181,7 @@ void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays

        inputs.resize(orig_inputs.size());
        for (size_t i = 0; i < orig_inputs.size(); i++)
-            convertFp16(orig_inputs[i], inputs[i]);
+            orig_inputs[i].convertTo(inputs[i], CV_32F);

        outputs.resize(orig_outputs.size());
        for (size_t i = 0; i < orig_outputs.size(); i++)
@ -194,7 +194,7 @@ void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays
        forward(inputs, outputs, internals);

        for (size_t i = 0; i < outputs.size(); i++)
-            convertFp16(outputs[i], orig_outputs[i]);
+            outputs[i].convertTo(orig_outputs[i], CV_16F);

        // sync results back
        outputs_arr.assign(orig_outputs);
--- a/modules/dnn/src/layer_internals.hpp
+++ b/modules/dnn/src/layer_internals.hpp
@ -146,7 +146,7 @@ struct DataLayer : public Layer
        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                forward_ocl(inputs_arr, outputs_arr, internals_arr))

-        bool isFP16 = outputs_arr.depth() == CV_16S;
+        bool isFP16 = outputs_arr.depth() == CV_16F;

        std::vector<Mat> outputs, internals;
        outputs_arr.getMatVector(outputs);
@ -159,7 +159,7 @@ struct DataLayer : public Layer

            CV_Assert(mean == Scalar() || inputsData[i].size[1] <= 4);
            if (isFP16)
-                CV_CheckTypeEQ(outputs[i].type(), CV_16SC1, "");
+                CV_CheckTypeEQ(outputs[i].type(), CV_16FC1, "");
            else
                CV_CheckTypeEQ(outputs[i].type(), CV_32FC1, "");

@ -175,7 +175,7 @@ struct DataLayer : public Layer
                {
                    Mat input_f32;
                    inputsData[i].convertTo(input_f32, CV_32F, scale, -mean[0] * scale);
-                    convertFp16(input_f32, outputs[i]);
+                    input_f32.convertTo(outputs[i], CV_16F);
                }
                else
                {
@ -194,7 +194,7 @@ struct DataLayer : public Layer
                        {
                            Mat input_f32;
                            inp.convertTo(input_f32, CV_32F, scale, -mean[c] * scale);
-                            convertFp16(input_f32, out);
+                            input_f32.convertTo(out, CV_16F);
                        }
                        else
                        {
@ -209,7 +209,7 @@ struct DataLayer : public Layer
 #ifdef HAVE_OPENCL
    bool forward_ocl(InputArrayOfArrays, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
    {
-        bool isFP16 = outputs_.depth() == CV_16S;
+        bool isFP16 = outputs_.depth() == CV_16F;

        std::vector<UMat> outputs;
        outputs_.getUMatVector(outputs);
@ -223,7 +223,7 @@ struct DataLayer : public Layer

            CV_Assert(mean == Scalar() || inputData.size[1] <= 4);
            if (isFP16)
-                CV_CheckTypeEQ(outputs[i].type(), CV_16SC1, "");
+                CV_CheckTypeEQ(outputs[i].type(), CV_16FC1, "");
            else
                CV_CheckTypeEQ(outputs[i].type(), CV_32FC1, "");

@ -239,7 +239,7 @@ struct DataLayer : public Layer
                {
                    UMat input_i;
                    inputData.convertTo(input_i, CV_32F, scale, -mean[0] * scale);
-                    convertFp16(input_i, outputs[i]);
+                    input_i.convertTo(outputs[i], CV_16F);
                }
                else
                {
@ -263,7 +263,7 @@ struct DataLayer : public Layer
                        {
                            UMat input_i;
                            inp.convertTo(input_i, CV_32F, scale, -mean[c] * scale);
-                            convertFp16(input_i, out);
+                            input_i.convertTo(out, CV_16F);
                        }
                        else
                        {
--- a/modules/dnn/src/layers/attention_layer.cpp
+++ b/modules/dnn/src/layers/attention_layer.cpp
@ -106,7 +106,7 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/batch_norm_layer.cpp
+++ b/modules/dnn/src/layers/batch_norm_layer.cpp
@ -190,7 +190,7 @@ public:
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

-        bool use_half = (inputs_.depth() == CV_16S);
+        bool use_half = (inputs_.depth() == CV_16F);
        inputs_.getUMatVector(inputs);
        outputs_.getUMatVector(outputs);

@ -264,7 +264,7 @@ public:
        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/concat_layer.cpp
+++ b/modules/dnn/src/layers/concat_layer.cpp
@ -163,14 +163,14 @@ public:
            for( i = 0; i < ninputs; i++ )
            {
                Mat& inp = inputs[i];
-                CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S || inp.type() == CV_8S) &&
+                CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16F || inp.type() == CV_8S) &&
                           inp.dims == 4 && inp.size[0] == output.size[0] &&
                           inp.size[2] == output.size[2] &&
                           inp.size[3] == output.size[3] );
                nchannels += inp.size[1];
            }
            CV_Assert( nchannels == output.size[1] );
-            CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S || output.type() == CV_8S) );
+            CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16F || output.type() == CV_8S) );

            cc.chptrs.resize(nchannels*batchsz);

@ -221,7 +221,7 @@ public:
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

-        bool use_half = (inps.depth() == CV_16S);
+        bool use_half = (inps.depth() == CV_16F);
        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

--- a/modules/dnn/src/layers/const_layer.cpp
+++ b/modules/dnn/src/layers/const_layer.cpp
@ -62,12 +62,12 @@ public:
    {
        std::vector<UMat> outputs;
        outs.getUMatVector(outputs);
-        if (outs.depth() == CV_16S) {
+        if (outs.depth() == CV_16F) {
            auto blob = blobs[0];
            if (blob.type() != CV_32F) {
                blob.convertTo(blob, CV_32F);
            }
-            convertFp16(blob, outputs[0]);
+            blob.convertTo(outputs[0], CV_16F);
        }
        else
            blobs[0].convertTo(outputs[0], outputs[0].type());
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@ -139,7 +139,7 @@ public:
        }

        const Mat &input = inputs[0];
-        CV_Assert(((input.dims == 3 && kernel_size.size() == 1) || input.dims == 4 || input.dims == 5) && (input.type() == CV_32F || input.type() == CV_16S));
+        CV_Assert(((input.dims == 3 && kernel_size.size() == 1) || input.dims == 4 || input.dims == 5) && (input.type() == CV_32F || input.type() == CV_16F));
        for (size_t i = 0; i < outputs.size(); i++)
        {
            CV_Assert(inputs[i].type() == input.type());
@ -929,7 +929,7 @@ public:
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

-        bool use_half = (inps.depth() == CV_16S);
+        bool use_half = (inps.depth() == CV_16F);
        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

@ -943,6 +943,7 @@ public:
            umat_blobs.resize(n);
            for (size_t i = 0; i < n; i++)
            {
+                CV_Assert(!use_half);  // TODO: not implemented
                inputs[i + 1].copyTo(umat_blobs[i]);
            }
            inputs.resize(1);
@ -955,7 +956,7 @@ public:
            for (size_t i = 0; i < n; i++)
            {
                if (use_half)
-                    convertFp16(blobs[i], umat_blobs[i]);
+                    blobs[i].convertTo(umat_blobs[i], CV_16F);
                else
                    blobs[i].copyTo(umat_blobs[i]);
            }
@ -1036,7 +1037,7 @@ public:
        if (fusedWeights)
        {
            if (use_half)
-                convertFp16(weightsMat, umat_blobs[0]);
+                weightsMat.convertTo(umat_blobs[0], CV_16F);
            else
                weightsMat.copyTo(umat_blobs[0]);
            fusedWeights = false;
@ -1046,7 +1047,7 @@ public:
            if ( umat_blobs.size() < 2 )
                umat_blobs.resize(2);
            if (use_half)
-                convertFp16(Mat(biasvec, true), umat_blobs[1]);
+                Mat(biasvec, true).convertTo(umat_blobs[1], CV_16F);
            else
                Mat(biasvec, true).copyTo(umat_blobs[1]);
            convolutionOp->setBias(true);
@ -1109,7 +1110,7 @@ public:
        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
@ -1789,7 +1790,7 @@ public:
        std::vector<UMat> outputs;
        std::vector<UMat> internals;

-        if (inputs_.depth() == CV_16S)
+        if (inputs_.depth() == CV_16F)
            return false;

        inputs_.getUMatVector(inputs);
@ -1896,7 +1897,7 @@ public:
        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr));

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.simd.hpp
+++ b/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.simd.hpp
@ -313,8 +313,7 @@ static inline void fast_gemm12x16_f32(int k, const char *a_, const char *b_, cha
    const float* b = (const float*)b_;
    float* c = (float*)c_;

-    __m256i dummy;
-    __m256 s00  = (__m256)__lasx_xvxor_v(dummy, dummy), s01  = s00,
+    __m256 s00  = _v256_setall_ps(0), s01  = s00,
           s10  = s00, s11  = s00,
           s20  = s00, s21  = s00,
           s30  = s00, s31  = s00,
--- a/modules/dnn/src/layers/crop_and_resize_layer.cpp
+++ b/modules/dnn/src/layers/crop_and_resize_layer.cpp
@ -55,7 +55,7 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/cumsum_layer.cpp
+++ b/modules/dnn/src/layers/cumsum_layer.cpp
@ -37,7 +37,7 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/detection_output_layer.cpp
+++ b/modules/dnn/src/layers/detection_output_layer.cpp
@ -337,7 +337,7 @@ public:
        std::vector<UMat> outputs;
        outs.getUMatVector(outputs);

-        bool use_half = (inps.depth() == CV_16S);
+        bool use_half = (inps.depth() == CV_16F);
        if (use_half)
        {
            std::vector<UMat> orig_inputs;
@ -345,7 +345,7 @@ public:

            inputs.resize(orig_inputs.size());
            for (size_t i = 0; i < orig_inputs.size(); i++)
-                convertFp16(orig_inputs[i], inputs[i]);
+                orig_inputs[i].convertTo(inputs[i], CV_32F);
        }
        else
        {
@ -410,7 +410,7 @@ public:
        if (use_half)
        {
            UMat half_umat;
-            convertFp16(umat, half_umat);
+            umat.convertTo(half_umat, CV_16F);
            outs.assign(std::vector<UMat>(1, half_umat));
        }

@ -428,7 +428,7 @@ public:
            CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                       forward_ocl(inputs_arr, outputs_arr, internals_arr))
        }
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/einsum_layer.cpp
+++ b/modules/dnn/src/layers/einsum_layer.cpp
@ -454,7 +454,7 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
@ -210,7 +210,7 @@ public:
        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(this->preferableTarget),
                   func.applyOCL(inputs_arr, outputs_arr, internals_arr))

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/eltwise_layer.cpp
+++ b/modules/dnn/src/layers/eltwise_layer.cpp
@ -587,7 +587,7 @@ public:
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

-        if ((inputs_.depth() == CV_16S && op != SUM) || (channelsMode != ELTWISE_CHANNNELS_SAME))
+        if ((inputs_.depth() == CV_16F && op != SUM) || (channelsMode != ELTWISE_CHANNNELS_SAME))
            return false;

        if (hasVecInput)
@ -607,7 +607,7 @@ public:
                        size_t localsize[] = { 128 };
                        size_t globalsize[] = { (size_t)channels / 4 * localsize[0] };
                        String opts;
-                        if (inputs_.depth() == CV_16S)
+                        if (inputs_.depth() == CV_16F)
                            opts = " -DDtype=half -DDtype4=half4 -DDtype8=half8";
                        else
                            opts = " -DDtype=float -DDtype4=float4 -DDtype8=float8";
@ -633,7 +633,7 @@ public:
                    }
                    else
                    {
-                        if (inputs_.depth() == CV_16S)
+                        if (inputs_.depth() == CV_16F)
                            return false;

                        float coeff1 = coeffs.empty() ? 1.f : coeffs[0];
@ -686,7 +686,7 @@ public:
        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/expand_layer.cpp
+++ b/modules/dnn/src/layers/expand_layer.cpp
@ -105,7 +105,7 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@ -355,7 +355,7 @@ public:
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

-        bool use_half = (inps.depth() == CV_16S);
+        bool use_half = (inps.depth() == CV_16F);
        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

@ -383,9 +383,9 @@ public:

                if (use_half)
                {
-                    convertFp16(A, A_fp32);
-                    convertFp16(B, B_fp32);
-                    convertFp16(C, C_fp32);
+                    A.convertTo(A_fp32, CV_32F);
+                    B.convertTo(B_fp32, CV_32F);
+                    C.convertTo(C_fp32, CV_32F);
                }
                else
                {
@ -396,9 +396,9 @@ public:
                cv::gemm(A_fp32, B_fp32, 1, noArray(), 0, C_fp32);
                if (use_half)
                {
-                    convertFp16(A_fp32, A);
-                    convertFp16(B_fp32, B);
-                    convertFp16(C_fp32, C);
+                    A_fp32.convertTo(A, CV_16F);
+                    B_fp32.convertTo(B, CV_16F);
+                    C_fp32.convertTo(C, CV_16F);
                }
            }
            return true;
@ -429,7 +429,7 @@ public:
                for (int i = 0; i < umat_blobs.size(); i++)
                {
                    if (!umat_blobs[i].empty())
-                        convertFp16(umat_blobs[i], half_blobs[i]);
+                        umat_blobs[i].convertTo(half_blobs[i], CV_16F);
                }
            }

@ -470,8 +470,8 @@ public:

            if (use_half)
            {
-                convertFp16(srcMat, srcMat_fp32);
-                convertFp16(dstMat, dstMat_fp32);
+                srcMat.convertTo(srcMat_fp32, CV_32F);
+                dstMat.convertTo(dstMat_fp32, CV_32F);
            }
            else
            {
@ -489,8 +489,8 @@ public:
            }
            if (use_half)
            {
-                convertFp16(srcMat_fp32, srcMat);
-                convertFp16(dstMat_fp32, dstMat);
+                srcMat_fp32.convertTo(srcMat, CV_16F);
+                dstMat_fp32.convertTo(dstMat, CV_16F);
            }
        }

@ -506,7 +506,7 @@ public:
        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && !isMatMul,
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/gather_elements_layer.cpp
+++ b/modules/dnn/src/layers/gather_elements_layer.cpp
@ -70,7 +70,7 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/gather_layer.cpp
+++ b/modules/dnn/src/layers/gather_layer.cpp
@ -57,12 +57,12 @@ public:
        const Mat& inp = inputs[0];

        int indicesType = inputs[1].type();
-        CV_CheckType(indicesType, indicesType == CV_32FC1 || indicesType == CV_16SC1, "");
+        CV_CheckType(indicesType, indicesType == CV_32FC1 || indicesType == CV_16FC1, "");
        Mat indices32S;
-        if (indicesType == CV_16S/*FP16*/)
+        if (indicesType == CV_16F/*FP16*/)
        {
            Mat indicesF32;
-            convertFp16(inputs[1], indicesF32);
+            inputs[1].convertTo(indicesF32, CV_32F);
            indicesF32.convertTo(indices32S, CV_32S);
        }
        else
--- a/modules/dnn/src/layers/gemm_layer.cpp
+++ b/modules/dnn/src/layers/gemm_layer.cpp
@ -172,7 +172,7 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/group_norm_layer.cpp
+++ b/modules/dnn/src/layers/group_norm_layer.cpp
@ -60,7 +60,7 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-        if (inputs_arr.depth() == CV_16S) {
+        if (inputs_arr.depth() == CV_16F) {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }
@ -95,7 +95,7 @@ public:
        float inv_norm_size = 1.f / norm_size;

        // no fp16 support
-        if (input.depth() == CV_16S) {
+        if (input.depth() == CV_16F) {
            return false;
        }

--- a/modules/dnn/src/layers/instance_norm_layer.cpp
+++ b/modules/dnn/src/layers/instance_norm_layer.cpp
@ -73,7 +73,7 @@ public:
        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
@ -107,7 +107,7 @@ public:
        float inv_norm_size = 1.f / norm_size;

        // no fp16 support
-        if (input.depth() == CV_16S) {
+        if (input.depth() == CV_16F) {
            return false;
        }

--- a/modules/dnn/src/layers/layer_norm.cpp
+++ b/modules/dnn/src/layers/layer_norm.cpp
@ -99,7 +99,7 @@ public:
        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
@ -140,7 +140,7 @@ public:
        const auto &bias = inputs.size() == 3 ? inputs[2] : UMat::zeros(norm_size, 1, CV_32F);

        // no fp16 support
-        if (input.depth() == CV_16S) {
+        if (input.depth() == CV_16F) {
            return false;
        }

--- a/modules/dnn/src/layers/lrn_layer.cpp
+++ b/modules/dnn/src/layers/lrn_layer.cpp
@ -119,7 +119,7 @@ public:
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

-        bool use_half = (inps.depth() == CV_16S);
+        bool use_half = (inps.depth() == CV_16F);
        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

@ -164,7 +164,7 @@ public:
        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/matmul_layer.cpp
+++ b/modules/dnn/src/layers/matmul_layer.cpp
@ -119,7 +119,7 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
@ -154,7 +154,7 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

-        bool use_half = (inputs_arr.depth() == CV_16S);
+        bool use_half = (inputs_arr.depth() == CV_16F);
        inputs_arr.getUMatVector(inputs);
        outputs_arr.getUMatVector(outputs);

@ -192,9 +192,9 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
            }

            if (use_half) {
-                convertFp16(A, A_fp32);
-                convertFp16(B, B_fp32);
-                convertFp16(C, C_fp32);
+                A.convertTo(A_fp32, CV_32F);
+                B.convertTo(B_fp32, CV_32F);
+                C.convertTo(C_fp32, CV_32F);
            } else {
                A_fp32 = A;
                B_fp32 = B;
@ -203,9 +203,9 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {

            cv::gemm(A_fp32, B_fp32, 1.f, noArray(), 0.f, C_fp32);
            if (use_half) {
-                convertFp16(A_fp32, A);
-                convertFp16(B_fp32, B);
-                convertFp16(C_fp32, C);
+                A_fp32.convertTo(A, CV_16F);
+                B_fp32.convertTo(B, CV_16F);
+                C_fp32.convertTo(C, CV_16F);
            }
        }
        return true;
--- a/modules/dnn/src/layers/max_unpooling_layer.cpp
+++ b/modules/dnn/src/layers/max_unpooling_layer.cpp
@ -73,7 +73,7 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/mvn_layer.cpp
+++ b/modules/dnn/src/layers/mvn_layer.cpp
@ -149,7 +149,7 @@ public:
        UMat& bnorm_bias = umat_shift;

        const unsigned LOCAL_SIZE = 128;
-        bool use_half = (inputs[0].depth() == CV_16S);
+        bool use_half = (inputs[0].depth() == CV_16F);
        String opts = format(" -DT=%s -DT4=%s -Dconvert_T=%s -DLOCAL_SIZE=%u", use_half ? "half" : "float",
                             use_half ? "half4" : "float4", use_half ? "convert_half4" : "convert_float4",
                             LOCAL_SIZE
@ -164,7 +164,7 @@ public:
            CV_Assert(newRows != 0);

            MatShape s = shape(newRows, inpMat.total() / newRows);
-            UMat meanMat = UMat(s[0], 1, (use_half) ? CV_16S : CV_32F);
+            UMat meanMat = UMat(s[0], 1, (use_half) ? CV_16F : CV_32F);
            UMat tmpMat  = UMat(s[0], s[1], CV_32F);
            float alpha = 1.0f / s[1];

@ -226,7 +226,7 @@ public:
        if (normVariance && (row_size % 4 == 0) && (plane_size % 4 == 0))
            return fast_forward_ocl(inputs, outputs);

-        if (inputs[0].depth() == CV_16S)
+        if (inputs[0].depth() == CV_16F)
            return false;

        String opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4");
@ -309,7 +309,7 @@ public:
        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/nary_eltwise_layers.cpp
+++ b/modules/dnn/src/layers/nary_eltwise_layers.cpp
@ -7,6 +7,7 @@
 #include "../op_cuda.hpp"
 #include "../op_cann.hpp"
 #include "../ie_ngraph.hpp"
+#include "../op_vkcom.hpp"

 #include <opencv2/dnn/shape_utils.hpp>

@ -34,8 +35,163 @@ static int _mod(int x, int y) {
 }
 }

+class NaryEltwiseHelper CV_FINAL
+{
+public:
+    int ninputs;
+    int narrays;
+    int max_ndims;
+    std::vector<int> all_ndims;
+    std::vector<std::vector<int>> orig_shapes;
+    std::vector<std::vector<size_t>> orig_steps;
+    std::vector<char*> ptrs;
+    std::vector<std::vector<int>> shapes;
+    std::vector<std::vector<size_t>> steps;
+    std::vector<size_t> elemsize;
+
+    NaryEltwiseHelper() {
+    }
+
+    void init(const std::vector<Mat>& inputs, const std::vector<Mat>& outputs)
+    {
+        narrays = 0;
+        max_ndims = 0;
+        all_ndims.clear();
+        orig_shapes.clear();
+        orig_steps.clear();
+        ptrs.clear();
+        shapes.clear();
+        steps.clear();
+        elemsize.clear();
+
+        ninputs = inputs.size();
+        narrays = ninputs + 1;
+
+        // collect ndims
+        std::vector<int> v_inp_dims;
+        std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_dims), [] (const Mat& m) { return m.dims; });
+        const int* inp_ndims = v_inp_dims.data();
+        int out_ndims = outputs[0].dims;
+
+        // find max ndims for broadcasting
+        int i;
+        max_ndims = out_ndims > 2 ? out_ndims : 2;
+        for(i = 0; i < ninputs; i++)
+            max_ndims = max_ndims > inp_ndims[i] ? max_ndims : inp_ndims[i];
+
+        shapes = std::vector<std::vector<int>>(narrays, std::vector<int>(max_ndims, 0));
+        steps = std::vector<std::vector<size_t>>(narrays, std::vector<size_t>(max_ndims, 0));
+        ptrs = std::vector<char*>(narrays, nullptr);
+
+        for(i = 0; i <= ninputs; i++) {
+            all_ndims.push_back(i == 0 ? out_ndims : inp_ndims[i-1]);
+            std::vector<int> _size;
+            std::vector<size_t> _step;
+            if (!i) {
+                std::transform(outputs[0].size.p, outputs[0].size.p + outputs[0].dims, std::back_inserter(_size), [](int s) { return s; });
+                std::transform(outputs[0].step.p, outputs[0].step.p + outputs[0].dims, std::back_inserter(_step), [](size_t s) { return s; });
+            }
+            else {
+                std::transform(inputs[i-1].size.p, inputs[i-1].size.p + inputs[i-1].dims, std::back_inserter(_size), [](int s) { return s; });
+                std::transform(inputs[i-1].step.p, inputs[i-1].step.p + inputs[i-1].dims, std::back_inserter(_step), [](size_t s) { return s; });
+            }
+            orig_shapes.push_back(_size);
+            orig_steps.push_back(_step);
+
+            int esz = i == 0 ? outputs[0].elemSize() : inputs[i - 1].elemSize();
+            elemsize.push_back(esz);
+        }
+    }
+
+    void reInit(size_t newElemSize) {
+        std::vector<size_t> newElemSizes(elemsize.size(), newElemSize);
+        reInit(newElemSizes);
+    }
+
+    void reInit(std::vector<size_t> newElemSizes) {
+        for (size_t array_index = 0; array_index < orig_steps.size(); array_index++) {
+            auto &step = orig_steps[array_index];
+            int esz = elemsize[array_index];
+            int new_esz = newElemSizes[array_index];
+            for (size_t step_index = 0; step_index < step.size(); step_index++) {
+                step[step_index] = static_cast<size_t>(step[step_index] / esz * new_esz);
+            }
+            elemsize[array_index] = newElemSizes[array_index];
+        }
+        prepare_for_broadcast_op();
+    }
+
+    bool prepare_for_broadcast_op()
+    {
+        int i, j, k;
+
+        // step 1.
+        // * make all inputs and the output max_ndims-dimensional.
+        // ** prepend dimension 1 to the mat of less dims
+        // * compute proper step's
+        for (i = this->max_ndims-1; i >= 0; i--) {
+            for (k = 0; k < this->narrays; k++) {
+                j = this->all_ndims[k] - (this->max_ndims - i);
+                int sz_i = j >= 0 ? this->orig_shapes[k][j] : 1;
+                size_t st_i = j >= 0 && this->orig_steps[k][j] > 0 ? this->orig_steps[k][j] :
+                    i == this->max_ndims-1 ? elemsize[k] : this->steps[k][i+1]*this->shapes[k][i+1];
+                assert(st_i % elemsize[k] == 0);
+                this->shapes[k][i] = sz_i;
+                this->steps[k][i] = st_i;
+                if (this->shapes[k][i] == 0)
+                    return false;
+            }
+        }
+
+        // step 3. Let's do the flattening first,
+        // since we'd need proper values of steps to check continuity.
+        // this loop is probably the most tricky part
+        // in the whole implementation of broadcasting.
+        j = this->max_ndims > 0 ? this->max_ndims-1 : 0;
+        for (i = j - 1; i >= 0; i--) {
+            bool all_contiguous = true, all_scalars = true, all_consistent = true;
+            for(k = 0; k < this->narrays; k++) {
+                size_t st = this->steps[k][j]*this->shapes[k][j];
+                bool prev_scalar = this->shapes[k][j] == 1;
+                bool scalar = this->shapes[k][i] == 1;
+                all_contiguous = all_contiguous && (st == this->steps[k][i]);
+                all_scalars = all_scalars && scalar;
+                all_consistent = all_consistent && (scalar == prev_scalar);
+            }
+            if (all_contiguous && (all_consistent || all_scalars)) {
+                for(k = 0; k < this->narrays; k++)
+                    this->shapes[k][j] *= this->shapes[k][i];
+            } else {
+                j--;
+                if (i < j) {
+                    for(k = 0; k < this->narrays; k++) {
+                        this->shapes[k][j] = this->shapes[k][i];
+                        this->steps[k][j] = this->steps[k][i];
+                    }
+                }
+            }
+        }
+
+        // step 2. Set some step's to 0's.
+        for (i = this->max_ndims-1; i >= j; i--) {
+            for (k = 0; k < this->narrays; k++)
+                this->steps[k][i] = this->shapes[k][i] == 1 ? 0 : this->steps[k][i];
+        }
+        if (this->max_ndims == 0)
+            i = 0;
+        for (; i >= 0; i--) {
+            for (k = 0; k < this->narrays; k++) {
+                this->steps[k][i] = 0;
+                this->shapes[k][i] = 1;
+            }
+        }
+        return true;
+    }
+};
+
 class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
 {
+    NaryEltwiseHelper helper;
 public:
    enum class OPERATION
    {
@ -130,6 +286,13 @@ public:
                    op == OPERATION::MOD ||
                    op == OPERATION::FMOD
            );
+
+#ifdef HAVE_VULKAN
+        if (backendId == DNN_BACKEND_VKCOM)
+            return op == OPERATION::ADD || op == OPERATION::PROD || op == OPERATION::SUB ||
+                   op == OPERATION::DIV ;
+#endif
+
        if (backendId == DNN_BACKEND_CUDA) {
            return op == OPERATION::MAX  || op == OPERATION::MIN  || op == OPERATION::SUM ||
                   op == OPERATION::PROD || op == OPERATION::DIV  || op == OPERATION::ADD ||
@ -166,74 +329,14 @@ public:
        return outShape;
    }

-    static bool prepare_for_broadcast_op(
-        int narrays, int max_ndims, const size_t* elemsize,
-        const int* ndims, const int** shape_, const size_t** step_,
-        int** shape, size_t** step)
-    {
-        int i, j, k;

-        // step 1.
-        // * make all inputs and the output max_ndims-dimensional.
-        // ** prepend dimension 1 to the mat of less dims
-        // * compute proper step's
-        for (i = max_ndims-1; i >= 0; i-- ) {
-            for (k = 0; k < narrays; k++) {
-                j = ndims[k] - (max_ndims - i);
-                int sz_i = j >= 0 ? shape_[k][j] : 1;
-                size_t st_i = j >= 0 && step_ && step_[k] && step_[k][j] > 0 ? step_[k][j] :
-                    i == max_ndims-1 ? elemsize[k] : step[k][i+1]*shape[k][i+1];
-                assert(st_i % elemsize[k] == 0);
-                shape[k][i] = sz_i;
-                step[k][i] = st_i;
-                if (shape[k][i] == 0)
-                    return false;
-            }
-        }
+    virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE {
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);

-        // step 3. Let's do the flattening first,
-        // since we'd need proper values of steps to check continuity.
-        // this loop is probably the most tricky part
-        // in the whole implementation of broadcasting.
-        j = max_ndims > 0 ? max_ndims-1 : 0;
-        for (i = j - 1; i >= 0; i--) {
-            bool all_contiguous = true, all_scalars = true, all_consistent = true;
-            for(k = 0; k < narrays; k++) {
-                size_t st = step[k][j]*shape[k][j];
-                bool prev_scalar = shape[k][j] == 1;
-                bool scalar = shape[k][i] == 1;
-                all_contiguous = all_contiguous && (st == step[k][i]);
-                all_scalars = all_scalars && scalar;
-                all_consistent = all_consistent && (scalar == prev_scalar);
-            }
-            if (all_contiguous && (all_consistent || all_scalars)) {
-                for(k = 0; k < narrays; k++)
-                    shape[k][j] *= shape[k][i];
-            } else {
-                j--;
-                if (i < j) {
-                    for(k = 0; k < narrays; k++) {
-                        shape[k][j] = shape[k][i];
-                        step[k][j] = step[k][i];
-                    }
-                }
-            }
-        }
-
-        // step 2. Set some step's to 0's.
-        for (i = max_ndims-1; i >= j; i--) {
-            for (k = 0; k < narrays; k++)
-                step[k][i] = shape[k][i] == 1 ? 0 : step[k][i];
-        }
-        if (max_ndims == 0)
-            i = 0;
-        for (; i >= 0; i--) {
-            for (k = 0; k < narrays; k++) {
-                step[k][i] = 0;
-                shape[k][i] = 1;
-            }
-        }
-        return true;
+        helper.init(inputs, outputs);
+        CV_Assert(helper.prepare_for_broadcast_op());
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
@ -248,10 +351,10 @@ public:

    template <typename T, typename Functor>
    void binary_forward_impl(
-            int ndims, const int* shape,
-            const char* data1, const size_t* step1,
-            const char* data2, const size_t* step2,
-            char* data, const size_t* step,
+            int ndims, const std::vector<int>& shape,
+            const char* data1, const std::vector<size_t>& step1,
+            const char* data2, const std::vector<size_t>& step2,
+            char* data, const std::vector<size_t>& step,
            const Functor& op)
    {
        size_t dp1 = 0, dp2 = 0, dp = 0;
@ -320,52 +423,18 @@ public:
        const Mat& a = inputs[0];
        const Mat& b = inputs[1];
        Mat& out = outputs[0];
-
-        // collect info of inputs and output
-        const int* in_shape[] = {a.size.p, b.size.p};
-        const size_t* in_step[] = {a.step.p, b.step.p};
-        const int* out_shape = out.size.p;
-        const size_t* out_step = out.step.p;
-        const int in_ndims[] = {a.dims, b.dims};
-        int out_ndims = out.dims;
-
-        int max_ndims = std::max(a.dims, std::max(b.dims, out.dims));
-
-        const int* orig_shapes[3];
-        int shapes_[3][CV_MAX_DIM];
-        int* shapes[] = {shapes_[0], shapes_[1], shapes_[2]};
-        const size_t* orig_steps[3];
-        size_t steps_[3][CV_MAX_DIM];
-        size_t* steps[] = {steps_[0], steps_[1], steps_[2]};
-        int all_ndims[3];
-        size_t all_type_sizes[3];
-
-        // assign orig_shapes, shapes, orig_steps, steps, all_ndims, all_type_sizes
-        for (int i = 0; i < 3; i++)
-        {
-            orig_shapes[i] = (const int*)(i == 0 ? out_shape : in_shape[i-1]);
-            orig_steps[i] = (size_t*)(i == 0 ? out_step : in_step[i-1]);
-            all_ndims[i] = i == 0 ? out_ndims : in_ndims[i-1];
-            all_type_sizes[i] = sizeof(T);
-        }
-
-        if (!prepare_for_broadcast_op(3, max_ndims, all_type_sizes,
-                                      all_ndims, (const int**)orig_shapes,
-                                      (const size_t**)orig_steps,
-                                      shapes, steps))
-            return;
-
+        CV_Assert(helper.shapes.size() == 3 && helper.steps.size() == 3);
        binary_forward_impl<T, Functor>(
-                max_ndims, shapes[0], a.ptr<char>(), steps[1],
-                b.ptr<char>(), steps[2], out.ptr<char>(), steps[0],
+                helper.max_ndims, helper.shapes[0], a.ptr<char>(), helper.steps[1],
+                b.ptr<char>(), helper.steps[2], out.ptr<char>(), helper.steps[0],
                f);
    }

    template<typename T, typename Functor>
    void nary_forward_impl(
-        const Functor& f, const T scale, int ninputs, int ndims, const int* shape,
+        const Functor& f, const T scale, int ninputs, int ndims, const std::vector<int>& shape,
        const char** inp, char* out,
-        const size_t** steps, char** ptrs)
+        const std::vector<std::vector<size_t>>& steps, std::vector<char*>& ptrs)
    {
        CV_Assert(ndims >= 2);
        size_t dp = steps[0][ndims-1]/sizeof(T);
@ -450,77 +519,16 @@ public:
        const std::vector<Mat>& inputs, std::vector<Mat>& outputs
        )
    {
-        int ninputs = inputs.size();
-
-        // collect all input
+        // collect all input info
        std::vector<const char*> v_inp;
        std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp), [] (const Mat& m) { return m.template ptr<const char>(); });
        const char** inp = v_inp.data();

-        // collect ndims of all input
-        std::vector<int> v_inp_dims;
-        std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_dims), [] (const Mat& m) { return m.dims; });
-        const int* inp_ndims = v_inp_dims.data();
-
-        // collect shapes of all input
-        std::vector<const int*> v_inp_shape;
-        std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_shape), [] (const Mat& m) { return m.size.p; });
-        const int** inp_shape = v_inp_shape.data();
-
-        // collect steps of all input
-        std::vector<const size_t*> v_inp_step;
-        std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_step), [] (const Mat& m) { return m.step.p; });
-        const size_t** inp_step = v_inp_step.data();
-
-        // collect info of output (ndims, shape, step)
+        // collect output info
        char* out = outputs[0].ptr<char>();
-        int out_ndims = outputs[0].dims;
-        const int* out_shape = outputs[0].size.p;
-        const size_t* out_step = outputs[0].step.p;
-
-        // find max ndims for broadcasting
-        int i, max_ndims = out_ndims > 2 ? out_ndims : 2;
-        for(i = 0; i < ninputs; i++)
-            max_ndims = max_ndims > inp_ndims[i] ? max_ndims : inp_ndims[i];
-
-        // buf holds the following buffers for inputs & output:
-        //  * orig_shapes, shapes (result_shape), orig_steps, steps (result_step), (ninputs+1)*4 elements in total
-        //  * ptrs, (ninputs+1)*1 elements in total
-        //  * shape_buf & step_buf, (ninputs+1)*2*max_ndims elements in total
-        //  * all_ndims, (ninputs+1)*1 elements in total
-        //  * all_type_sizes, (ninputs+1)*1 elements in total
-        AutoBuffer<size_t> buf((ninputs + 1) * (2 * max_ndims + 7));
-
-        int** orig_shapes = (int**)buf.data();
-        int** shapes = orig_shapes + ninputs + 1;
-        size_t** orig_steps = (size_t**)(shapes + ninputs + 1);
-        size_t** steps = orig_steps + ninputs + 1;
-
-        char** ptrs = (char**)(steps + ninputs + 1);
-
-        size_t* step_buf = (size_t*)(ptrs + ninputs + 1);
-        int* shape_buf = (int*)(step_buf + (ninputs + 1)*max_ndims);
-
-        int* all_ndims = shape_buf + (ninputs + 1)*max_ndims;
-        size_t* all_type_sizes = (size_t*)(all_ndims + ninputs + 1);
-
-        for(i = 0; i <= ninputs; i++) {
-            all_ndims[i] = i == 0 ? out_ndims : inp_ndims[i-1];
-            all_type_sizes[i] = sizeof(T);
-            orig_shapes[i] = (int*)(i == 0 ? out_shape : inp_shape ? inp_shape[i-1] : 0);
-            orig_steps[i] = (size_t*)(i == 0 ? out_step : inp_step ? inp_step[i-1] : 0);
-            shapes[i] = shape_buf + max_ndims*i;
-            steps[i] = step_buf + max_ndims*i;
-        }
-
-        if (!prepare_for_broadcast_op(ninputs + 1, max_ndims, all_type_sizes,
-                                      all_ndims, (const int**)orig_shapes,
-                                      (const size_t**)orig_steps,
-                                      shapes, steps))
-            return;

        nary_forward_impl<T>(
-                f, scale, ninputs, max_ndims, shapes[0], inp, out, (const size_t **) steps, ptrs);
+                f, scale, helper.ninputs, helper.max_ndims, helper.shapes[0], inp, out, helper.steps, helper.ptrs);
    }

    template <typename T, typename Functor>
@ -531,59 +539,21 @@ public:
        const Mat& c = inputs[2];
        Mat& out = outputs[0];

-        // collect info of inputs and output
-        const int* in_shape[] = {a.size.p, b.size.p, c.size.p};
-        const size_t* in_step[] = {a.step.p, b.step.p, c.step.p};
-        const int* out_shape = out.size.p;
-        const size_t* out_step = out.step.p;
-        const int in_ndims[] = {a.dims, b.dims, c.dims};
-        int out_ndims = out.dims;
-
-        int max_ndims = std::max(a.dims, std::max(b.dims, std::max(c.dims, out.dims)));
-
-        AutoBuffer<size_t> buf(4 * (2 * max_ndims + 6));
-
-        int** orig_shapes = (int**)(buf.data());
-        int** shapes = orig_shapes + 4;
-        size_t** orig_steps = (size_t**)(shapes + 4);
-        size_t** steps = orig_steps + 4;
-
-        int* shape_buf = (int*)(steps + 4);
-        size_t* step_buf = (size_t*)(shape_buf + 4 * max_ndims);
-
-        int* all_ndims = (int*)(step_buf + 4 * max_ndims);
-        size_t* all_type_sizes = (size_t*)(all_ndims + 4);
-
-        // assign orig_shapes, shapes, orig_steps, steps, all_ndims, all_type_sizes
-        for (int i = 0; i < 4; i++)
-        {
-            orig_shapes[i] = (int*)(i == 0 ? out_shape : in_shape[i-1]);
-            orig_steps[i] = (size_t*)(i == 0 ? out_step : in_step[i-1]);
-            shapes[i] = shape_buf + i * max_ndims;
-            steps[i] = step_buf + i * max_ndims;
-            all_ndims[i] = i == 0 ? out_ndims : in_ndims[i-1];
-            all_type_sizes[i] = sizeof(T);
-        }
-
-        if (!prepare_for_broadcast_op(4, max_ndims, all_type_sizes,
-                                      all_ndims, (const int**)orig_shapes,
-                                      (const size_t**)orig_steps,
-                                      shapes, steps))
-            return;
+        CV_Assert(helper.shapes.size() == 4 && helper.steps.size() == 4);

        trinary_forward_impl<T, Functor>(
-                max_ndims, shapes[0], a.ptr<char>(), steps[1], b.ptr<char>(), steps[2],
-                c.ptr<char>(), steps[3], out.ptr<char>(), steps[0],
+                helper.max_ndims, helper.shapes[0], a.ptr<char>(), helper.steps[1], b.ptr<char>(), helper.steps[2],
+                c.ptr<char>(), helper.steps[3], out.ptr<char>(), helper.steps[0],
                f);
    }

    template <typename T, typename Functor>
    void trinary_forward_impl(
-            int ndims, const int* shape,
-            const char* data1, const size_t* step1,
-            const char* data2, const size_t* step2,
-            const char* data3, const size_t* step3,
-            char* data, const size_t* step,
+            int ndims, const std::vector<int>& shape,
+            const char* data1, const std::vector<size_t>& step1,
+            const char* data2, const std::vector<size_t>& step2,
+            const char* data3, const std::vector<size_t>& step3,
+            char* data, const std::vector<size_t>& step,
            const Functor& op)
    {
        assert(ndims >= 2);
@ -642,8 +612,9 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
+            helper.reInit(sizeof(float));
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }
@ -798,9 +769,13 @@ public:
        switch (type)
        {
            case CV_8U:
+                // TODO: integrate with type inference
+                helper.reInit(sizeof(uint8_t));
                opDispatch<uint8_t>(std::forward<Args>(args)...);
                break;
            case CV_32S:
+                // TODO: integrate with type inference
+                helper.reInit(sizeof(int32_t));
                opDispatch<int32_t>(std::forward<Args>(args)...);
                break;
            case CV_32F:
@ -958,6 +933,16 @@ public:
        return Ptr<BackendNode>(new InfEngineNgraphNode(node));
    }
 #endif
+
+#ifdef HAVE_VULKAN
+    virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs,
+                                       std::vector<Ptr<BackendWrapper> > &outputs) CV_OVERRIDE
+    {
+        Ptr<vkcom::OpBase> op = makePtr<vkcom::OpNary>((vkcom::OpNary::OPERATION) this->op, helper.ninputs, helper.max_ndims, helper.shapes, helper.steps);
+        return Ptr<BackendNode>(makePtr<VkComBackendNode>(inputs, op, outputs));
+    }
+#endif
+
 };

 Ptr<NaryEltwiseLayer> NaryEltwiseLayer::create(const LayerParams& params)
--- a/modules/dnn/src/layers/normalize_bbox_layer.cpp
+++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp
@ -112,7 +112,7 @@ public:
        std::vector<UMat> outputs;
        std::vector<UMat> internals;

-        if (inputs_.depth() == CV_16S)
+        if (inputs_.depth() == CV_16F)
            return false;

        inputs_.getUMatVector(inputs);
@ -193,7 +193,7 @@ public:
        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/padding_layer.cpp
+++ b/modules/dnn/src/layers/padding_layer.cpp
@ -127,17 +127,7 @@ public:

        if (paddingType == "constant")
        {
-            if (inputs_arr.depth() == CV_16S)
-            {
-                std::vector<float> paddingValue_fp32(1, paddingValue);
-                std::vector<int16_t> paddingValue_fp16(1);
-                cv::convertFp16(paddingValue_fp32, paddingValue_fp16);
-                outputs[0].setTo(paddingValue_fp16[0]);
-            }
-            else if (inputs_arr.depth() == CV_8S)
-                outputs[0].setTo(saturate_cast<int8_t>(paddingValue));
-            else
-                outputs[0].setTo(paddingValue);
+            outputs[0].setTo(paddingValue);
            inputs[0].copyTo(outputs[0](dstRanges));
        }
        else if (paddingType == "reflect" || paddingType == "edge")
--- a/modules/dnn/src/layers/permute_layer.cpp
+++ b/modules/dnn/src/layers/permute_layer.cpp
@ -319,7 +319,7 @@ public:
            mnew_stride.copyTo(unew_stride);
        }

-        bool use_half = (inps.depth() == CV_16S);
+        bool use_half = (inps.depth() == CV_16F);
        String opts = format("-DDtype=%s", use_half ? "half" : "float");
        for (size_t i = 0; i < inputs.size(); i++)
        {
@ -350,7 +350,7 @@ public:
                   inputs_arr.depth() != CV_8S,
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
@ -278,7 +278,7 @@ public:
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

-        bool use_half = (inps.depth() == CV_16S);
+        bool use_half = (inps.depth() == CV_16F);
        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

@ -338,7 +338,7 @@ public:
            CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                       forward_ocl(inputs_arr, outputs_arr, internals_arr))
        }
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/prior_box_layer.cpp
+++ b/modules/dnn/src/layers/prior_box_layer.cpp
@ -346,7 +346,7 @@ public:
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

-        bool use_half = (inps.depth() == CV_16S);
+        bool use_half = (inps.depth() == CV_16F);
        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

@ -431,7 +431,7 @@ public:
        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/proposal_layer.cpp
+++ b/modules/dnn/src/layers/proposal_layer.cpp
@ -186,7 +186,7 @@ public:
        std::vector<UMat> outputs;
        std::vector<UMat> internals;

-        if (inputs_.depth() == CV_16S)
+        if (inputs_.depth() == CV_16F)
            return false;

        inputs_.getUMatVector(inputs);
@ -269,7 +269,7 @@ public:
                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/recurrent_layers.cpp
+++ b/modules/dnn/src/layers/recurrent_layers.cpp
@ -390,7 +390,7 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
@ -906,7 +906,7 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
@ -1066,7 +1066,7 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/reduce_layer.cpp
+++ b/modules/dnn/src/layers/reduce_layer.cpp
@ -456,7 +456,7 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/region_layer.cpp
+++ b/modules/dnn/src/layers/region_layer.cpp
@ -161,7 +161,7 @@ public:
        std::vector<UMat> outputs;

        // TODO: implement a logistic activation to classification scores.
-        if (useLogistic || inps.depth() == CV_16S)
+        if (useLogistic || inps.depth() == CV_16F)
            return false;

        inps.getUMatVector(inputs);
@ -232,7 +232,7 @@ public:
        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/reorg_layer.cpp
+++ b/modules/dnn/src/layers/reorg_layer.cpp
@ -184,7 +184,7 @@ public:
        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/resize_layer.cpp
+++ b/modules/dnn/src/layers/resize_layer.cpp
@ -115,7 +115,7 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/scale_layer.cpp
+++ b/modules/dnn/src/layers/scale_layer.cpp
@ -105,7 +105,7 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/scatterND_layer.cpp
+++ b/modules/dnn/src/layers/scatterND_layer.cpp
@ -74,7 +74,7 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-        if (inputs_arr.depth() == CV_16S) {
+        if (inputs_arr.depth() == CV_16F) {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }
--- a/modules/dnn/src/layers/scatter_layer.cpp
+++ b/modules/dnn/src/layers/scatter_layer.cpp
@ -68,7 +68,7 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-        if (inputs_arr.depth() == CV_16S) {
+        if (inputs_arr.depth() == CV_16F) {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }
--- a/modules/dnn/src/layers/shuffle_channel_layer.cpp
+++ b/modules/dnn/src/layers/shuffle_channel_layer.cpp
@ -107,7 +107,7 @@ public:
        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@ -621,7 +621,7 @@ public:
            {
                std::vector<int> inpIdx(dimsNum, 0);
                std::vector<int> outIdx(dimsNum, 0);
-                if (inpMat.type() == CV_16S)
+                if (inpMat.type() == CV_16F)
                    getSliceRecursive<int16_t>(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx);
                else if (inpMat.type() == CV_8S)
                    getSliceRecursive<int8_t>(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx);
--- a/modules/dnn/src/layers/softmax_layer.cpp
+++ b/modules/dnn/src/layers/softmax_layer.cpp
@ -130,7 +130,7 @@ public:
        std::vector<UMat> outputs;
        std::vector<UMat> internals;

-        bool use_half = (inputs_.depth() == CV_16S);
+        bool use_half = (inputs_.depth() == CV_16F);
        inputs_.getUMatVector(inputs);
        outputs_.getUMatVector(outputs);
        internals_.getUMatVector(internals);
@ -215,7 +215,7 @@ public:
        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/dnn/src/net_impl.cpp
+++ b/modules/dnn/src/net_impl.cpp
@ -501,7 +501,7 @@ void Net::Impl::allocateLayer(int lid, const LayersShapesMap& layersShapes)
    CV_Assert(layerShapesIt != layersShapes.end());

    if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_OPENCL_FP16 && ld.dtype == CV_32F)
-        ld.dtype = CV_16S;
+        ld.dtype = CV_16F;

    std::vector<LayerPin> pinsForInternalBlobs;
    blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs);
@ -559,7 +559,7 @@ void Net::Impl::allocateLayers(const std::vector<LayerPin>& blobsToKeep_)
            preferableTarget == DNN_TARGET_OPENCL_FP16 &&
            layers[0].dtype == CV_32F)
        {
-            layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16S);
+            layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16F);
        }
        inputShapes.push_back(shape(inp));
    }
@ -643,8 +643,8 @@ void Net::Impl::forwardLayer(LayerData& ld)
                    {
                        UMat& u = umat_outputBlobs[i];
                        Mat m;
-                        if (u.depth() == CV_16S)  // FP16
-                            convertFp16(u, m);
+                        if (u.depth() == CV_16F)  // FP16
+                            u.convertTo(m, CV_32F);
                        else
                            m = u.getMat(ACCESS_READ);
                        if (!checkRange(m))
@ -666,8 +666,8 @@ void Net::Impl::forwardLayer(LayerData& ld)
                        {
                            UMat& u = umat_inputBlobs[i];
                            Mat m;
-                            if (u.depth() == CV_16S)  // FP16
-                                convertFp16(u, m);
+                            if (u.depth() == CV_16F)  // FP16
+                                u.convertTo(m, CV_32F);
                            else
                                m = u.getMat(ACCESS_READ);
                            std::cout << "INPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
@ -677,8 +677,8 @@ void Net::Impl::forwardLayer(LayerData& ld)
                        {
                            UMat& u = umat_outputBlobs[i];
                            Mat m;
-                            if (u.depth() == CV_16S)  // FP16
-                                convertFp16(u, m);
+                            if (u.depth() == CV_16F)  // FP16
+                                u.convertTo(m, CV_32F);
                            else
                                m = u.getMat(ACCESS_READ);
                            std::cout << "OUTPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
@ -688,8 +688,8 @@ void Net::Impl::forwardLayer(LayerData& ld)
                        {
                            UMat& u = umat_internalBlobs[i];
                            Mat m;
-                            if (u.depth() == CV_16S)  // FP16
-                                convertFp16(u, m);
+                            if (u.depth() == CV_16F)  // FP16
+                                u.convertTo(m, CV_32F);
                            else
                                m = u.getMat(ACCESS_READ);
                            std::cout << "INTERNAL " << i << " " << shape(m) << std::endl;
@ -964,12 +964,12 @@ void Net::Impl::forward(OutputArrayOfArrays outputBlobs, const String& outputNam
                ld.outputBlobsWrappers[i]->copyToHost();
            }
        }
-        if (ld.outputBlobs[0].depth() == CV_16S)
+        if (ld.outputBlobs[0].depth() == CV_16F)
        {
            std::vector<Mat>& outputvec = *(std::vector<Mat>*)outputBlobs.getObj();
            outputvec.resize(ld.outputBlobs.size());
            for (int i = 0; i < outputvec.size(); i++)
-                convertFp16(ld.outputBlobs[i], outputvec[i]);
+                ld.outputBlobs[i].convertTo(outputvec[i], CV_32F);
        }
        else
        {
@ -992,7 +992,7 @@ void Net::Impl::forward(OutputArrayOfArrays outputBlobs, const String& outputNam
                std::vector<UMat> out_vec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
                outputvec.resize(out_vec.size());
                for (int i = 0; i < out_vec.size(); i++)
-                    convertFp16(out_vec[i], outputvec[i]);
+                    out_vec[i].convertTo(outputvec[i], CV_32F);
            }
        }
        else
@ -1258,7 +1258,7 @@ void Net::Impl::updateLayersShapes()
            preferableTarget == DNN_TARGET_OPENCL_FP16 &&
            inputLayerData.dtype == CV_32F)
        {
-            inp.create(inp.dims, inp.size, CV_16S);
+            inp.create(inp.dims, inp.size, CV_16F);
        }
        inputShapes.push_back(shape(inp));
    }
@ -1327,10 +1327,10 @@ Mat Net::Impl::getBlob(const LayerPin& pin) const
        ld.outputBlobsWrappers[pin.oid]->copyToHost();
    }

-    if (ld.outputBlobs[pin.oid].depth() == CV_16S)
+    if (ld.outputBlobs[pin.oid].depth() == CV_16F)
    {
        Mat output_blob;
-        convertFp16(ld.outputBlobs[pin.oid], output_blob);
+        ld.outputBlobs[pin.oid].convertTo(output_blob, CV_32F);
        return output_blob;
    }
    else
--- a/modules/dnn/src/ocl4dnn/src/math_functions.cpp
+++ b/modules/dnn/src/ocl4dnn/src/math_functions.cpp
@ -156,7 +156,7 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
    CHECK_EQ(gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_32_2 ||
             gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE, true) << "Invalid fast image gemm type." << std::endl;

-    bool halfPrecisionMode = (A.depth() == CV_16S);
+    bool halfPrecisionMode = (A.depth() == CV_16F);

    if (is_image_a)
    {
@ -439,7 +439,7 @@ static bool ocl4dnnFastBufferGEMM(const CBLAS_TRANSPOSE TransA,
    CHECK_EQ(gemm_type == GEMM_TYPE_FAST_BUFFER, true)
             << "Invalid fast buffer gemm type." << std::endl;

-    bool halfPrecisionMode = (A.depth() == CV_16S);
+    bool halfPrecisionMode = (A.depth() == CV_16F);

    size_t sub_group_size = 8;
    bool is_small_batch = (M == 2 || M == 4 || M == 8);
@ -544,7 +544,7 @@ bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB,
                       const UMat B_image, UMat C,
                       const size_t max_image_size)
 {
-    bool halfPrecisionMode = (A.depth() == CV_16S);
+    bool halfPrecisionMode = (A.depth() == CV_16F);
    gemm_type_t gemm_type = halfPrecisionMode ? GEMM_TYPE_FAST_BUFFER : GEMM_TYPE_FAST_IMAGE_32_1;

    if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 ||
@ -594,7 +594,7 @@ bool ocl4dnnGEMV<float>(const CBLAS_TRANSPOSE TransA,
                 const int32_t offy)
 {
    bool ret = false;
-    bool use_half = (A.depth() == CV_16S);
+    bool use_half = (A.depth() == CV_16F);
    String opts;
    if (use_half)
        opts = format("-DDtype=%s -DDtype4=%s -Dconvert_Dtype=convert_%s", "half", "half4", "half");
@ -665,7 +665,7 @@ bool ocl4dnnAXPY(const int32_t N, const Dtype alpha,
                 const UMat X, const int32_t offX, UMat Y,
                 const int32_t offY)
 {
-    bool use_half = (X.depth() == CV_16S);
+    bool use_half = (X.depth() == CV_16F);
    String opts;
    if (use_half)
        opts = "-DDtype=half -DDtype4=half4 -Dconvert_Dtype=convert_half";
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@ -582,10 +582,10 @@ bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
    }

    if (use_half_ && !bias.empty())
-        CV_CheckTypeEQ(bias.type(), CV_16SC1, "");
+        CV_CheckTypeEQ(bias.type(), CV_16FC1, "");

    if (use_half_)
-        CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
+        CV_CheckTypeEQ(weight.type(), CV_16FC1, "");

    prepareKernel(bottom, top, weight, bias, numImages);
    if (bestKernelConfig.empty())
@ -740,7 +740,7 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
    if (swizzled_weights_umat.empty())
        swizzled_weights_umat.create(1, (int)alignSize(num_output_, 16) * channels_ *
                                     kernel_h_ * (int)alignSize(kernel_w_, 2),
-                                     (use_half_) ? CV_16SC1 : CV_32FC1);
+                                     (use_half_) ? CV_16FC1 : CV_32FC1);

    if (!interleave) {
        int32_t channels = channels_ / group_;
@ -777,8 +777,8 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
        UMat weight_tmp; // FP32 in half mode, TODO implement FP16 repack
        if (use_half_)
        {
-            CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
-            convertFp16(weight, weight_tmp);
+            CV_CheckTypeEQ(weight.type(), CV_16FC1, "");
+            weight.convertTo(weight_tmp, CV_32F);
            weightMat = weight_tmp.getMat(ACCESS_READ);
            swizzledWeightMat.create(shape(swizzled_weights_umat), CV_32F);
        }
@ -817,7 +817,7 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
        weightMat.release();

        if (use_half_)
-            convertFp16(swizzledWeightMat, swizzled_weights_umat);
+            swizzledWeightMat.convertTo(swizzled_weights_umat, CV_16F);
    }

    return true;
@ -1140,7 +1140,7 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,

    //int32_t sz[4] = {numImages, num_output_, output_h_, output_w_};
    CV_CheckEQ(top.total(), (size_t)numImages * num_output_ * output_h_ * output_w_, "");
-    CV_CheckTypeEQ(top.type(), (use_half_) ? CV_16SC1 : CV_32FC1, "");
+    CV_CheckTypeEQ(top.type(), (use_half_) ? CV_16FC1 : CV_32FC1, "");
    top.setTo(Scalar::all(0));

    bool saved_tuned = tuned_;
@ -1154,8 +1154,8 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
    Mat mat_top, mat_verify_top;
    if (use_half_)
    {
-        convertFp16(top, new_top);
-        convertFp16(verifyTop, new_verify_top);
+        top.convertTo(new_top, CV_32F);
+        verifyTop.convertTo(new_verify_top, CV_32F);

        mat_top = new_top.getMat(ACCESS_READ);
        mat_verify_top = new_verify_top.getMat(ACCESS_READ);
@ -1827,7 +1827,7 @@ void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
    if (loadTunedConfig())  // check external storage
        return;

-    UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1);
+    UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16FC1 : CV_32FC1);

    calculateBenchmark(bottom, benchData, weight, bias, numImages);

--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp
@ -102,10 +102,10 @@ bool OCL4DNNInnerProduct<Dtype>::Forward(const UMat& bottom,
                UMat biasOneMat = UMat::ones(M_, 1, CV_32F);
                UMat newbias, tmpTop;

-                convertFp16(bias, newbias);
-                convertFp16(top, tmpTop);
+                bias.convertTo(newbias, CV_32F);
+                top.convertTo(tmpTop, CV_32F);
                cv::gemm(biasOneMat, newbias, 1, tmpTop, 1, tmpTop, 0);
-                convertFp16(tmpTop, top);
+                tmpTop.convertTo(top, CV_16F);
            } else {
                UMat biasOnesMat = UMat::ones(M_, 1, CV_32F);
                cv::gemm(biasOnesMat, bias, 1, top, 1, top, 0);
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@ -2447,7 +2447,7 @@ void ONNXImporter::parseCast(LayerParams& layerParams, const opencv_onnx::NodePr
            case opencv_onnx::TensorProto_DataType_FLOAT:   type = CV_32F; break;
            case opencv_onnx::TensorProto_DataType_UINT8:   type = CV_8U; break;
            case opencv_onnx::TensorProto_DataType_UINT16:  type = CV_16U; break;
-            case opencv_onnx::TensorProto_DataType_FLOAT16: type = CV_16S; break;
+            case opencv_onnx::TensorProto_DataType_FLOAT16: type = CV_16F; break;
            case opencv_onnx::TensorProto_DataType_INT8:
            case opencv_onnx::TensorProto_DataType_INT16:
            case opencv_onnx::TensorProto_DataType_INT32:
--- a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp
+++ b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp
@ -915,22 +915,22 @@ Mat getTensorContentRef_(const tensorflow::TensorProto& tensor)
        }
        case tensorflow::DT_HALF:
        {
-            Mat halfs;
            if (!content.empty())
            {
                static const int kHalfSize = 2;
-                halfs = Mat(1, content.size() / kHalfSize, CV_16UC1, (void*)content.c_str());
+                Mat halfs(1, content.size() / kHalfSize, CV_16FC1, (void*)content.c_str());
+                halfs.convertTo(m, CV_32F);
            }
            else
            {
                const RepeatedField<int32_t>& field = tensor.half_val();
                CV_Assert(!field.empty());
                Mat ints(1, field.size(), CV_32SC1, (void*)field.data());
+                Mat halfs;
                ints.convertTo(halfs, CV_16UC1);
+                Mat halfsSigned(halfs.size(), CV_16FC1, halfs.data);
+                halfsSigned.convertTo(m, CV_32F);
            }
-            // Reinterpret as a signed shorts just for a convertFp16 call.
-            Mat halfsSigned(halfs.size(), CV_16SC1, halfs.data);
-            convertFp16(halfsSigned, m);
            break;
        }
        case tensorflow::DT_QUINT8:
--- a/modules/dnn/src/tflite/tflite_importer.cpp
+++ b/modules/dnn/src/tflite/tflite_importer.cpp
@ -101,7 +101,7 @@ Mat TFLiteImporter::parseTensor(const Tensor& tensor)
        dtype = CV_32S;
        break;
    case TensorType_FLOAT16:
-        dtype = CV_16S;
+        dtype = CV_16F;
        break;
    case TensorType_INT8:
        dtype = CV_8S;
@ -227,7 +227,7 @@ void TFLiteImporter::populateNet()
                if (!data.empty()) {
                    // Dequantize a buffer
                    Mat dataFP32;
-                    convertFp16(data, dataFP32);
+                    data.convertTo(dataFP32, CV_32F);
                    allTensors[op_outputs->Get(0)] = dataFP32;
                    continue;
                }
--- a/modules/dnn/src/vkcom/include/op_naryeltwise.hpp
+++ b/modules/dnn/src/vkcom/include/op_naryeltwise.hpp
@ -0,0 +1,87 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_OP_NARY_HPP
+#define OPENCV_OP_NARY_HPP
+
+#include "vkcom.hpp"
+#include "op_base.hpp"
+
+namespace cv { namespace dnn { namespace vkcom {
+
+#ifdef HAVE_VULKAN
+
+enum NaryShaderType
+{
+    kNaryShaderTypeBinary,
+    kNaryShaderTypeTrinary,
+    kNaryShaderTypeNary,
+    kNaryShaderTest,
+};
+
+struct NaryShaderConfig
+{
+    int local_size_x;
+    int local_size_y;
+    int local_size_z;
+};
+
+
+class OpNary : public OpBase
+{
+public:
+    // Copied from nary_eltwise_layers.cpp
+    enum class OPERATION
+    {
+        AND = 0,
+        EQUAL,
+        GREATER,
+        GREATER_EQUAL,
+        LESS,
+        LESS_EQUAL,
+        OR,
+        POW,
+        XOR,
+        BITSHIFT,
+        MAX,
+        MEAN,
+        MIN,
+        MOD,
+        PROD,
+        SUB,
+        SUM,
+        ADD,
+        DIV,
+        WHERE,
+    };
+
+    OpNary(const OPERATION naryOpType, int ninputs, int max_ndims, const std::vector<std::vector<int>> shapes, const std::vector<std::vector<size_t>> steps);
+
+    void firstForward(); // Execute only in the first forward.
+    virtual bool forward(std::vector<Tensor>& ins, std::vector<Tensor>& outs) CV_OVERRIDE;
+    Ptr<Tensor> weightTensorPtr;
+private:
+    bool computeGroupCount();
+    bool binaryForward(std::vector<Tensor>& ins, std::vector<Tensor>& outs);
+    bool trinaryForward(std::vector<Tensor>& ins, std::vector<Tensor>& outs);
+    bool naryForward(std::vector<Tensor>& ins, std::vector<Tensor>& outs);
+
+    const OPERATION naryOpType;
+    NaryShaderType shaderType;
+    NaryShaderConfig config;
+    int ninputs;
+    int max_ndims;
+    AutoBuffer<int32_t> shapesBuf;
+    AutoBuffer<int32_t> stepsBuf;
+    int nplanes; // number of planes computations are to be performed on
+    int N2; // value of shape[ndims - 2]
+    int N1; // value of shape[ndims - 1]
+
+    bool firstForwardFinsh = false;
+};
+
+#endif // HAVE_VULKAN
+
+}}} // namespace cv::dnn::vkcom
+#endif //OPENCV_OP_MATMUL_HPP
--- a/modules/dnn/src/vkcom/include/vkcom.hpp
+++ b/modules/dnn/src/vkcom/include/vkcom.hpp
@ -51,5 +51,6 @@ bool isAvailable();
 #include "op_base.hpp"
 #include "op_conv.hpp"
 #include "op_matmul.hpp"
+#include "op_naryeltwise.hpp"

 #endif // OPENCV_DNN_VKCOM_HPP
--- a/modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward.comp
+++ b/modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward.comp
@ -0,0 +1,116 @@
+#version 450
+// #extension GL_EXT_debug_printf : enable
+#define ALL_THREAD 1024
+// #define ALL_THREAD 128 // Experimental batched operation
+#define STEP_SIZE 65536
+
+layout(binding = 0) readonly buffer Input1{
+    float matA[];
+};
+
+layout(binding = 1) readonly buffer Input2{
+    float matB[];
+};
+
+layout(binding = 2) writeonly buffer Output{
+    float matOut[];
+};
+
+layout(binding = 3) uniform Params {
+    int opType;
+    int ndims;
+} params;
+
+layout(binding = 4) readonly buffer Shape {
+    int shape[];
+};
+
+layout(binding = 5) readonly buffer Step {
+    int matStep[];
+};
+
+/* local_size_x, local_size_y, local_size_z there defines the number of invocations
+   of this compute shader in the current work group. */
+// TODO: Check if this makes any sense
+// TODO: Check if it is required to fetch PhysicalDeviceLimit from Context
+// TODO: here we shall assume that maxGroupInvocation is 1024.
+layout(local_size_x = ALL_THREAD, local_size_y = 1, local_size_z = 1) in; // TODO: Check if this makes any sense
+
+const int AND = 0;
+const int EQUAL = 1;
+const int GREATER = 2;
+const int GREATER_EQUAL = 3;
+const int LESS = 4;
+const int LESS_EQUAL = 5;
+const int OR = 6;
+const int POW = 7;
+const int XOR = 8;
+const int BITSHIFT = 9;
+const int MAX = 10;
+const int MEAN = 11;
+const int MIN = 12;
+const int MOD = 13;
+const int FMOD = 14;
+const int PROD = 15;
+const int SUB = 16;
+const int SUM = 17;
+const int ADD = 18;
+const int DIV = 19;
+const int WHERE = 20;
+
+void binary_forward()
+{
+    int ndims = params.ndims;
+    int dp1 = matStep[2 * ndims - 1];
+    int dp2 = matStep[3 * ndims - 1];
+    int dp = matStep[ndims - 1];
+    int n1 = shape[ndims - 1], n2 = shape[ndims - 2];
+
+    int plane_idx = int(gl_WorkGroupID.x);
+
+    int ptr1 = 0;
+    int ptr2 = 0;
+    int ptr = 0;
+    int idx = plane_idx;
+
+    for (int k = ndims - 3; k >= 0; --k) {
+        int next_idx = idx / shape[k];
+        int i_k = idx - next_idx * shape[k]; // i_k = idx % shape[k]
+        ptr1 += i_k * matStep[ndims + k];
+        ptr2 += i_k * matStep[2 * ndims + k];
+        ptr += i_k * matStep[k];
+        idx = next_idx;
+    }
+
+    int i2_offset = int(gl_WorkGroupID.y);
+    int i1_offset = int(gl_LocalInvocationID.x);
+
+    ptr1 += i2_offset * matStep[2 * ndims - 2];
+    ptr2 += i2_offset * matStep[3 * ndims - 2];
+    ptr += i2_offset * matStep[ndims - 2];
+
+    for (int i1 = i1_offset; i1 < n1; i1 += ALL_THREAD) {
+        switch (params.opType) {
+            case int(ADD):
+                matOut[ptr + i1 * dp] = matA[ptr1 + i1 * dp1] + matB[ptr2 + i1 * dp2];
+                break;
+            case int(SUB):
+                matOut[ptr + i1 * dp] = matA[ptr1 + i1 * dp1] - matB[ptr2 + i1 * dp2];
+                break;
+            case int(PROD):
+                matOut[ptr + i1 * dp] = matA[ptr1 + i1 * dp1] * matB[ptr2 + i1 * dp2];
+                break;
+            case int(DIV):
+                matOut[ptr + i1 * dp] = matA[ptr1 + i1 * dp1] / matB[ptr2 + i1 * dp2];
+                break;
+        }
+    }
+}
+
+
+void main()
+{
+    // debugPrintfEXT("nary_eltwise_binary_forward.comp loaded\n");
+    binary_forward();
+    return;
+}
--- a/modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward_spv.cpp
+++ b/modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward_spv.cpp
@ -0,0 +1,232 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../../precomp.hpp"
+
+namespace cv { namespace dnn { namespace vkcom {
+
+extern const unsigned int nary_eltwise_binary_forward_spv[1757] = {
+    0x07230203,0x00010000,0x0008000b,0x00000131,0x00000000,0x00020011,0x00000001,0x0006000b,
+    0x00000001,0x4c534c47,0x6474732e,0x3035342e,0x00000000,0x0003000e,0x00000000,0x00000001,
+    0x0007000f,0x00000005,0x00000004,0x6e69616d,0x00000000,0x0000003c,0x00000083,0x00060010,
+    0x00000004,0x00000011,0x00000400,0x00000001,0x00000001,0x00030003,0x00000002,0x000001c2,
+    0x00040005,0x00000004,0x6e69616d,0x00000000,0x00060005,0x00000006,0x616e6962,0x665f7972,
+    0x6177726f,0x00286472,0x00040005,0x0000000a,0x6d69646e,0x00000073,0x00040005,0x0000000b,
+    0x61726150,0x0000736d,0x00050006,0x0000000b,0x00000000,0x7954706f,0x00006570,0x00050006,
+    0x0000000b,0x00000001,0x6d69646e,0x00000073,0x00040005,0x0000000d,0x61726170,0x0000736d,
+    0x00030005,0x00000012,0x00317064,0x00040005,0x00000014,0x70657453,0x00000000,0x00050006,
+    0x00000014,0x00000000,0x5374616d,0x00706574,0x00030005,0x00000016,0x00000000,0x00030005,
+    0x0000001e,0x00327064,0x00030005,0x00000025,0x00007064,0x00030005,0x0000002a,0x0000316e,
+    0x00040005,0x0000002c,0x70616853,0x00000065,0x00050006,0x0000002c,0x00000000,0x70616873,
+    0x00000065,0x00030005,0x0000002e,0x00000000,0x00030005,0x00000033,0x0000326e,0x00050005,
+    0x00000038,0x6e616c70,0x64695f65,0x00000078,0x00060005,0x0000003c,0x575f6c67,0x476b726f,
+    0x70756f72,0x00004449,0x00040005,0x00000042,0x31727470,0x00000000,0x00040005,0x00000043,
+    0x32727470,0x00000000,0x00030005,0x00000044,0x00727470,0x00030005,0x00000045,0x00786469,
+    0x00030005,0x00000047,0x0000006b,0x00050005,0x00000052,0x7478656e,0x7864695f,0x00000000,
+    0x00030005,0x00000058,0x006b5f69,0x00050005,0x0000007d,0x6f5f3269,0x65736666,0x00000074,
+    0x00050005,0x00000082,0x6f5f3169,0x65736666,0x00000074,0x00080005,0x00000083,0x4c5f6c67,
+    0x6c61636f,0x6f766e49,0x69746163,0x44496e6f,0x00000000,0x00030005,0x000000a1,0x00003169,
+    0x00040005,0x000000b4,0x7074754f,0x00007475,0x00050006,0x000000b4,0x00000000,0x4f74616d,
+    0x00007475,0x00030005,0x000000b6,0x00000000,0x00040005,0x000000bd,0x75706e49,0x00003174,
+    0x00050006,0x000000bd,0x00000000,0x4174616d,0x00000000,0x00030005,0x000000bf,0x00000000,
+    0x00040005,0x000000c9,0x75706e49,0x00003274,0x00050006,0x000000c9,0x00000000,0x4274616d,
+    0x00000000,0x00030005,0x000000cb,0x00000000,0x00050048,0x0000000b,0x00000000,0x00000023,
+    0x00000000,0x00050048,0x0000000b,0x00000001,0x00000023,0x00000004,0x00030047,0x0000000b,
+    0x00000002,0x00040047,0x0000000d,0x00000022,0x00000000,0x00040047,0x0000000d,0x00000021,
+    0x00000003,0x00040047,0x00000013,0x00000006,0x00000004,0x00040048,0x00000014,0x00000000,
+    0x00000018,0x00050048,0x00000014,0x00000000,0x00000023,0x00000000,0x00030047,0x00000014,
+    0x00000003,0x00040047,0x00000016,0x00000022,0x00000000,0x00040047,0x00000016,0x00000021,
+    0x00000005,0x00040047,0x0000002b,0x00000006,0x00000004,0x00040048,0x0000002c,0x00000000,
+    0x00000018,0x00050048,0x0000002c,0x00000000,0x00000023,0x00000000,0x00030047,0x0000002c,
+    0x00000003,0x00040047,0x0000002e,0x00000022,0x00000000,0x00040047,0x0000002e,0x00000021,
+    0x00000004,0x00040047,0x0000003c,0x0000000b,0x0000001a,0x00040047,0x00000083,0x0000000b,
+    0x0000001b,0x00040047,0x000000b3,0x00000006,0x00000004,0x00040048,0x000000b4,0x00000000,
+    0x00000019,0x00050048,0x000000b4,0x00000000,0x00000023,0x00000000,0x00030047,0x000000b4,
+    0x00000003,0x00040047,0x000000b6,0x00000022,0x00000000,0x00040047,0x000000b6,0x00000021,
+    0x00000002,0x00040047,0x000000bc,0x00000006,0x00000004,0x00040048,0x000000bd,0x00000000,
+    0x00000018,0x00050048,0x000000bd,0x00000000,0x00000023,0x00000000,0x00030047,0x000000bd,
+    0x00000003,0x00040047,0x000000bf,0x00000022,0x00000000,0x00040047,0x000000bf,0x00000021,
+    0x00000000,0x00040047,0x000000c8,0x00000006,0x00000004,0x00040048,0x000000c9,0x00000000,
+    0x00000018,0x00050048,0x000000c9,0x00000000,0x00000023,0x00000000,0x00030047,0x000000c9,
+    0x00000003,0x00040047,0x000000cb,0x00000022,0x00000000,0x00040047,0x000000cb,0x00000021,
+    0x00000001,0x00040047,0x0000011f,0x0000000b,0x00000019,0x00020013,0x00000002,0x00030021,
+    0x00000003,0x00000002,0x00040015,0x00000008,0x00000020,0x00000001,0x00040020,0x00000009,
+    0x00000007,0x00000008,0x0004001e,0x0000000b,0x00000008,0x00000008,0x00040020,0x0000000c,
+    0x00000002,0x0000000b,0x0004003b,0x0000000c,0x0000000d,0x00000002,0x0004002b,0x00000008,
+    0x0000000e,0x00000001,0x00040020,0x0000000f,0x00000002,0x00000008,0x0003001d,0x00000013,
+    0x00000008,0x0003001e,0x00000014,0x00000013,0x00040020,0x00000015,0x00000002,0x00000014,
+    0x0004003b,0x00000015,0x00000016,0x00000002,0x0004002b,0x00000008,0x00000017,0x00000000,
+    0x0004002b,0x00000008,0x00000018,0x00000002,0x0004002b,0x00000008,0x0000001f,0x00000003,
+    0x0003001d,0x0000002b,0x00000008,0x0003001e,0x0000002c,0x0000002b,0x00040020,0x0000002d,
+    0x00000002,0x0000002c,0x0004003b,0x0000002d,0x0000002e,0x00000002,0x00040015,0x00000039,
+    0x00000020,0x00000000,0x00040017,0x0000003a,0x00000039,0x00000003,0x00040020,0x0000003b,
+    0x00000001,0x0000003a,0x0004003b,0x0000003b,0x0000003c,0x00000001,0x0004002b,0x00000039,
+    0x0000003d,0x00000000,0x00040020,0x0000003e,0x00000001,0x00000039,0x00020014,0x00000050,
+    0x0004002b,0x00000039,0x0000007e,0x00000001,0x0004003b,0x0000003b,0x00000083,0x00000001,
+    0x00030016,0x000000b2,0x00000020,0x0003001d,0x000000b3,0x000000b2,0x0003001e,0x000000b4,
+    0x000000b3,0x00040020,0x000000b5,0x00000002,0x000000b4,0x0004003b,0x000000b5,0x000000b6,
+    0x00000002,0x0003001d,0x000000bc,0x000000b2,0x0003001e,0x000000bd,0x000000bc,0x00040020,
+    0x000000be,0x00000002,0x000000bd,0x0004003b,0x000000be,0x000000bf,0x00000002,0x00040020,
+    0x000000c5,0x00000002,0x000000b2,0x0003001d,0x000000c8,0x000000b2,0x0003001e,0x000000c9,
+    0x000000c8,0x00040020,0x000000ca,0x00000002,0x000000c9,0x0004003b,0x000000ca,0x000000cb,
+    0x00000002,0x0004002b,0x00000008,0x00000119,0x00000400,0x0004002b,0x00000039,0x0000011e,
+    0x00000400,0x0006002c,0x0000003a,0x0000011f,0x0000011e,0x0000007e,0x0000007e,0x0004002b,
+    0x00000008,0x00000120,0x00000004,0x0004002b,0x00000008,0x00000121,0x00000005,0x0004002b,
+    0x00000008,0x00000122,0x00000006,0x0004002b,0x00000008,0x00000123,0x00000007,0x0004002b,
+    0x00000008,0x00000124,0x00000008,0x0004002b,0x00000008,0x00000125,0x00000009,0x0004002b,
+    0x00000008,0x00000126,0x0000000a,0x0004002b,0x00000008,0x00000127,0x0000000b,0x0004002b,
+    0x00000008,0x00000128,0x0000000c,0x0004002b,0x00000008,0x00000129,0x0000000d,0x0004002b,
+    0x00000008,0x0000012a,0x0000000e,0x0004002b,0x00000008,0x0000012b,0x0000000f,0x0004002b,
+    0x00000008,0x0000012c,0x00000010,0x0004002b,0x00000008,0x0000012d,0x00000011,0x0004002b,
+    0x00000008,0x0000012e,0x00000012,0x0004002b,0x00000008,0x0000012f,0x00000013,0x0004002b,
+    0x00000008,0x00000130,0x00000014,0x00050036,0x00000002,0x00000004,0x00000000,0x00000003,
+    0x000200f8,0x00000005,0x00040039,0x00000002,0x0000011c,0x00000006,0x000100fd,0x00010038,
+    0x00050036,0x00000002,0x00000006,0x00000000,0x00000003,0x000200f8,0x00000007,0x0004003b,
+    0x00000009,0x0000000a,0x00000007,0x0004003b,0x00000009,0x00000012,0x00000007,0x0004003b,
+    0x00000009,0x0000001e,0x00000007,0x0004003b,0x00000009,0x00000025,0x00000007,0x0004003b,
+    0x00000009,0x0000002a,0x00000007,0x0004003b,0x00000009,0x00000033,0x00000007,0x0004003b,
+    0x00000009,0x00000038,0x00000007,0x0004003b,0x00000009,0x00000042,0x00000007,0x0004003b,
+    0x00000009,0x00000043,0x00000007,0x0004003b,0x00000009,0x00000044,0x00000007,0x0004003b,
+    0x00000009,0x00000045,0x00000007,0x0004003b,0x00000009,0x00000047,0x00000007,0x0004003b,
+    0x00000009,0x00000052,0x00000007,0x0004003b,0x00000009,0x00000058,0x00000007,0x0004003b,
+    0x00000009,0x0000007d,0x00000007,0x0004003b,0x00000009,0x00000082,0x00000007,0x0004003b,
+    0x00000009,0x000000a1,0x00000007,0x00050041,0x0000000f,0x00000010,0x0000000d,0x0000000e,
+    0x0004003d,0x00000008,0x00000011,0x00000010,0x0003003e,0x0000000a,0x00000011,0x0004003d,
+    0x00000008,0x00000019,0x0000000a,0x00050084,0x00000008,0x0000001a,0x00000018,0x00000019,
+    0x00050082,0x00000008,0x0000001b,0x0000001a,0x0000000e,0x00060041,0x0000000f,0x0000001c,
+    0x00000016,0x00000017,0x0000001b,0x0004003d,0x00000008,0x0000001d,0x0000001c,0x0003003e,
+    0x00000012,0x0000001d,0x0004003d,0x00000008,0x00000020,0x0000000a,0x00050084,0x00000008,
+    0x00000021,0x0000001f,0x00000020,0x00050082,0x00000008,0x00000022,0x00000021,0x0000000e,
+    0x00060041,0x0000000f,0x00000023,0x00000016,0x00000017,0x00000022,0x0004003d,0x00000008,
+    0x00000024,0x00000023,0x0003003e,0x0000001e,0x00000024,0x0004003d,0x00000008,0x00000026,
+    0x0000000a,0x00050082,0x00000008,0x00000027,0x00000026,0x0000000e,0x00060041,0x0000000f,
+    0x00000028,0x00000016,0x00000017,0x00000027,0x0004003d,0x00000008,0x00000029,0x00000028,
+    0x0003003e,0x00000025,0x00000029,0x0004003d,0x00000008,0x0000002f,0x0000000a,0x00050082,
+    0x00000008,0x00000030,0x0000002f,0x0000000e,0x00060041,0x0000000f,0x00000031,0x0000002e,
+    0x00000017,0x00000030,0x0004003d,0x00000008,0x00000032,0x00000031,0x0003003e,0x0000002a,
+    0x00000032,0x0004003d,0x00000008,0x00000034,0x0000000a,0x00050082,0x00000008,0x00000035,
+    0x00000034,0x00000018,0x00060041,0x0000000f,0x00000036,0x0000002e,0x00000017,0x00000035,
+    0x0004003d,0x00000008,0x00000037,0x00000036,0x0003003e,0x00000033,0x00000037,0x00050041,
+    0x0000003e,0x0000003f,0x0000003c,0x0000003d,0x0004003d,0x00000039,0x00000040,0x0000003f,
+    0x0004007c,0x00000008,0x00000041,0x00000040,0x0003003e,0x00000038,0x00000041,0x0003003e,
+    0x00000042,0x00000017,0x0003003e,0x00000043,0x00000017,0x0003003e,0x00000044,0x00000017,
+    0x0004003d,0x00000008,0x00000046,0x00000038,0x0003003e,0x00000045,0x00000046,0x0004003d,
+    0x00000008,0x00000048,0x0000000a,0x00050082,0x00000008,0x00000049,0x00000048,0x0000001f,
+    0x0003003e,0x00000047,0x00000049,0x000200f9,0x0000004a,0x000200f8,0x0000004a,0x000400f6,
+    0x0000004c,0x0000004d,0x00000000,0x000200f9,0x0000004e,0x000200f8,0x0000004e,0x0004003d,
+    0x00000008,0x0000004f,0x00000047,0x000500af,0x00000050,0x00000051,0x0000004f,0x00000017,
+    0x000400fa,0x00000051,0x0000004b,0x0000004c,0x000200f8,0x0000004b,0x0004003d,0x00000008,
+    0x00000053,0x00000045,0x0004003d,0x00000008,0x00000054,0x00000047,0x00060041,0x0000000f,
+    0x00000055,0x0000002e,0x00000017,0x00000054,0x0004003d,0x00000008,0x00000056,0x00000055,
+    0x00050087,0x00000008,0x00000057,0x00000053,0x00000056,0x0003003e,0x00000052,0x00000057,
+    0x0004003d,0x00000008,0x00000059,0x00000045,0x0004003d,0x00000008,0x0000005a,0x00000052,
+    0x0004003d,0x00000008,0x0000005b,0x00000047,0x00060041,0x0000000f,0x0000005c,0x0000002e,
+    0x00000017,0x0000005b,0x0004003d,0x00000008,0x0000005d,0x0000005c,0x00050084,0x00000008,
+    0x0000005e,0x0000005a,0x0000005d,0x00050082,0x00000008,0x0000005f,0x00000059,0x0000005e,
+    0x0003003e,0x00000058,0x0000005f,0x0004003d,0x00000008,0x00000060,0x00000058,0x0004003d,
+    0x00000008,0x00000061,0x0000000a,0x0004003d,0x00000008,0x00000062,0x00000047,0x00050080,
+    0x00000008,0x00000063,0x00000061,0x00000062,0x00060041,0x0000000f,0x00000064,0x00000016,
+    0x00000017,0x00000063,0x0004003d,0x00000008,0x00000065,0x00000064,0x00050084,0x00000008,
+    0x00000066,0x00000060,0x00000065,0x0004003d,0x00000008,0x00000067,0x00000042,0x00050080,
+    0x00000008,0x00000068,0x00000067,0x00000066,0x0003003e,0x00000042,0x00000068,0x0004003d,
+    0x00000008,0x00000069,0x00000058,0x0004003d,0x00000008,0x0000006a,0x0000000a,0x00050084,
+    0x00000008,0x0000006b,0x00000018,0x0000006a,0x0004003d,0x00000008,0x0000006c,0x00000047,
+    0x00050080,0x00000008,0x0000006d,0x0000006b,0x0000006c,0x00060041,0x0000000f,0x0000006e,
+    0x00000016,0x00000017,0x0000006d,0x0004003d,0x00000008,0x0000006f,0x0000006e,0x00050084,
+    0x00000008,0x00000070,0x00000069,0x0000006f,0x0004003d,0x00000008,0x00000071,0x00000043,
+    0x00050080,0x00000008,0x00000072,0x00000071,0x00000070,0x0003003e,0x00000043,0x00000072,
+    0x0004003d,0x00000008,0x00000073,0x00000058,0x0004003d,0x00000008,0x00000074,0x00000047,
+    0x00060041,0x0000000f,0x00000075,0x00000016,0x00000017,0x00000074,0x0004003d,0x00000008,
+    0x00000076,0x00000075,0x00050084,0x00000008,0x00000077,0x00000073,0x00000076,0x0004003d,
+    0x00000008,0x00000078,0x00000044,0x00050080,0x00000008,0x00000079,0x00000078,0x00000077,
+    0x0003003e,0x00000044,0x00000079,0x0004003d,0x00000008,0x0000007a,0x00000052,0x0003003e,
+    0x00000045,0x0000007a,0x000200f9,0x0000004d,0x000200f8,0x0000004d,0x0004003d,0x00000008,
+    0x0000007b,0x00000047,0x00050082,0x00000008,0x0000007c,0x0000007b,0x0000000e,0x0003003e,
+    0x00000047,0x0000007c,0x000200f9,0x0000004a,0x000200f8,0x0000004c,0x00050041,0x0000003e,
+    0x0000007f,0x0000003c,0x0000007e,0x0004003d,0x00000039,0x00000080,0x0000007f,0x0004007c,
+    0x00000008,0x00000081,0x00000080,0x0003003e,0x0000007d,0x00000081,0x00050041,0x0000003e,
+    0x00000084,0x00000083,0x0000003d,0x0004003d,0x00000039,0x00000085,0x00000084,0x0004007c,
+    0x00000008,0x00000086,0x00000085,0x0003003e,0x00000082,0x00000086,0x0004003d,0x00000008,
+    0x00000087,0x0000007d,0x0004003d,0x00000008,0x00000088,0x0000000a,0x00050084,0x00000008,
+    0x00000089,0x00000018,0x00000088,0x00050082,0x00000008,0x0000008a,0x00000089,0x00000018,
+    0x00060041,0x0000000f,0x0000008b,0x00000016,0x00000017,0x0000008a,0x0004003d,0x00000008,
+    0x0000008c,0x0000008b,0x00050084,0x00000008,0x0000008d,0x00000087,0x0000008c,0x0004003d,
+    0x00000008,0x0000008e,0x00000042,0x00050080,0x00000008,0x0000008f,0x0000008e,0x0000008d,
+    0x0003003e,0x00000042,0x0000008f,0x0004003d,0x00000008,0x00000090,0x0000007d,0x0004003d,
+    0x00000008,0x00000091,0x0000000a,0x00050084,0x00000008,0x00000092,0x0000001f,0x00000091,
+    0x00050082,0x00000008,0x00000093,0x00000092,0x00000018,0x00060041,0x0000000f,0x00000094,
+    0x00000016,0x00000017,0x00000093,0x0004003d,0x00000008,0x00000095,0x00000094,0x00050084,
+    0x00000008,0x00000096,0x00000090,0x00000095,0x0004003d,0x00000008,0x00000097,0x00000043,
+    0x00050080,0x00000008,0x00000098,0x00000097,0x00000096,0x0003003e,0x00000043,0x00000098,
+    0x0004003d,0x00000008,0x00000099,0x0000007d,0x0004003d,0x00000008,0x0000009a,0x0000000a,
+    0x00050082,0x00000008,0x0000009b,0x0000009a,0x00000018,0x00060041,0x0000000f,0x0000009c,
+    0x00000016,0x00000017,0x0000009b,0x0004003d,0x00000008,0x0000009d,0x0000009c,0x00050084,
+    0x00000008,0x0000009e,0x00000099,0x0000009d,0x0004003d,0x00000008,0x0000009f,0x00000044,
+    0x00050080,0x00000008,0x000000a0,0x0000009f,0x0000009e,0x0003003e,0x00000044,0x000000a0,
+    0x0004003d,0x00000008,0x000000a2,0x00000082,0x0003003e,0x000000a1,0x000000a2,0x000200f9,
+    0x000000a3,0x000200f8,0x000000a3,0x000400f6,0x000000a5,0x000000a6,0x00000000,0x000200f9,
+    0x000000a7,0x000200f8,0x000000a7,0x0004003d,0x00000008,0x000000a8,0x000000a1,0x0004003d,
+    0x00000008,0x000000a9,0x0000002a,0x000500b1,0x00000050,0x000000aa,0x000000a8,0x000000a9,
+    0x000400fa,0x000000aa,0x000000a4,0x000000a5,0x000200f8,0x000000a4,0x00050041,0x0000000f,
+    0x000000ab,0x0000000d,0x00000017,0x0004003d,0x00000008,0x000000ac,0x000000ab,0x000300f7,
+    0x000000b1,0x00000000,0x000b00fb,0x000000ac,0x000000b1,0x00000012,0x000000ad,0x00000010,
+    0x000000ae,0x0000000f,0x000000af,0x00000013,0x000000b0,0x000200f8,0x000000ad,0x0004003d,
+    0x00000008,0x000000b7,0x00000044,0x0004003d,0x00000008,0x000000b8,0x000000a1,0x0004003d,
+    0x00000008,0x000000b9,0x00000025,0x00050084,0x00000008,0x000000ba,0x000000b8,0x000000b9,
+    0x00050080,0x00000008,0x000000bb,0x000000b7,0x000000ba,0x0004003d,0x00000008,0x000000c0,
+    0x00000042,0x0004003d,0x00000008,0x000000c1,0x000000a1,0x0004003d,0x00000008,0x000000c2,
+    0x00000012,0x00050084,0x00000008,0x000000c3,0x000000c1,0x000000c2,0x00050080,0x00000008,
+    0x000000c4,0x000000c0,0x000000c3,0x00060041,0x000000c5,0x000000c6,0x000000bf,0x00000017,
+    0x000000c4,0x0004003d,0x000000b2,0x000000c7,0x000000c6,0x0004003d,0x00000008,0x000000cc,
+    0x00000043,0x0004003d,0x00000008,0x000000cd,0x000000a1,0x0004003d,0x00000008,0x000000ce,
+    0x0000001e,0x00050084,0x00000008,0x000000cf,0x000000cd,0x000000ce,0x00050080,0x00000008,
+    0x000000d0,0x000000cc,0x000000cf,0x00060041,0x000000c5,0x000000d1,0x000000cb,0x00000017,
+    0x000000d0,0x0004003d,0x000000b2,0x000000d2,0x000000d1,0x00050081,0x000000b2,0x000000d3,
+    0x000000c7,0x000000d2,0x00060041,0x000000c5,0x000000d4,0x000000b6,0x00000017,0x000000bb,
+    0x0003003e,0x000000d4,0x000000d3,0x000200f9,0x000000b1,0x000200f8,0x000000ae,0x0004003d,
+    0x00000008,0x000000d6,0x00000044,0x0004003d,0x00000008,0x000000d7,0x000000a1,0x0004003d,
+    0x00000008,0x000000d8,0x00000025,0x00050084,0x00000008,0x000000d9,0x000000d7,0x000000d8,
+    0x00050080,0x00000008,0x000000da,0x000000d6,0x000000d9,0x0004003d,0x00000008,0x000000db,
+    0x00000042,0x0004003d,0x00000008,0x000000dc,0x000000a1,0x0004003d,0x00000008,0x000000dd,
+    0x00000012,0x00050084,0x00000008,0x000000de,0x000000dc,0x000000dd,0x00050080,0x00000008,
+    0x000000df,0x000000db,0x000000de,0x00060041,0x000000c5,0x000000e0,0x000000bf,0x00000017,
+    0x000000df,0x0004003d,0x000000b2,0x000000e1,0x000000e0,0x0004003d,0x00000008,0x000000e2,
+    0x00000043,0x0004003d,0x00000008,0x000000e3,0x000000a1,0x0004003d,0x00000008,0x000000e4,
+    0x0000001e,0x00050084,0x00000008,0x000000e5,0x000000e3,0x000000e4,0x00050080,0x00000008,
+    0x000000e6,0x000000e2,0x000000e5,0x00060041,0x000000c5,0x000000e7,0x000000cb,0x00000017,
+    0x000000e6,0x0004003d,0x000000b2,0x000000e8,0x000000e7,0x00050083,0x000000b2,0x000000e9,
+    0x000000e1,0x000000e8,0x00060041,0x000000c5,0x000000ea,0x000000b6,0x00000017,0x000000da,
+    0x0003003e,0x000000ea,0x000000e9,0x000200f9,0x000000b1,0x000200f8,0x000000af,0x0004003d,
+    0x00000008,0x000000ec,0x00000044,0x0004003d,0x00000008,0x000000ed,0x000000a1,0x0004003d,
+    0x00000008,0x000000ee,0x00000025,0x00050084,0x00000008,0x000000ef,0x000000ed,0x000000ee,
+    0x00050080,0x00000008,0x000000f0,0x000000ec,0x000000ef,0x0004003d,0x00000008,0x000000f1,
+    0x00000042,0x0004003d,0x00000008,0x000000f2,0x000000a1,0x0004003d,0x00000008,0x000000f3,
+    0x00000012,0x00050084,0x00000008,0x000000f4,0x000000f2,0x000000f3,0x00050080,0x00000008,
+    0x000000f5,0x000000f1,0x000000f4,0x00060041,0x000000c5,0x000000f6,0x000000bf,0x00000017,
+    0x000000f5,0x0004003d,0x000000b2,0x000000f7,0x000000f6,0x0004003d,0x00000008,0x000000f8,
+    0x00000043,0x0004003d,0x00000008,0x000000f9,0x000000a1,0x0004003d,0x00000008,0x000000fa,
+    0x0000001e,0x00050084,0x00000008,0x000000fb,0x000000f9,0x000000fa,0x00050080,0x00000008,
+    0x000000fc,0x000000f8,0x000000fb,0x00060041,0x000000c5,0x000000fd,0x000000cb,0x00000017,
+    0x000000fc,0x0004003d,0x000000b2,0x000000fe,0x000000fd,0x00050085,0x000000b2,0x000000ff,
+    0x000000f7,0x000000fe,0x00060041,0x000000c5,0x00000100,0x000000b6,0x00000017,0x000000f0,
+    0x0003003e,0x00000100,0x000000ff,0x000200f9,0x000000b1,0x000200f8,0x000000b0,0x0004003d,
+    0x00000008,0x00000102,0x00000044,0x0004003d,0x00000008,0x00000103,0x000000a1,0x0004003d,
+    0x00000008,0x00000104,0x00000025,0x00050084,0x00000008,0x00000105,0x00000103,0x00000104,
+    0x00050080,0x00000008,0x00000106,0x00000102,0x00000105,0x0004003d,0x00000008,0x00000107,
+    0x00000042,0x0004003d,0x00000008,0x00000108,0x000000a1,0x0004003d,0x00000008,0x00000109,
+    0x00000012,0x00050084,0x00000008,0x0000010a,0x00000108,0x00000109,0x00050080,0x00000008,
+    0x0000010b,0x00000107,0x0000010a,0x00060041,0x000000c5,0x0000010c,0x000000bf,0x00000017,
+    0x0000010b,0x0004003d,0x000000b2,0x0000010d,0x0000010c,0x0004003d,0x00000008,0x0000010e,
+    0x00000043,0x0004003d,0x00000008,0x0000010f,0x000000a1,0x0004003d,0x00000008,0x00000110,
+    0x0000001e,0x00050084,0x00000008,0x00000111,0x0000010f,0x00000110,0x00050080,0x00000008,
+    0x00000112,0x0000010e,0x00000111,0x00060041,0x000000c5,0x00000113,0x000000cb,0x00000017,
+    0x00000112,0x0004003d,0x000000b2,0x00000114,0x00000113,0x00050088,0x000000b2,0x00000115,
+    0x0000010d,0x00000114,0x00060041,0x000000c5,0x00000116,0x000000b6,0x00000017,0x00000106,
+    0x0003003e,0x00000116,0x00000115,0x000200f9,0x000000b1,0x000200f8,0x000000b1,0x000200f9,
+    0x000000a6,0x000200f8,0x000000a6,0x0004003d,0x00000008,0x0000011a,0x000000a1,0x00050080,
+    0x00000008,0x0000011b,0x0000011a,0x00000119,0x0003003e,0x000000a1,0x0000011b,0x000200f9,
+    0x000000a3,0x000200f8,0x000000a5,0x000100fd,0x00010038
+};
+
+}}} // namespace cv::dnn::vkcom
--- a/modules/dnn/src/vkcom/shader/spv_shader.cpp
+++ b/modules/dnn/src/vkcom/shader/spv_shader.cpp
@ -12,10 +12,11 @@ std::map<std::string, std::pair<const unsigned int *, size_t> > SPVMaps;
 void initSPVMaps()
 {
    SPVMaps.insert(std::make_pair("conv_1x1_fast_spv", std::make_pair(conv_1x1_fast_spv, 3134)));
-    SPVMaps.insert(std::make_pair("gemm_spv", std::make_pair(gemm_spv, 2902)));
+    SPVMaps.insert(std::make_pair("conv_depthwise_spv", std::make_pair(conv_depthwise_spv, 2092)));
    SPVMaps.insert(std::make_pair("conv_depthwise_3x3_spv", std::make_pair(conv_depthwise_3x3_spv, 1977)));
    SPVMaps.insert(std::make_pair("conv_implicit_gemm_spv", std::make_pair(conv_implicit_gemm_spv, 3565)));
-    SPVMaps.insert(std::make_pair("conv_depthwise_spv", std::make_pair(conv_depthwise_spv, 2092)));
+    SPVMaps.insert(std::make_pair("gemm_spv", std::make_pair(gemm_spv, 2902)));
+    SPVMaps.insert(std::make_pair("nary_eltwise_binary_forward_spv", std::make_pair(nary_eltwise_binary_forward_spv, 1757)));
 }

 }}} // namespace cv::dnn::vkcom
--- a/modules/dnn/src/vkcom/shader/spv_shader.hpp
+++ b/modules/dnn/src/vkcom/shader/spv_shader.hpp
@ -9,10 +9,11 @@
 namespace cv { namespace dnn { namespace vkcom {

 extern const unsigned int conv_1x1_fast_spv[3134];
-extern const unsigned int gemm_spv[2902];
+extern const unsigned int conv_depthwise_spv[2092];
 extern const unsigned int conv_depthwise_3x3_spv[1977];
 extern const unsigned int conv_implicit_gemm_spv[3565];
-extern const unsigned int conv_depthwise_spv[2092];
+extern const unsigned int gemm_spv[2902];
+extern const unsigned int nary_eltwise_binary_forward_spv[1757];

 extern std::map<std::string, std::pair<const unsigned int *, size_t> > SPVMaps;

--- a/modules/dnn/src/vkcom/src/op_naryEltwise.cpp
+++ b/modules/dnn/src/vkcom/src/op_naryEltwise.cpp
@ -0,0 +1,197 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../../precomp.hpp"
+#include "internal.hpp"
+#include "../include/op_naryeltwise.hpp"
+
+namespace cv { namespace dnn { namespace vkcom {
+
+#ifdef HAVE_VULKAN
+
+#define STEP_SIZE 65536
+
+#define MAX_GROUP_COUNT_X 65535
+#define MAX_GROUP_COUNT_Y 65535
+#define MAX_GROUP_COUNT_Z 65535
+
+OpNary::OpNary(const OpNary::OPERATION _naryOpType, int _ninputs, int _max_ndims,
+               const std::vector<std::vector<int>> shapes, const std::vector<std::vector<size_t>> steps)
+                : naryOpType(_naryOpType), ninputs(_ninputs), max_ndims(_max_ndims)
+{
+    CV_Assert(ninputs > 1);
+
+    shapesBuf.resize((ninputs + 1) * max_ndims);
+    stepsBuf.resize((ninputs + 1) * max_ndims);
+    for (int i = 0; i <= ninputs; i++)
+    {
+        std::copy(shapes[i].begin(), shapes[i].end(), shapesBuf.data() + i * max_ndims);
+        std::copy(steps[i].begin(), steps[i].end(), stepsBuf.data() + i * max_ndims);
+    }
+
+    // TODO(VK): support more types of operation
+    switch(naryOpType) {
+        // case OPERATION::EQUAL:
+        // case OPERATION::GREATER:
+        // case OPERATION::GREATER_EQUAL:
+        // case OPERATION::LESS:
+        // case OPERATION::LESS_EQUAL:
+        // case OPERATION::POW:
+        // case OPERATION::BITSHIFT:
+        // case OPERATION::MOD:
+        case OPERATION::PROD:
+        case OPERATION::SUB:
+        case OPERATION::ADD:
+        case OPERATION::DIV:
+        // case OPERATION::AND:
+        // case OPERATION::OR:
+        // case OPERATION::XOR:
+        {
+            CV_Assert(ninputs == 2);
+            CV_Assert(max_ndims >= 2);
+            shaderType = kNaryShaderTypeBinary;
+            shader_name = "nary_eltwise_binary_forward_spv";
+
+            // TODO(VK): confirm if this makes any sense
+            nplanes = std::accumulate(shapesBuf.data(), shapesBuf.data() + max_ndims - 2, 1, [](int32_t a, int32_t b) { return a * b; } );
+            N2 = shapesBuf.data()[max_ndims - 2];
+            N1 = shapesBuf.data()[max_ndims - 1];
+            CV_LOG_DEBUG(NULL, "max_ndims="<<max_ndims<<", nplanes="<<nplanes<<", N2="<<N2<<", N1="<<N1);
+            break;
+        }
+        case OPERATION::WHERE:
+        {
+            CV_Assert(ninputs == 3);
+            CV_Assert(max_ndims >= 2);
+            shaderType = kNaryShaderTypeTrinary;
+            shader_name = "nary_eltwise_trinary_forward_spv";
+            break;
+        }
+        // case OPERATION::MAX:
+        // case OPERATION::MEAN:
+        // case OPERATION::MIN:
+        case OPERATION::SUM:
+        {
+            CV_Assert(max_ndims >= 2);
+            shaderType = kNaryShaderTypeNary;
+            shader_name = "nary_eltwise_nary_forward_spv";
+            break;
+        }
+        //TODO(VK) add other cases
+        default:
+            CV_Error(Error::StsNotImplemented, "Unsupported nary operation type");
+    }
+    // TODO(VK): initialize OpNary class
+}
+
+void OpNary::firstForward()
+{
+    if (!firstForwardFinsh)
+    {
+        config.local_size_x = 1; // TODO(vk) determine local_size_y if necessary
+        config.local_size_y = 1; // TODO(vk) determine local_size_y if necessary
+        config.local_size_z = 1; // TODO(vk) determine local_size_z if necessary
+        computeGroupCount();
+        firstForwardFinsh = true;
+    }
+    else
+        return;
+}
+
+bool OpNary::binaryForward(std::vector<Tensor>& ins, std::vector<Tensor>& outs)
+{
+    std::vector<int32_t> param = {(int32_t)naryOpType, max_ndims};
+    std::vector<int32_t> paramSize = {(int32_t)param.size()};
+    std::vector<int32_t> dimSizes = {(ninputs + 1) * max_ndims};
+    std::vector<int32_t> actualSteps;
+
+    // TODO(VK): compute step for different dtype. Currently this is for kFormatFp32.
+    actualSteps.resize(stepsBuf.size());
+    std::transform(stepsBuf.data(), stepsBuf.data() + dimSizes[0], actualSteps.begin(), [](int32_t sz){ return sz / 4; });
+
+    Tensor paramTensor = Tensor(reinterpret_cast<const char *>(param.data()), paramSize, kFormatInt32, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);
+    Tensor shapeTensor = Tensor(reinterpret_cast<const char *>(shapesBuf.data()), dimSizes, kFormatInt32, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
+    Tensor stepTensor = Tensor(reinterpret_cast<const char *>(actualSteps.data()), dimSizes, kFormatInt32, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
+
+    destTypes = {
+            VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // input1
+            VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // input2
+            VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // out
+            VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, // param
+            VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // shape
+            VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // step
+    };
+
+
+    Ptr<Pipeline> pipeline = pipelineFactoryPtr->getPipeline(shader_name, destTypes);
+    Ptr<CommandBuffer> cmdBuffer = cmdPoolPtr->allocBuffer();
+    Ptr<Descriptor> desSet = pipeline->createSet();
+    VkCommandBuffer cmdBufferReal = cmdBuffer->get();
+
+    desSet->writeTensor(ins[0], 0);
+    desSet->writeTensor(ins[1], 1);
+    desSet->writeTensor(outs[0], 2);
+    desSet->writeTensor(paramTensor, 3);
+    desSet->writeTensor(shapeTensor, 4);
+    desSet->writeTensor(stepTensor, 5);
+
+    cmdBuffer->beginRecord();
+    pipeline->bind(cmdBufferReal, desSet->get());
+    vkCmdDispatch(cmdBufferReal, group_x_, group_y_, group_z_);
+    cmdBuffer->endRecord();
+    cmdPoolPtr->submitAndWait(cmdBufferReal);
+
+    return true;
+}
+
+bool OpNary::forward(std::vector<Tensor>& ins, std::vector<Tensor>& outs)
+{
+
+    firstForward();
+
+    // TODO(VK): Support more dtypes. Currently only kFormatFp32 is supported.
+    for (auto &tensor: ins)
+    {
+        CV_Assert(tensor.getFormat() == kFormatFp32);
+    }
+    for (auto &tensor: outs)
+    {
+        CV_Assert(tensor.getFormat() == kFormatFp32);
+    }
+
+    switch(shaderType) {
+        case kNaryShaderTypeBinary: {
+            return binaryForward(ins, outs);
+            break;
+        }
+        default:
+            CV_Error(Error::StsNotImplemented, "Unsupported shader type invoked.");
+    }
+
+    return true;
+}
+
+bool OpNary::computeGroupCount()
+{
+    if (shaderType == kNaryShaderTypeBinary)
+    {
+        group_x_ = nplanes; // parallelism at plane level
+        group_y_ = N2;
+        group_z_ = 1;
+    }
+    else
+    {
+        CV_Error(CV_StsNotImplemented, "shader type is not supported at compute GroupCount.");
+    }
+
+    CV_Assert(group_x_ <= MAX_GROUP_COUNT_X);
+    CV_Assert(group_y_ <= MAX_GROUP_COUNT_Y);
+    CV_Assert(group_z_ <= MAX_GROUP_COUNT_Z);
+
+    return true;
+}
+
+#endif // HAVE_VULKAN
+
+}}} // namespace cv::dnn::vkcom
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@ -1613,7 +1613,7 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
--- a/modules/gapi/CMakeLists.txt
+++ b/modules/gapi/CMakeLists.txt
@ -79,6 +79,7 @@ set(gapi_srcs
    src/api/gframe.cpp
    src/api/gkernel.cpp
    src/api/gbackend.cpp
+    src/api/gcommon.cpp
    src/api/gproto.cpp
    src/api/gnode.cpp
    src/api/gcall.cpp
@ -121,8 +122,10 @@ set(gapi_srcs
    src/executor/gabstractstreamingexecutor.cpp
    src/executor/gexecutor.cpp
    src/executor/gtbbexecutor.cpp
+    src/executor/gthreadedexecutor.cpp
    src/executor/gstreamingexecutor.cpp
    src/executor/gasync.cpp
+    src/executor/thread_pool.cpp

    # CPU Backend (currently built-in)
    src/backends/cpu/gcpubackend.cpp
--- a/modules/gapi/include/opencv2/gapi/gcommon.hpp
+++ b/modules/gapi/include/opencv2/gapi/gcommon.hpp
@ -263,12 +263,32 @@ struct graph_dump_path
 };
 /** @} */

+/**
+ * @brief Ask G-API to use threaded executor when cv::GComputation
+ * is compiled via cv::GComputation::compile method.
+ *
+ * Specifies a number of threads that should be used by executor.
+ */
+struct GAPI_EXPORTS use_threaded_executor
+{
+    use_threaded_executor();
+    explicit use_threaded_executor(const uint32_t nthreads);
+
+    uint32_t num_threads;
+};
+/** @} */
+
 namespace detail
 {
    template<> struct CompileArgTag<cv::graph_dump_path>
    {
        static const char* tag() { return "gapi.graph_dump_path"; }
    };
+
+    template<> struct CompileArgTag<cv::use_threaded_executor>
+    {
+        static const char* tag() { return "gapi.threaded_executor"; }
+    };
 }

 } // namespace cv
--- a/modules/gapi/src/api/gcommon.cpp
+++ b/modules/gapi/src/api/gcommon.cpp
@ -0,0 +1,18 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2024 Intel Corporation
+
+#include "precomp.hpp"
+
+#include <opencv2/gapi/gcommon.hpp>
+#include <opencv2/core/utility.hpp>
+
+cv::use_threaded_executor::use_threaded_executor()
+    : num_threads(cv::getNumThreads()) {
+}
+
+cv::use_threaded_executor::use_threaded_executor(const uint32_t nthreads)
+    : num_threads(nthreads) {
+}
--- a/modules/gapi/src/compiler/gcompiler.cpp
+++ b/modules/gapi/src/compiler/gcompiler.cpp
@ -33,6 +33,7 @@
 #include "compiler/passes/pattern_matching.hpp"

 #include "executor/gexecutor.hpp"
+#include "executor/gthreadedexecutor.hpp"
 #include "executor/gstreamingexecutor.hpp"
 #include "backends/common/gbackend.hpp"
 #include "backends/common/gmetabackend.hpp"
@ -452,8 +453,16 @@ cv::GCompiled cv::gimpl::GCompiler::produceCompiled(GPtr &&pg)
        .get<OutputMeta>().outMeta;
    // FIXME: select which executor will be actually used,
    // make GExecutor abstract.
-    std::unique_ptr<GExecutor> pE(new GExecutor(std::move(pg)));

+    auto use_threaded_exec = cv::gapi::getCompileArg<cv::use_threaded_executor>(m_args);
+    std::unique_ptr<GAbstractExecutor> pE;
+    if (use_threaded_exec) {
+        const auto num_threads = use_threaded_exec.value().num_threads;
+        GAPI_LOG_INFO(NULL, "Threaded executor with " << num_threads << " thread(s) will be used");
+        pE.reset(new GThreadedExecutor(num_threads, std::move(pg)));
+    } else {
+        pE.reset(new GExecutor(std::move(pg)));
+    }
    GCompiled compiled;
    compiled.priv().setup(m_metas, outMetas, std::move(pE));

--- a/modules/gapi/src/executor/gthreadedexecutor.cpp
+++ b/modules/gapi/src/executor/gthreadedexecutor.cpp
@ -0,0 +1,511 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2024 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include <ade/util/zip_range.hpp>
+
+#include <opencv2/gapi/opencv_includes.hpp>
+
+#include "api/gproto_priv.hpp" // ptr(GRunArgP)
+#include "executor/gthreadedexecutor.hpp"
+#include "compiler/passes/passes.hpp"
+
+namespace cv {
+namespace gimpl {
+namespace magazine {
+namespace {
+
+void bindInArgExec(Mag& mag, const RcDesc &rc, const GRunArg &arg) {
+    if (rc.shape != GShape::GMAT) {
+        bindInArg(mag, rc, arg);
+        return;
+    }
+    auto& mag_rmat = mag.template slot<cv::RMat>()[rc.id];
+    switch (arg.index()) {
+    case GRunArg::index_of<Mat>() :
+        mag_rmat = make_rmat<RMatOnMat>(util::get<Mat>(arg));
+        break;
+    case GRunArg::index_of<cv::RMat>() :
+        mag_rmat = util::get<cv::RMat>(arg);
+        break;
+    default: util::throw_error(std::logic_error("content type of the runtime argument does not match to resource description ?"));
+    }
+    // FIXME: has to take extra care about meta here for this particuluar
+    // case, just because this function exists at all
+    mag.meta<cv::RMat>()[rc.id] = arg.meta;
+}
+
+void bindOutArgExec(Mag& mag, const RcDesc &rc, const GRunArgP &arg) {
+    if (rc.shape != GShape::GMAT) {
+        bindOutArg(mag, rc, arg);
+        return;
+    }
+    auto& mag_rmat = mag.template slot<cv::RMat>()[rc.id];
+    switch (arg.index()) {
+    case GRunArgP::index_of<Mat*>() :
+        mag_rmat = make_rmat<RMatOnMat>(*util::get<Mat*>(arg)); break;
+    case GRunArgP::index_of<cv::RMat*>() :
+        mag_rmat = *util::get<cv::RMat*>(arg); break;
+    default: util::throw_error(std::logic_error("content type of the runtime argument does not match to resource description ?"));
+    }
+}
+
+cv::GRunArgP getObjPtrExec(Mag& mag, const RcDesc &rc) {
+    if (rc.shape != GShape::GMAT) {
+        return getObjPtr(mag, rc);
+    }
+    return GRunArgP(&mag.slot<cv::RMat>()[rc.id]);
+}
+
+void writeBackExec(const Mag& mag, const RcDesc &rc, GRunArgP &g_arg) {
+    if (rc.shape != GShape::GMAT) {
+        writeBack(mag, rc, g_arg);
+        return;
+    }
+
+    switch (g_arg.index()) {
+    case GRunArgP::index_of<cv::Mat*>() : {
+        // If there is a copy intrinsic at the end of the graph
+        // we need to actually copy the data to the user buffer
+        // since output runarg was optimized to simply point
+        // to the input of the copy kernel
+        // FIXME:
+        // Rework, find a better way to check if there should be
+        // a real copy (add a pass to StreamingBackend?)
+        // NB: In case RMat adapter not equal to "RMatOnMat" need to
+        // copy data back to the host as well.
+        auto& out_mat = *util::get<cv::Mat*>(g_arg);
+        const auto& rmat = mag.template slot<cv::RMat>().at(rc.id);
+        auto* adapter = rmat.get<RMatOnMat>();
+        if ((adapter != nullptr && out_mat.data != adapter->data()) ||
+            (adapter == nullptr)) {
+            auto view = rmat.access(RMat::Access::R);
+            asMat(view).copyTo(out_mat);
+        }
+        break;
+    }
+    case GRunArgP::index_of<cv::RMat*>() : /* do nothing */ break;
+    default: util::throw_error(std::logic_error("content type of the runtime argument does not match to resource description ?"));
+    }
+}
+
+void assignMetaStubExec(Mag& mag, const RcDesc &rc, const cv::GRunArg::Meta &meta) {
+    switch (rc.shape) {
+    case GShape::GARRAY:  mag.meta<cv::detail::VectorRef>()[rc.id] = meta; break;
+    case GShape::GOPAQUE: mag.meta<cv::detail::OpaqueRef>()[rc.id] = meta; break;
+    case GShape::GSCALAR: mag.meta<cv::Scalar>()[rc.id]            = meta; break;
+    case GShape::GFRAME:  mag.meta<cv::MediaFrame>()[rc.id]        = meta; break;
+    case GShape::GMAT:
+        mag.meta<cv::Mat>() [rc.id] = meta;
+        mag.meta<cv::RMat>()[rc.id] = meta;
+#if !defined(GAPI_STANDALONE)
+        mag.meta<cv::UMat>()[rc.id] = meta;
+#endif
+        break;
+    default: util::throw_error(std::logic_error("Unsupported GShape type")); break;
+    }
+}
+
+} // anonymous namespace
+}}} // namespace cv::gimpl::magazine
+
+cv::gimpl::StreamMsg cv::gimpl::GThreadedExecutor::Input::get() {
+    std::lock_guard<std::mutex> lock{m_state.m};
+    cv::GRunArgs res;
+    for (const auto &rc : desc()) { res.emplace_back(magazine::getArg(m_state.mag, rc)); }
+    return cv::gimpl::StreamMsg{std::move(res)};
+}
+
+cv::gimpl::GThreadedExecutor::Input::Input(cv::gimpl::GraphState &state,
+                                           const std::vector<RcDesc> &rcs)
+    : m_state(state) {
+    set(rcs);
+};
+
+cv::GRunArgP cv::gimpl::GThreadedExecutor::Output::get(int idx) {
+    std::lock_guard<std::mutex> lock{m_state.m};
+    auto r = magazine::getObjPtrExec(m_state.mag, desc()[idx]);
+    // Remember the output port for this output object
+    m_out_idx[cv::gimpl::proto::ptr(r)] = idx;
+    return r;
+}
+
+void cv::gimpl::GThreadedExecutor::Output::post(cv::GRunArgP&&, const std::exception_ptr& e) {
+    if (e) {
+        m_eptr = e;
+    }
+}
+
+void cv::gimpl::GThreadedExecutor::Output::post(Exception&& ex) {
+    m_eptr = std::move(ex.eptr);
+}
+
+void cv::gimpl::GThreadedExecutor::Output::meta(const GRunArgP &out, const GRunArg::Meta &m) {
+    const auto idx = m_out_idx.at(cv::gimpl::proto::ptr(out));
+    std::lock_guard<std::mutex> lock{m_state.m};
+    magazine::assignMetaStubExec(m_state.mag, desc()[idx], m);
+}
+
+cv::gimpl::GThreadedExecutor::Output::Output(cv::gimpl::GraphState &state,
+                                             const std::vector<RcDesc> &rcs)
+    : m_state(state) {
+    set(rcs);
+}
+
+void cv::gimpl::GThreadedExecutor::Output::verify() {
+    if (m_eptr) {
+        std::rethrow_exception(m_eptr);
+    }
+}
+
+void cv::gimpl::GThreadedExecutor::initResource(const ade::NodeHandle &nh, const ade::NodeHandle &orig_nh) {
+    const Data &d = m_gm.metadata(orig_nh).get<Data>();
+
+    if (   d.storage != Data::Storage::INTERNAL
+        && d.storage != Data::Storage::CONST_VAL) {
+        return;
+    }
+
+    // INTERNALS+CONST only! no need to allocate/reset output objects
+    // to as it is bound externally (e.g. already in the m_state.mag)
+
+    switch (d.shape) {
+    case GShape::GMAT: {
+            // Let island allocate it's outputs if it can,
+            // allocate cv::Mat and wrap it with RMat otherwise
+            GAPI_Assert(!nh->inNodes().empty());
+            const auto desc = util::get<cv::GMatDesc>(d.meta);
+            auto& exec = m_gim.metadata(nh->inNodes().front()).get<IslandExec>().object;
+            auto& rmat = m_state.mag.slot<cv::RMat>()[d.rc];
+            if (exec->allocatesOutputs()) {
+                rmat = exec->allocate(desc);
+            } else {
+                Mat mat;
+                createMat(desc, mat);
+                rmat = make_rmat<RMatOnMat>(mat);
+            }
+        }
+        break;
+
+    case GShape::GSCALAR:
+        if (d.storage == Data::Storage::CONST_VAL) {
+            auto rc = RcDesc{d.rc, d.shape, d.ctor};
+            magazine::bindInArg(m_state.mag, rc, m_gm.metadata(orig_nh).get<ConstValue>().arg);
+        }
+        break;
+
+    case GShape::GARRAY:
+        if (d.storage == Data::Storage::CONST_VAL) {
+            auto rc = RcDesc{d.rc, d.shape, d.ctor};
+            magazine::bindInArg(m_state.mag, rc, m_gm.metadata(orig_nh).get<ConstValue>().arg);
+        }
+        break;
+    case GShape::GOPAQUE:
+        // Constructed on Reset, do nothing here
+        break;
+    case GShape::GFRAME: {
+        // Should be defined by backend, do nothing here
+        break;
+    }
+    default:
+        GAPI_Error("InternalError");
+    }
+}
+
+cv::gimpl::IslandActor::IslandActor(const std::vector<RcDesc>          &in_objects,
+                                    const std::vector<RcDesc>          &out_objects,
+                                    std::shared_ptr<GIslandExecutable> isl_exec,
+                                    cv::gimpl::GraphState              &state)
+    : m_isl_exec(isl_exec),
+      m_inputs(state, in_objects),
+      m_outputs(state, out_objects) {
+}
+
+void cv::gimpl::IslandActor::run() {
+    m_isl_exec->run(m_inputs, m_outputs);
+}
+
+void cv::gimpl::IslandActor::verify() {
+    m_outputs.verify();
+};
+
+class cv::gimpl::Task {
+    friend class TaskManager;
+public:
+    using Ptr = std::shared_ptr<Task>;
+    Task(TaskManager::F&& f, std::vector<Task::Ptr> &&producers);
+
+    struct ExecutionState {
+        cv::gapi::own::ThreadPool& tp;
+        cv::gapi::own::Latch& latch;
+    };
+
+    void run(ExecutionState& state);
+    bool isLast() const { return m_consumers.empty();  }
+    void reset()        { m_ready_producers.store(0u); }
+
+private:
+    TaskManager::F           m_f;
+    const uint32_t           m_num_producers;
+    std::atomic<uint32_t>    m_ready_producers;
+    std::vector<Task*>       m_consumers;
+};
+
+cv::gimpl::Task::Task(TaskManager::F         &&f,
+                      std::vector<Task::Ptr> &&producers)
+    : m_f(std::move(f)),
+      m_num_producers(static_cast<uint32_t>(producers.size())) {
+    for (auto producer : producers) {
+        producer->m_consumers.push_back(this);
+    }
+}
+
+void cv::gimpl::Task::run(ExecutionState& state) {
+    // Execute the task
+    m_f();
+    // Notify every consumer about completion one of its dependencies
+    for (auto* consumer : m_consumers) {
+        const auto num_ready =
+            consumer->m_ready_producers.fetch_add(1, std::memory_order_relaxed) + 1;
+        // The last completed producer schedule the consumer for execution
+        if (num_ready == consumer->m_num_producers) {
+            state.tp.schedule([&state, consumer](){
+                consumer->run(state);
+            });
+        }
+    }
+    // If tasks has no consumers this is the last task
+    // Execution lasts until all last tasks are completed
+    // Decrement the latch to notify about completion
+    if (isLast()) {
+        state.latch.count_down();
+    }
+}
+
+std::shared_ptr<cv::gimpl::Task>
+cv::gimpl::TaskManager::createTask(cv::gimpl::TaskManager::F &&f,
+                                   std::vector<std::shared_ptr<cv::gimpl::Task>> &&producers) {
+    const bool is_initial = producers.empty();
+    auto task = std::make_shared<cv::gimpl::Task>(std::move(f),
+                                                  std::move(producers));
+    m_all_tasks.emplace_back(task);
+    if (is_initial) {
+        m_initial_tasks.emplace_back(task);
+    }
+    return task;
+}
+
+void cv::gimpl::TaskManager::scheduleAndWait(cv::gapi::own::ThreadPool& tp) {
+    // Reset the number of ready dependencies for all tasks
+    for (auto& task : m_all_tasks) { task->reset(); }
+
+    // Count the number of last tasks
+    auto isLast = [](const std::shared_ptr<Task>& task) { return task->isLast(); };
+    const auto kNumLastsTasks =
+        std::count_if(m_all_tasks.begin(), m_all_tasks.end(), isLast);
+
+    // Initialize the latch, schedule initial tasks
+    // and wait until all lasts tasks are done
+    cv::gapi::own::Latch latch(kNumLastsTasks);
+    Task::ExecutionState state{tp, latch};
+    for (auto task : m_initial_tasks) {
+        state.tp.schedule([&state, task](){ task->run(state); });
+    }
+    latch.wait();
+}
+
+cv::gimpl::GThreadedExecutor::GThreadedExecutor(const uint32_t num_threads,
+                                                std::unique_ptr<ade::Graph> &&g_model)
+    : GAbstractExecutor(std::move(g_model)),
+      m_thread_pool(num_threads) {
+    auto sorted = m_gim.metadata().get<ade::passes::TopologicalSortData>();
+
+    std::unordered_map< ade::NodeHandle
+                       , std::shared_ptr<Task>
+                       , ade::HandleHasher<ade::Node>> m_tasks_map;
+    for (auto nh : sorted.nodes())
+    {
+        switch (m_gim.metadata(nh).get<NodeKind>().k)
+        {
+        case NodeKind::ISLAND:
+            {
+                std::vector<RcDesc> input_rcs;
+                std::vector<RcDesc> output_rcs;
+                input_rcs.reserve(nh->inNodes().size());
+                output_rcs.reserve(nh->outNodes().size());
+
+                auto xtract = [&](ade::NodeHandle slot_nh, std::vector<RcDesc> &vec) {
+                    const auto orig_data_nh
+                        = m_gim.metadata(slot_nh).get<DataSlot>().original_data_node;
+                    const auto &orig_data_info
+                        = m_gm.metadata(orig_data_nh).get<Data>();
+                    vec.emplace_back(RcDesc{ orig_data_info.rc
+                                           , orig_data_info.shape
+                                           , orig_data_info.ctor});
+                };
+                for (auto in_slot_nh  : nh->inNodes())  xtract(in_slot_nh,  input_rcs);
+                for (auto out_slot_nh : nh->outNodes()) xtract(out_slot_nh, output_rcs);
+
+                auto actor = std::make_shared<IslandActor>(std::move(input_rcs),
+                                                           std::move(output_rcs),
+                                                           m_gim.metadata(nh).get<IslandExec>().object,
+                                                           m_state);
+                m_actors.push_back(actor);
+
+                std::unordered_set<ade::NodeHandle, ade::HandleHasher<ade::Node>> producer_nhs;
+                for (auto slot_nh : nh->inNodes()) {
+                    for (auto island_nh : slot_nh->inNodes()) {
+                        GAPI_Assert(m_gim.metadata(island_nh).get<NodeKind>().k == NodeKind::ISLAND);
+                        producer_nhs.emplace(island_nh);
+                    }
+                }
+                std::vector<std::shared_ptr<Task>> producers;
+                producers.reserve(producer_nhs.size());
+                for (auto producer_nh : producer_nhs) {
+                    producers.push_back(m_tasks_map.at(producer_nh));
+                }
+                auto task = m_task_manager.createTask(
+                        [actor](){actor->run();}, std::move(producers));
+                m_tasks_map.emplace(nh, task);
+            }
+            break;
+
+        case NodeKind::SLOT:
+            {
+                const auto orig_data_nh
+                    = m_gim.metadata(nh).get<DataSlot>().original_data_node;
+                initResource(nh, orig_data_nh);
+                m_slots.emplace_back(DataDesc{nh, orig_data_nh});
+            }
+            break;
+
+        default:
+            GAPI_Error("InternalError");
+            break;
+        } // switch(kind)
+    } // for(gim nodes)
+
+    prepareForNewStream();
+}
+
+void cv::gimpl::GThreadedExecutor::run(cv::gimpl::GRuntimeArgs &&args) {
+    const auto proto = m_gm.metadata().get<Protocol>();
+
+    // Basic check if input/output arguments are correct
+    // FIXME: Move to GCompiled (do once for all GExecutors)
+    if (proto.inputs.size() != args.inObjs.size()) { // TODO: Also check types
+        util::throw_error(std::logic_error
+                          ("Computation's input protocol doesn\'t "
+                           "match actual arguments!"));
+    }
+    if (proto.outputs.size() != args.outObjs.size()) { // TODO: Also check types
+        util::throw_error(std::logic_error
+                          ("Computation's output protocol doesn\'t "
+                           "match actual arguments!"));
+    }
+
+    namespace util = ade::util;
+
+    // ensure that output Mat parameters are correctly allocated
+    // FIXME: avoid copy of NodeHandle and GRunRsltComp ?
+    for (auto index : util::iota(proto.out_nhs.size())) {
+        auto& nh = proto.out_nhs.at(index);
+        const Data &d = m_gm.metadata(nh).get<Data>();
+        if (d.shape == GShape::GMAT) {
+            using cv::util::get;
+            const auto desc = get<cv::GMatDesc>(d.meta);
+
+            auto check_rmat = [&desc, &args, &index]() {
+                auto& out_mat = *get<cv::RMat*>(args.outObjs.at(index));
+                GAPI_Assert(desc.canDescribe(out_mat));
+            };
+
+#if !defined(GAPI_STANDALONE)
+            // Building as part of OpenCV - follow OpenCV behavior In
+            // the case of cv::Mat if output buffer is not enough to
+            // hold the result, reallocate it
+            if (cv::util::holds_alternative<cv::Mat*>(args.outObjs.at(index))) {
+                auto& out_mat = *get<cv::Mat*>(args.outObjs.at(index));
+                createMat(desc, out_mat);
+            }
+            // In the case of RMat check to fit required meta
+            else {
+                check_rmat();
+            }
+#else
+            // Building standalone - output buffer should always exist,
+            // and _exact_ match our inferred metadata
+            if (cv::util::holds_alternative<cv::Mat*>(args.outObjs.at(index))) {
+                auto& out_mat = *get<cv::Mat*>(args.outObjs.at(index));
+                GAPI_Assert(out_mat.data != nullptr &&
+                        desc.canDescribe(out_mat));
+            }
+            // In the case of RMat check to fit required meta
+            else {
+                check_rmat();
+            }
+#endif // !defined(GAPI_STANDALONE)
+        }
+    }
+    // Update storage with user-passed objects
+    for (auto it : ade::util::zip(ade::util::toRange(proto.inputs),
+                                  ade::util::toRange(args.inObjs))) {
+        magazine::bindInArgExec(m_state.mag, std::get<0>(it), std::get<1>(it));
+    }
+    for (auto it : ade::util::zip(ade::util::toRange(proto.outputs),
+                                  ade::util::toRange(args.outObjs))) {
+        magazine::bindOutArgExec(m_state.mag, std::get<0>(it), std::get<1>(it));
+    }
+
+    // Reset internal data
+    for (auto &sd : m_slots) {
+        const auto& data = m_gm.metadata(sd.data_nh).get<Data>();
+        magazine::resetInternalData(m_state.mag, data);
+    }
+
+    m_task_manager.scheduleAndWait(m_thread_pool);
+    for (auto actor : m_actors) {
+        actor->verify();
+    }
+    for (auto it : ade::util::zip(ade::util::toRange(proto.outputs),
+                                  ade::util::toRange(args.outObjs))) {
+        magazine::writeBackExec(m_state.mag, std::get<0>(it), std::get<1>(it));
+    }
+}
+
+bool cv::gimpl::GThreadedExecutor::canReshape() const {
+    for (auto actor : m_actors) {
+        if (actor->exec()->canReshape()) {
+            return false;
+        }
+    }
+    return true;
+}
+
+void cv::gimpl::GThreadedExecutor::reshape(const GMetaArgs& inMetas, const GCompileArgs& args) {
+    GAPI_Assert(canReshape());
+    auto& g = *m_orig_graph.get();
+    ade::passes::PassContext ctx{g};
+    passes::initMeta(ctx, inMetas);
+    passes::inferMeta(ctx, true);
+
+    // NB: Before reshape islands need to re-init resources for every slot.
+    for (auto slot : m_slots) {
+        initResource(slot.slot_nh, slot.data_nh);
+    }
+
+    for (auto actor : m_actors) {
+        actor->exec()->reshape(g, args);
+    }
+}
+
+void cv::gimpl::GThreadedExecutor::prepareForNewStream() {
+    for (auto actor : m_actors) {
+        actor->exec()->handleNewStream();
+    }
+}
--- a/modules/gapi/src/executor/gthreadedexecutor.hpp
+++ b/modules/gapi/src/executor/gthreadedexecutor.hpp
@ -0,0 +1,123 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2024 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GTHREADEDEXECUTOR_HPP
+#define OPENCV_GAPI_GTHREADEDEXECUTOR_HPP
+
+#include <utility> // tuple, required by magazine
+#include <unordered_map> // required by magazine
+
+#include "executor/gabstractexecutor.hpp"
+#include "executor/thread_pool.hpp"
+
+namespace cv {
+namespace gimpl {
+
+class Task;
+class TaskManager {
+public:
+    using F = std::function<void()>;
+
+    std::shared_ptr<Task> createTask(F &&f, std::vector<std::shared_ptr<Task>> &&producers);
+    void scheduleAndWait(cv::gapi::own::ThreadPool& tp);
+
+private:
+    std::vector<std::shared_ptr<Task>> m_all_tasks;
+    std::vector<std::shared_ptr<Task>> m_initial_tasks;
+};
+
+struct GraphState {
+    Mag mag;
+    std::mutex m;
+};
+
+class IslandActor;
+class GThreadedExecutor final: public GAbstractExecutor {
+public:
+    class Input;
+    class Output;
+
+    explicit GThreadedExecutor(const uint32_t num_threads,
+                               std::unique_ptr<ade::Graph> &&g_model);
+    void run(cv::gimpl::GRuntimeArgs &&args) override;
+
+    bool canReshape() const override;
+    void reshape(const GMetaArgs& inMetas, const GCompileArgs& args) override;
+
+    void prepareForNewStream() override;
+
+private:
+    struct DataDesc
+    {
+        ade::NodeHandle slot_nh;
+        ade::NodeHandle data_nh;
+    };
+
+    void initResource(const ade::NodeHandle &nh, const ade::NodeHandle &orig_nh);
+
+    GraphState                                m_state;
+    std::vector<DataDesc>                     m_slots;
+    cv::gapi::own::ThreadPool                 m_thread_pool;
+    TaskManager                               m_task_manager;
+    std::vector<std::shared_ptr<IslandActor>> m_actors;
+};
+
+class GThreadedExecutor::Input final: public GIslandExecutable::IInput
+{
+public:
+    Input(GraphState& state, const std::vector<RcDesc> &rcs);
+
+private:
+    virtual StreamMsg get() override;
+    virtual StreamMsg try_get() override { return get(); }
+
+private:
+    GraphState& m_state;
+};
+
+class GThreadedExecutor::Output final: public GIslandExecutable::IOutput
+{
+public:
+    Output(GraphState &state, const std::vector<RcDesc> &rcs);
+    void verify();
+
+private:
+    GRunArgP get(int idx) override;
+    void post(cv::GRunArgP&&, const std::exception_ptr& e) override;
+    void post(Exception&& ex) override;
+    void post(EndOfStream&&) override {};
+    void meta(const GRunArgP &out, const GRunArg::Meta &m) override;
+
+private:
+    GraphState& m_state;
+    std::unordered_map<const void*, int> m_out_idx;
+    std::exception_ptr m_eptr;
+};
+
+class IslandActor {
+public:
+    using Ptr = std::shared_ptr<IslandActor>;
+    IslandActor(const std::vector<RcDesc>          &in_objects,
+                const std::vector<RcDesc>          &out_objects,
+                std::shared_ptr<GIslandExecutable> isl_exec,
+                GraphState                         &state);
+
+    void run();
+    void verify();
+    std::shared_ptr<GIslandExecutable> exec() { return m_isl_exec; }
+
+private:
+    std::shared_ptr<GIslandExecutable> m_isl_exec;
+    GThreadedExecutor::Input           m_inputs;
+    GThreadedExecutor::Output          m_outputs;
+};
+
+
+} // namespace gimpl
+} // namespace cv
+
+#endif // OPENCV_GAPI_GTHREADEDEXECUTOR_HPP
--- a/modules/gapi/src/executor/thread_pool.cpp
+++ b/modules/gapi/src/executor/thread_pool.cpp
@ -0,0 +1,67 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2024 Intel Corporation
+
+
+#include "thread_pool.hpp"
+
+#include <opencv2/gapi/util/throw.hpp>
+
+cv::gapi::own::Latch::Latch(const uint64_t expected)
+    : m_expected(expected) {
+}
+
+void cv::gapi::own::Latch::count_down() {
+    std::lock_guard<std::mutex> lk{m_mutex};
+    --m_expected;
+    if (m_expected == 0) {
+        m_all_done.notify_all();
+    }
+}
+
+void cv::gapi::own::Latch::wait() {
+    std::unique_lock<std::mutex> lk{m_mutex};
+    while (m_expected != 0u) {
+        m_all_done.wait(lk);
+    }
+}
+
+cv::gapi::own::ThreadPool::ThreadPool(const uint32_t num_workers) {
+    m_workers.reserve(num_workers);
+    for (uint32_t i = 0; i < num_workers; ++i) {
+        m_workers.emplace_back(
+                cv::gapi::own::ThreadPool::worker, std::ref(m_queue));
+    }
+}
+
+void cv::gapi::own::ThreadPool::worker(QueueClass<Task>& queue) {
+    while (true) {
+        cv::gapi::own::ThreadPool::Task task;
+        queue.pop(task);
+        if (!task) {
+            break;
+        }
+        task();
+    }
+}
+
+void cv::gapi::own::ThreadPool::schedule(cv::gapi::own::ThreadPool::Task&& task) {
+    m_queue.push(std::move(task));
+};
+
+void cv::gapi::own::ThreadPool::shutdown() {
+    for (size_t i = 0; i < m_workers.size(); ++i) {
+        // NB: Empty task - is an indicator for workers to stop their loops
+        m_queue.push({});
+    }
+    for (auto& worker : m_workers) {
+        worker.join();
+    }
+    m_workers.clear();
+}
+
+cv::gapi::own::ThreadPool::~ThreadPool() {
+    shutdown();
+}
--- a/modules/gapi/src/executor/thread_pool.hpp
+++ b/modules/gapi/src/executor/thread_pool.hpp
@ -0,0 +1,71 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2024 Intel Corporation
+
+#ifndef OPENCV_GAPI_THREAD_POOL_HPP
+#define OPENCV_GAPI_THREAD_POOL_HPP
+
+#include <functional>
+#include <vector>
+#include <thread>
+#include <mutex>
+#include <atomic>
+#include <condition_variable>
+
+#include <opencv2/gapi/own/exports.hpp> // GAPI_EXPORTS
+
+#if defined(HAVE_TBB)
+#  include <tbb/concurrent_queue.h> // FIXME: drop it from here!
+template<typename T> using QueueClass = tbb::concurrent_bounded_queue<T>;
+#else
+#  include "executor/conc_queue.hpp"
+template<typename T> using QueueClass = cv::gapi::own::concurrent_bounded_queue<T>;
+#endif // TBB
+
+namespace cv {
+namespace gapi {
+namespace own {
+
+// NB: Only for tests
+class GAPI_EXPORTS Latch {
+public:
+    explicit Latch(const uint64_t expected);
+
+    Latch(const Latch&) = delete;
+    Latch& operator=(const Latch&) = delete;
+
+    void count_down();
+    void wait();
+
+private:
+    uint64_t                m_expected;
+    std::mutex              m_mutex;
+    std::condition_variable m_all_done;
+};
+
+// NB: Only for tests
+class GAPI_EXPORTS ThreadPool {
+public:
+    using Task = std::function<void()>;
+    explicit ThreadPool(const uint32_t num_workers);
+
+    ThreadPool(const ThreadPool&) = delete;
+    ThreadPool& operator=(const ThreadPool&) = delete;
+
+    void schedule(Task&& task);
+    ~ThreadPool();
+
+private:
+    static void worker(QueueClass<Task>& queue);
+    void shutdown();
+
+private:
+    std::vector<std::thread> m_workers;
+    QueueClass<Task>         m_queue;
+};
+
+}}} // namespace cv::gapi::own
+
+#endif // OPENCV_GAPI_THREAD_POOL_HPP
--- a/modules/gapi/test/gapi_sample_pipelines.cpp
+++ b/modules/gapi/test/gapi_sample_pipelines.cpp
@ -13,6 +13,8 @@

 #include <opencv2/gapi/core.hpp>

+#include "executor/thread_pool.hpp"
+
 namespace opencv_test
 {

@ -67,6 +69,38 @@ namespace
        }
    };

+    G_TYPED_KERNEL(GBusyWait, <GMat(GMat, uint32_t)>, "org.busy_wait") {
+        static GMatDesc outMeta(GMatDesc in, uint32_t)
+        {
+            return in;
+        }
+    };
+
+    GAPI_OCV_KERNEL(GOCVBusyWait, GBusyWait)
+    {
+        static void run(const cv::Mat& in,
+                        const uint32_t time_in_ms,
+                        cv::Mat&       out)
+        {
+            using namespace std::chrono;
+            auto s = high_resolution_clock::now();
+            in.copyTo(out);
+            auto e = high_resolution_clock::now();
+
+            const auto elapsed_in_ms =
+                static_cast<int32_t>(duration_cast<milliseconds>(e-s).count());
+
+            int32_t diff = time_in_ms - elapsed_in_ms;
+            const auto need_to_wait_in_ms = static_cast<uint32_t>(std::max(0, diff));
+
+            s = high_resolution_clock::now();
+            e = s;
+            while (duration_cast<milliseconds>(e-s).count() < need_to_wait_in_ms) {
+                e = high_resolution_clock::now();
+            }
+        }
+    };
+
    // These definitions test the correct macro work if the kernel has multiple output values
    G_TYPED_KERNEL(GRetGArrayTupleOfGMat2Kernel,  <GArray<std::tuple<GMat, GMat>>(GMat, Scalar)>,                                         "org.opencv.test.retarrayoftupleofgmat2kernel")  {};
    G_TYPED_KERNEL(GRetGArraTupleyOfGMat3Kernel,  <GArray<std::tuple<GMat, GMat, GMat>>(GMat)>,                                           "org.opencv.test.retarrayoftupleofgmat3kernel")  {};
@ -513,4 +547,29 @@ TEST(DISABLED_GAPI_Pipeline, 1DMatWithinSingleIsland)
    EXPECT_EQ(0, cv::norm(out_mat, ref_mat));
 }

+TEST(GAPI_Pipeline, BranchesExecutedInParallel)
+{
+    cv::GMat in;
+    // NB: cv::gapi::copy used to prevent fusing OCV backend operations
+    // into the single island where they will be executed in turn
+    auto out0 = GBusyWait::on(cv::gapi::copy(in), 1000u /*1sec*/);
+    auto out1 = GBusyWait::on(cv::gapi::copy(in), 1000u /*1sec*/);
+    auto out2 = GBusyWait::on(cv::gapi::copy(in), 1000u /*1sec*/);
+    auto out3 = GBusyWait::on(cv::gapi::copy(in), 1000u /*1sec*/);
+
+    cv::GComputation comp(cv::GIn(in), cv::GOut(out0,out1,out2,out3));
+    cv::Mat in_mat = cv::Mat::eye(32, 32, CV_8UC1);
+    cv::Mat out_mat0, out_mat1, out_mat2, out_mat3;
+
+    using namespace std::chrono;
+    auto s = high_resolution_clock::now();
+    comp.apply(cv::gin(in_mat), cv::gout(out_mat0, out_mat1, out_mat2, out_mat3),
+               cv::compile_args(cv::use_threaded_executor(4u),
+                                cv::gapi::kernels<GOCVBusyWait>()));
+    auto e = high_resolution_clock::now();
+    const auto elapsed_in_ms = duration_cast<milliseconds>(e-s).count();;
+
+    EXPECT_GE(1200u, elapsed_in_ms);
+}
+
 } // namespace opencv_test
--- a/modules/gapi/test/own/thread_pool_tests.cpp
+++ b/modules/gapi/test/own/thread_pool_tests.cpp
@ -0,0 +1,124 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2024 Intel Corporation
+
+#include "../test_precomp.hpp"
+
+#include <chrono>
+#include <thread>
+
+#include "executor/thread_pool.hpp"
+
+namespace opencv_test
+{
+
+using namespace cv::gapi;
+
+TEST(ThreadPool, ScheduleNotBlock)
+{
+    own::Latch latch(1u);
+    std::atomic<uint32_t> counter{0u};
+
+    own::ThreadPool tp(4u);
+    tp.schedule([&](){
+        std::this_thread::sleep_for(std::chrono::milliseconds{500u});
+        counter++;
+        latch.count_down();
+    });
+
+    EXPECT_EQ(0u, counter);
+    latch.wait();
+    EXPECT_EQ(1u, counter);
+}
+
+TEST(ThreadPool, MultipleTasks)
+{
+    const uint32_t kNumTasks = 100u;
+    own::Latch latch(kNumTasks);
+    std::atomic<uint32_t> completed{0u};
+
+    own::ThreadPool tp(4u);
+    for (uint32_t i = 0; i < kNumTasks; ++i) {
+        tp.schedule([&]() {
+            ++completed;
+            latch.count_down();
+        });
+    }
+    latch.wait();
+
+    EXPECT_EQ(kNumTasks, completed.load());
+}
+
+struct ExecutionState {
+    ExecutionState(const uint32_t num_threads,
+                   const uint32_t num_tasks)
+        : guard(0u),
+          critical(0u),
+          limit(num_tasks),
+          latch(num_threads),
+          tp(num_threads) {
+    }
+
+    std::atomic<uint32_t> guard;
+    std::atomic<uint32_t> critical;
+    const uint32_t        limit;
+    own::Latch            latch;
+    own::ThreadPool       tp;
+};
+
+static void doRecursive(ExecutionState& state) {
+    // NB: Protects function to be executed no more than limit number of times
+    if (state.guard.fetch_add(1u) >= state.limit) {
+        state.latch.count_down();
+        return;
+    }
+    // NB: This simulates critical section
+    std::this_thread::sleep_for(std::chrono::milliseconds{50});
+    ++state.critical;
+    // NB: Schedule the new one recursively
+    state.tp.schedule([&](){ doRecursive(state); });
+}
+
+TEST(ThreadPool, ScheduleRecursively)
+{
+    const int kNumThreads = 5u;
+    const uint32_t kNumTasks = 100u;
+
+    ExecutionState state(kNumThreads, kNumTasks);
+    for (uint32_t i = 0; i < kNumThreads; ++i) {
+        state.tp.schedule([&](){
+            doRecursive(state);
+        });
+    }
+    state.latch.wait();
+
+    EXPECT_EQ(kNumTasks, state.critical.load());
+}
+
+TEST(ThreadPool, ExecutionIsParallel)
+{
+    const uint32_t kNumThreads = 4u;
+    std::atomic<uint32_t> counter{0};
+    own::Latch latch{kNumThreads};
+
+    own::ThreadPool tp(kNumThreads);
+    auto start = std::chrono::high_resolution_clock::now();
+    for (uint32_t i = 0; i < kNumThreads; ++i) {
+      tp.schedule([&]() {
+        std::this_thread::sleep_for(std::chrono::milliseconds{800u});
+        ++counter;
+        latch.count_down();
+      });
+    }
+    latch.wait();
+
+    auto end = std::chrono::high_resolution_clock::now();
+    auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+
+    EXPECT_GE(1000u, elapsed);
+    EXPECT_EQ(kNumThreads, counter.load());
+}
+
+} // namespace opencv_test
--- a/modules/imgcodecs/include/opencv2/imgcodecs.hpp
+++ b/modules/imgcodecs/include/opencv2/imgcodecs.hpp
@ -322,7 +322,8 @@ CV_EXPORTS_W Mat imdecode( InputArray buf, int flags );
@param buf Input array or vector of bytes.
@param flags The same flags as in cv::imread, see cv::ImreadModes.
@param dst The optional output placeholder for the decoded matrix. It can save the image
-reallocations when the function is called repeatedly for images of the same size.
+reallocations when the function is called repeatedly for images of the same size. In case of decoder
+failure the function returns empty cv::Mat object, but does not release user-provided dst buffer.
 */
 CV_EXPORTS Mat imdecode( InputArray buf, int flags, Mat* dst);

--- a/modules/imgcodecs/src/loadsave.cpp
+++ b/modules/imgcodecs/src/loadsave.cpp
@ -782,7 +782,7 @@ imdecode_( const Mat& buf, int flags, Mat& mat )

    ImageDecoder decoder = findDecoder(buf_row);
    if( !decoder )
-        return 0;
+        return false;

    int scale_denom = 1;
    if( flags > IMREAD_LOAD_GDAL )
@ -803,7 +803,7 @@ imdecode_( const Mat& buf, int flags, Mat& mat )
        filename = tempfile();
        FILE* f = fopen( filename.c_str(), "wb" );
        if( !f )
-            return 0;
+            return false;
        size_t bufSize = buf_row.total()*buf.elemSize();
        if (fwrite(buf_row.ptr(), 1, bufSize, f) != bufSize)
        {
@ -841,7 +841,7 @@ imdecode_( const Mat& buf, int flags, Mat& mat )
                CV_LOG_WARNING(NULL, "unable to remove temporary file:" << filename);
            }
        }
-        return 0;
+        return false;
    }

    // established the required input image size
@ -887,7 +887,6 @@ imdecode_( const Mat& buf, int flags, Mat& mat )

    if (!success)
    {
-        mat.release();
        return false;
    }

@ -911,7 +910,8 @@ Mat imdecode( InputArray _buf, int flags )
    CV_TRACE_FUNCTION();

    Mat buf = _buf.getMat(), img;
-    imdecode_( buf, flags, img );
+    if (!imdecode_(buf, flags, img))
+        img.release();

    return img;
 }
@ -922,9 +922,10 @@ Mat imdecode( InputArray _buf, int flags, Mat* dst )

    Mat buf = _buf.getMat(), img;
    dst = dst ? dst : &img;
-    imdecode_( buf, flags, *dst );
-
-    return *dst;
+    if (imdecode_(buf, flags, *dst))
+        return *dst;
+    else
+        return cv::Mat();
 }

 static bool
--- a/modules/imgcodecs/test/test_grfmt.cpp
+++ b/modules/imgcodecs/test/test_grfmt.cpp
@ -482,6 +482,19 @@ TEST(Imgcodecs, write_parameter_type)
    EXPECT_EQ(0, remove(tmp_file.c_str()));
 }

+TEST(Imgcodecs, imdecode_user_buffer)
+{
+    cv::Mat encoded = cv::Mat::zeros(1, 1024, CV_8UC1);
+    cv::Mat user_buffer(1, 1024, CV_8UC1);
+    cv::Mat result = cv::imdecode(encoded, IMREAD_ANYCOLOR, &user_buffer);
+    EXPECT_TRUE(result.empty());
+    // the function does not release user-provided buffer
+    EXPECT_FALSE(user_buffer.empty());
+
+    result = cv::imdecode(encoded, IMREAD_ANYCOLOR);
+    EXPECT_TRUE(result.empty());
+}
+
 }} // namespace

 #if defined(HAVE_OPENEXR) && defined(OPENCV_IMGCODECS_ENABLE_OPENEXR_TESTS)
--- a/modules/java/android_sdk/CMakeLists.txt
+++ b/modules/java/android_sdk/CMakeLists.txt
@ -153,7 +153,6 @@ set(depends ${the_module}_android_source_copy "${OPENCV_DEPHELPER}/${the_module}

 # build jar
 set(AAR_FILE "${OPENCV_JAVA_DIR}/build/outputs/aar/opencv-release.aar")
-ocv_update(OPENCV_GRADLE_VERBOSE_OPTIONS "-i")
 add_custom_command(
    OUTPUT "${AAR_FILE}" "${OPENCV_DEPHELPER}/${the_module}_android"
    COMMAND ./gradlew ${OPENCV_GRADLE_VERBOSE_OPTIONS} "opencv:assemble"
--- a/modules/java/test/android_test/.classpath
+++ b/modules/java/test/android_test/.classpath
@ -1,9 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<classpath>
-	<classpathentry kind="src" path="src"/>
-	<classpathentry kind="src" path="gen"/>
-	<classpathentry kind="con" path="com.android.ide.eclipse.adt.ANDROID_FRAMEWORK"/>
-	<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
-	<classpathentry kind="con" path="com.android.ide.eclipse.adt.LIBRARIES"/>
-	<classpathentry kind="output" path="bin/classes"/>
-</classpath>
--- a/modules/java/test/android_test/.project
+++ b/modules/java/test/android_test/.project
@ -1,33 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<projectDescription>
-	<name>OpenCV_JavaAPI_Tests</name>
-	<comment></comment>
-	<projects>
-	</projects>
-	<buildSpec>
-		<buildCommand>
-			<name>com.android.ide.eclipse.adt.ResourceManagerBuilder</name>
-			<arguments>
-			</arguments>
-		</buildCommand>
-		<buildCommand>
-			<name>com.android.ide.eclipse.adt.PreCompilerBuilder</name>
-			<arguments>
-			</arguments>
-		</buildCommand>
-		<buildCommand>
-			<name>org.eclipse.jdt.core.javabuilder</name>
-			<arguments>
-			</arguments>
-		</buildCommand>
-		<buildCommand>
-			<name>com.android.ide.eclipse.adt.ApkBuilder</name>
-			<arguments>
-			</arguments>
-		</buildCommand>
-	</buildSpec>
-	<natures>
-		<nature>com.android.ide.eclipse.adt.AndroidNature</nature>
-		<nature>org.eclipse.jdt.core.javanature</nature>
-	</natures>
-</projectDescription>
--- a/Show More
+++ b/Show More