Mirror of https://github.com/opencv/opencv.git (synced 2025-01-19 06:53:50 +08:00)

Commit 3a55f50133: Merge branch 4.x
@@ -65,6 +65,10 @@ if(POLICY CMP0068)
  cmake_policy(SET CMP0068 NEW) # CMake 3.9+: `RPATH` settings on macOS do not affect `install_name`.
endif()

if(POLICY CMP0071)
  cmake_policy(SET CMP0071 NEW) # CMake 3.10+: Let `AUTOMOC` and `AUTOUIC` process `GENERATED` files.
endif()

if(POLICY CMP0075)
  cmake_policy(SET CMP0075 NEW) # CMake 3.12+: Include file check macros honor `CMAKE_REQUIRED_LIBRARIES`
endif()
@@ -1870,7 +1874,7 @@ if(BUILD_JAVA)
    status("  JNI:" JNI_INCLUDE_DIRS THEN "${JNI_INCLUDE_DIRS}" ELSE NO)
  endif()
  status("  Java wrappers:" HAVE_opencv_java THEN "YES (${OPENCV_JAVA_SDK_BUILD_TYPE})" ELSE NO)
-  status("  Java tests:" BUILD_TESTS AND opencv_test_java_BINARY_DIR THEN YES ELSE NO)
+  status("  Java tests:" BUILD_TESTS AND (opencv_test_java_BINARY_DIR OR opencv_test_android_BINARY_DIR) THEN YES ELSE NO)
endif()

# ========================== Objective-C =======================
@@ -141,6 +141,8 @@ if (gradle.opencv_source == 'sdk_path') {
")

ocv_check_environment_variables(OPENCV_GRADLE_VERBOSE_OPTIONS)
ocv_update(OPENCV_GRADLE_VERBOSE_OPTIONS "-i")
separate_arguments(OPENCV_GRADLE_VERBOSE_OPTIONS UNIX_COMMAND "${OPENCV_GRADLE_VERBOSE_OPTIONS}")

macro(add_android_project target path)
  get_filename_component(__dir "${path}" NAME)
@@ -175,7 +177,6 @@ include ':${__dir}'
  if (BUILD_ANDROID_EXAMPLES)
    # build apk
    set(APK_FILE "${ANDROID_BUILD_BASE_DIR}/${__dir}/build/outputs/apk/release/${__dir}-${ANDROID_ABI}-release-unsigned.apk")
-    ocv_update(OPENCV_GRADLE_VERBOSE_OPTIONS "-i")
    add_custom_command(
        OUTPUT "${APK_FILE}" "${OPENCV_DEPHELPER}/android_sample_${__dir}"
        COMMAND ./gradlew ${OPENCV_GRADLE_VERBOSE_OPTIONS} "${__dir}:assemble"
@@ -9,46 +9,224 @@ YOLO DNNs {#tutorial_dnn_yolo}

|                 |                              |
| -:              | :-                           |
| Original author | Alessandro de Oliveira Faria |
| Compatibility   | OpenCV >= 3.3.1              |
| Extended by     | Abduragim Shtanchaev         |
| Compatibility   | OpenCV >= 4.9.0              |

Introduction
------------

In this tutorial you will learn how to use the opencv_dnn module with the yolo_object_detection sample, which runs the OpenCV DNN module in real time on device capture, video, and images.

Running pre-trained YOLO model in OpenCV
----------------------------------------

We will demonstrate the results of this example on the following picture.

![Picture example](images/yolo.jpg)

Deploying pre-trained models is a common task in machine learning, particularly when working with
hardware that does not support certain frameworks like PyTorch. This guide provides a comprehensive
overview of exporting pre-trained YOLO family models from PyTorch and deploying them using OpenCV's
DNN framework. For demonstration purposes, we will focus on the [YOLOX](https://github.com/Megvii-BaseDetection/YOLOX/blob/main)
model, but the methodology applies to other supported models.

Examples
--------

@note Currently, OpenCV supports the following YOLO models:
- [YOLOX](https://github.com/Megvii-BaseDetection/YOLOX/blob/main),
- [YoloNas](https://github.com/Deci-AI/super-gradients/tree/master),
- [YOLOv8](https://github.com/ultralytics/ultralytics/tree/main),
- [YOLOv7](https://github.com/WongKinYiu/yolov7/tree/main),
- [YOLOv6](https://github.com/meituan/YOLOv6/blob/main),
- [YOLOv5](https://github.com/ultralytics/yolov5),
- [YOLOv4](https://github.com/Tianxiaomo/pytorch-YOLOv4).

This support includes pre- and post-processing routines specific to these models. While older
versions of YOLO are also supported by OpenCV in Darknet format, they are out of the scope of this tutorial.

Assuming that we have successfully trained a YOLOX model, the subsequent step involves exporting and
running this model with OpenCV. There are several critical considerations to address before
proceeding with this process. Let's delve into these aspects.

### YOLO's Pre-processing & Output

Understanding the nature of inputs and outputs associated with YOLO family detectors is pivotal.
These detectors, like most deep neural networks, typically vary in input size depending on the
model's scale.

| Model Scale | Input Size |
|--------------|--------------|
| Small Models <sup>[1](https://github.com/Megvii-BaseDetection/YOLOX/tree/main#standard-models)</sup>| 416x416 |
| Midsize Models <sup>[2](https://github.com/Megvii-BaseDetection/YOLOX/tree/main#standard-models)</sup>| 640x640 |
| Large Models <sup>[3](https://github.com/meituan/YOLOv6/tree/main#benchmark)</sup>| 1280x1280 |

This table provides a quick reference to the input dimensions commonly used across YOLO models.
These are the standard input shapes; make sure to use the input size your model was trained with
if it differs from the sizes listed in the table.

The next critical element in the process involves understanding the specifics of image pre-processing
for YOLO detectors. While the fundamental pre-processing approach remains consistent across the YOLO
family, there are subtle yet crucial differences that must be accounted for to avoid any degradation
in performance. Key among these are the `resize type` and the `padding value` applied post-resize.
For instance, the [YOLOX model](https://github.com/Megvii-BaseDetection/YOLOX/blob/ac58e0a5e68e57454b7b9ac822aced493b553c53/yolox/data/data_augment.py#L142)
utilizes a `LetterBox` resize method and a padding value of `114.0`. It is imperative to ensure that
these parameters, along with the normalization constants, are appropriately matched to the model being
exported.
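
To illustrate the idea (this is a minimal sketch, not the sample's exact implementation; the
function name and defaults are assumptions), a `LetterBox`-style resize with a `114` padding value
could look like this in Python:

@code{.py}
import cv2
import numpy as np

def letterbox(img, new_shape=(640, 640), pad_value=114):
    # scale factor that fits the image into new_shape while keeping aspect ratio
    h, w = img.shape[:2]
    r = min(new_shape[0] / h, new_shape[1] / w)
    nh, nw = int(round(h * r)), int(round(w * r))
    resized = cv2.resize(img, (nw, nh), interpolation=cv2.INTER_LINEAR)
    # fill the remaining area with the model-specific padding value
    canvas = np.full((new_shape[0], new_shape[1], 3), pad_value, dtype=np.uint8)
    canvas[:nh, :nw] = resized
    return canvas, r  # keep r to rescale predictions back to the original image
@endcode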

Regarding the model's output, it typically takes the form of a tensor with dimensions [BxNxC+5] or
[BxNxC+4], where 'B' represents the batch size, 'N' denotes the number of anchors, and 'C' signifies
the number of classes (for instance, 80 classes if the model is trained on the COCO dataset).
The additional 5 in the former tensor structure corresponds to the objectness score (obj), confidence
score (conf), and the bounding box coordinates (cx, cy, w, h). Notably, the YOLOv8 model's output
is shaped as [BxNxC+4], where there is no explicit objectness score and the object score is directly
inferred from the class score. For the YOLOX model specifically, it is also necessary to incorporate
anchor points to rescale predictions back to the image domain. This step will be integrated into
the ONNX graph, a process that we will detail further in the subsequent sections.
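
To make the layout concrete, here is a hedged Python sketch of slicing such an output tensor,
assuming a NumPy array `out` of shape [B, N, 5+C]:

@code{.py}
import numpy as np

def split_predictions(out, num_classes=80):
    boxes = out[..., 0:4]                     # (cx, cy, w, h)
    obj = out[..., 4]                         # objectness score
    cls_scores = out[..., 5:5 + num_classes]  # per-class scores
    # final per-class confidence combines objectness and class score
    conf = obj[..., None] * cls_scores
    return boxes, conf

# For [B, N, C+4] outputs (e.g. YOLOv8) there is no objectness column,
# so class scores start at index 4 and are used directly.
@endcode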

### PyTorch Model Export

Now that we know the pre-processing parameters, we can export the model from
PyTorch to an ONNX graph. Since this tutorial uses YOLOX as our sample model, let's use its
export for demonstration purposes (the process is identical for the rest of the YOLO detectors).
To export YOLOX we can simply use the [export script](https://github.com/Megvii-BaseDetection/YOLOX/blob/ac58e0a5e68e57454b7b9ac822aced493b553c53/tools/export_onnx.py). In particular, we need the following commands:

@code{.bash}
git clone https://github.com/Megvii-BaseDetection/YOLOX.git
cd YOLOX
wget https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_s.pth # download pre-trained weights
python3 -m tools.export_onnx --output-name yolox_s.onnx -n yolox-s -c yolox_s.pth --decode_in_inference
@endcode

**NOTE:** Here `--decode_in_inference` includes the anchor box creation in the ONNX graph itself.
It sets [this value](https://github.com/Megvii-BaseDetection/YOLOX/blob/ac58e0a5e68e57454b7b9ac822aced493b553c53/yolox/models/yolo_head.py#L210C16-L210C39)
to `True`, which subsequently includes the anchor generation function in the graph.

Below is a minimal version of the export script (which can be used for models other
than YOLOX) in case it is needed. Usually, however, each YOLO repository provides a predefined export script.

@code{.py}
import onnx
import torch
from onnxsim import simplify

# load the model state dict (ckpt_file, model, args and exp are assumed
# to be defined by the surrounding export script)
ckpt = torch.load(ckpt_file, map_location="cpu")
model.load_state_dict(ckpt)

# prepare dummy input
dummy_input = torch.randn(args.batch_size, 3, exp.test_size[0], exp.test_size[1])

# export the model
torch.onnx._export(
    model,
    dummy_input,
    "yolox.onnx",
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={"input": {0: 'batch'},
                  "output": {0: 'batch'}})

# use onnx-simplifier to remove redundant nodes from the model
onnx_model = onnx.load(args.output_name)
model_simp, check = simplify(onnx_model)
assert check, "Simplified ONNX model could not be validated"
onnx.save(model_simp, args.output_name)
@endcode
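
As an optional sanity check (not part of the official export script), you can try loading the
exported graph with OpenCV before moving on:

@code{.py}
import cv2

net = cv2.dnn.readNet("yolox_s.onnx")  # raises an error if the graph is unsupported
print("ONNX model loaded successfully")
@endcode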

### Running Yolo ONNX detector with OpenCV Sample

Once we have the ONNX graph of the model, we can simply run it with OpenCV's sample. For that we need to make sure that:

1. OpenCV is built with the `-DBUILD_EXAMPLES=ON` flag.
2. Navigate to OpenCV's `build` directory.
3. Run the following command:

@code{.bash}
./bin/example_dnn_yolo_detector --input=<path_to_your_input_file> \
                                --classes=<path_to_class_names_file> \
                                --thr=<confidence_threshold> \
                                --nms=<non_maximum_suppression_threshold> \
                                --mean=<mean_normalization_value> \
                                --scale=<scale_factor> \
                                --yolo=<yolo_model_version> \
                                --padvalue=<padding_value> \
                                --paddingmode=<padding_mode> \
                                --backend=<computation_backend> \
                                --target=<target_computation_device>
@endcode

VIDEO DEMO:
@youtube{NHtRlndE2cg}

Source Code
-----------
- --input: File path to your input image or video. If omitted, the sample will capture frames from a camera.
- --classes: File path to a text file containing class names for object detection.
- --thr: Confidence threshold for detection (e.g., 0.5).
- --nms: Non-maximum suppression threshold (e.g., 0.4).
- --mean: Mean normalization value (e.g., 0.0 for no mean normalization).
- --scale: Scale factor for input normalization (e.g., 1.0).
- --yolo: YOLO model version (e.g., YOLOv3, YOLOv4, etc.).
- --padvalue: Padding value used in pre-processing (e.g., 114.0).
- --paddingmode: Method for handling image resizing and padding. Options: 0 (resize without extra processing), 1 (crop after resize), 2 (resize with aspect ratio preservation).
- --backend: Selection of computation backend (0 for automatic, 1 for Halide, 2 for OpenVINO, etc.).
- --target: Selection of target computation device (0 for CPU, 1 for OpenCL, etc.).
- --device: Camera device number (0 for default camera). If `--input` is not provided, the camera with index 0 will be used by default.

A universal sample for object detection models is available
[in C++](https://github.com/opencv/opencv/blob/5.x/samples/dnn/object_detection.cpp) and
[in Python](https://github.com/opencv/opencv/blob/5.x/samples/dnn/object_detection.py).
Here `mean`, `scale`, `padvalue`, and `paddingmode` should exactly match the values we discussed
in the pre-processing section in order for the model to match the results produced in PyTorch.
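
For intuition, here is a hedged Python sketch of how `--mean`, `--scale`, and the RGB flag map onto
the usual blob computation, `blob = (image - mean) * scale`, using `cv2.dnn.blobFromImage`; the
file name and parameter values are illustrative:

@code{.py}
import cv2

img = cv2.imread("input.jpg")  # any BGR test image
blob = cv2.dnn.blobFromImage(img,
                             scalefactor=1 / 255.0,  # --scale (0.00392 is 1/255)
                             size=(640, 640),
                             mean=(0.0, 0.0, 0.0),   # --mean
                             swapRB=True,            # --rgb
                             crop=False)
# net.setInput(blob) would then feed the blob to the network
@endcode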

Usage examples
--------------

To demonstrate how to run OpenCV YOLO samples without your own pretrained model, follow these instructions:

1. Ensure Python is installed on your platform.
2. Confirm that OpenCV is built with the `-DBUILD_EXAMPLES=ON` flag.

-Execute in webcam:
-@code{.bash}
-$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392 --rgb

Run the YOLOX detector (with default values):

@code{.sh}
git clone https://github.com/opencv/opencv_extra.git
cd opencv_extra/testdata/dnn
python download_models.py yolox_s_inf_decoder
cd ..
export OPENCV_TEST_DATA_PATH=$(pwd)
cd <build directory of OpenCV>
./bin/example_dnn_yolo_detector
@endcode

-Execute with image or video file:
-@code{.bash}
-$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392 --input=[PATH-TO-IMAGE-OR-VIDEO-FILE] --rgb

This will execute the YOLOX detector with your camera. For YOLOv8 (for instance), follow these additional steps:

@code{.sh}
cd opencv_extra/testdata/dnn
python download_models.py yolov8
cd ..
export OPENCV_TEST_DATA_PATH=$(pwd)
cd <build directory of OpenCV>

./bin/example_dnn_yolo_detector --model=onnx/models/yolov8n.onnx --yolo=yolov8 --mean=0.0 --scale=0.003921568627 --paddingmode=2 --padvalue=144.0 --thr=0.5 --nms=0.4 --rgb=0
@endcode

Questions and suggestions email to: Alessandro de Oliveira Faria cabelo@opensuse.org or the OpenCV Team.

### Building a Custom Pipeline

Sometimes there is a need to make custom adjustments in the inference pipeline. With the OpenCV DNN
module this is also quite easy to achieve. Below we outline the sample implementation details:

- Import required libraries

@snippet samples/dnn/yolo_detector.cpp includes

- Read ONNX graph and create neural network model:

@snippet samples/dnn/yolo_detector.cpp read_net

- Read image and pre-process it:

@snippet samples/dnn/yolo_detector.cpp preprocess_params
@snippet samples/dnn/yolo_detector.cpp preprocess_call
@snippet samples/dnn/yolo_detector.cpp preprocess_call_func

- Inference:

@snippet samples/dnn/yolo_detector.cpp forward_buffers
@snippet samples/dnn/yolo_detector.cpp forward

- Post-Processing

All post-processing steps are implemented in the function `yoloPostProcess`. Note that the NMS
step is not included in the ONNX graph; the sample uses an OpenCV function for it (a Python
equivalent is sketched after this list).

@snippet samples/dnn/yolo_detector.cpp postprocess

- Draw predicted boxes

@snippet samples/dnn/yolo_detector.cpp draw_boxes
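
For reference, a minimal Python equivalent of the NMS step mentioned above, using OpenCV's
`cv2.dnn.NMSBoxes` with illustrative boxes and thresholds:

@code{.py}
import cv2

boxes = [[10, 20, 100, 80], [12, 22, 100, 80]]  # [x, y, w, h] from decoding
scores = [0.9, 0.6]                             # matching confidences
keep = cv2.dnn.NMSBoxes(boxes, scores, score_threshold=0.5, nms_threshold=0.4)
for i in keep:
    print("kept box", boxes[i], "score", scores[i])
@endcode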
@@ -156,7 +156,8 @@ struct ChessBoardQuad
    float edge_len; // quad edge len, in pix^2
    // neighbors and corners are synced, i.e., neighbor 0 shares corner 0
    ChessBoardCorner *corners[4]; // Coordinates of quad corners
-    struct ChessBoardQuad *neighbors[4]; // Pointers of quad neighbors
+    struct ChessBoardQuad *neighbors[4]; // Pointers of quad neighbors. M.b. sparse.
+    // Each neighbors element corresponds to quad corner, but not just sequential index.

    ChessBoardQuad(int group_idx_ = -1) :
        count(0),
@@ -1701,12 +1702,12 @@ void ChessBoardDetector::findQuadNeighbors()
                continue;

            // Check that each corner is a neighbor of different quads
-            for(j = 0; j < closest_quad->count; j++ )
+            for(j = 0; j < 4; j++ )
            {
                if (closest_quad->neighbors[j] == &cur_quad)
                    break;
            }
-            if (j < closest_quad->count)
+            if (j < 4)
                continue;

            // check whether the closest corner to closest_corner
@@ -556,6 +556,8 @@ The format of half precision floating point is defined in IEEE 754-2008.

@param src input array.
@param dst output array.
+
+@deprecated Use Mat::convertTo with CV_16F instead.
*/
CV_EXPORTS_W void convertFp16(InputArray src, OutputArray dst);
@@ -271,11 +271,11 @@ enum BorderTypes {
    BORDER_REFLECT     = 2, //!< `fedcba|abcdefgh|hgfedcb`
    BORDER_WRAP        = 3, //!< `cdefgh|abcdefgh|abcdefg`
    BORDER_REFLECT_101 = 4, //!< `gfedcb|abcdefgh|gfedcba`
-    BORDER_TRANSPARENT = 5, //!< `uvwxyz|abcdefgh|ijklmno`
+    BORDER_TRANSPARENT = 5, //!< `uvwxyz|abcdefgh|ijklmno` - Treats outliers as transparent.

    BORDER_REFLECT101  = BORDER_REFLECT_101, //!< same as BORDER_REFLECT_101
    BORDER_DEFAULT     = BORDER_REFLECT_101, //!< same as BORDER_REFLECT_101
-    BORDER_ISOLATED    = 16 //!< do not look outside of ROI
+    BORDER_ISOLATED    = 16 //!< Interpolation restricted within the ROI boundaries.
};

//! @} core_array
@@ -860,6 +860,7 @@ class v_lsx_palignr_u8_class<imm, false, true, false, false, false>
public:
    inline __m128i operator()(const __m128i& a, const __m128i& b) const
    {
+        CV_UNUSED(b);
        return a;
    }
};
@@ -880,6 +881,7 @@ class v_lsx_palignr_u8_class<imm, false, false, false, true, false>
public:
    inline __m128i operator()(const __m128i& a, const __m128i& b) const
    {
+        CV_UNUSED(a);
        return b;
    }
};
@@ -490,12 +490,12 @@ inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)

inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
-    return v_float32x4(vfmacc_vv_f32m1(c.val, a.val, b.val, 4));
+    return v_float32x4(vfmadd_vv_f32m1(a.val, b.val, c.val, 4));
}

inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
-    return v_int32x4(vmacc_vv_i32m1(c.val, a.val, b.val, 4));
+    return v_int32x4(vmadd_vv_i32m1(a.val, b.val, c.val, 4));
}

inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
@@ -553,7 +553,7 @@ inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)

inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
-    return v_float64x2(vfmacc_vv_f64m1(c.val, a.val, b.val, 2));
+    return v_float64x2(vfmadd_vv_f64m1(a.val, b.val, c.val, 2));
}

inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
@@ -1429,7 +1429,7 @@ inline _Tpvec v_load_low(const _Tp* ptr) \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 16))); } \
inline _Tpvec v_load(const _Tp* ptr) \
-{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 16))); } \
+{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 8);}\
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
@@ -1438,7 +1438,7 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
  a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \
  vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a0), 8);}\
inline void v_store(_Tp* ptr, const _Tpvec& a) \
-{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \
+{ vse##elemsize##_v_##len(ptr, a.val, num); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
@@ -1469,7 +1469,7 @@ inline _Tpvec v_load_low(const _Tp* ptr) \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 16)))); } \
inline _Tpvec v_load(const _Tp* ptr) \
-{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 16)))); } \
+{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 8);}\
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
@@ -1478,7 +1478,7 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
  a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \
  vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a0)), 8);}\
inline void v_store(_Tp* ptr, const _Tpvec& a) \
-{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \
+{ vse##elemsize##_v_##len(ptr, a.val, num); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
@@ -2034,30 +2034,23 @@ void v_rshr_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
OPENCV_HAL_IMPL_RISCVV_PACK_U(8, 16, 16, 8, unsigned char )
OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short)

-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#endif

// saturating multiply 8-bit, 16-bit
-#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, _Tpwvec) \
-inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
-{ \
-    _Tpwvec c, d; \
-    v_mul_expand(a, b, c, d); \
-    return v_pack(c, d); \
-} \
-inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
+#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, num, mul, cvt) \
+inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    auto res = mul(a.val, b.val, num); \
+    return _Tpvec(cvt(res, 0, num)); \
+} \
+inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }

-OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16, v_int16x8)
-OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, v_uint16x8)
-OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int16x8, v_int32x4)
-OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint16x8, v_uint32x4)
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16, 16, vwmul_vv_i16m2, vnclip_wx_i8m1)
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, 16, vwmulu_vv_u16m2, vnclipu_wx_u8m1)
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int16x8, 32, vwmul_vv_i32m2, vnclip_wx_i16m1)
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint16x8, 32, vwmulu_vv_u32m2, vnclipu_wx_u16m1)

-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
static const signed char popCountTable[256] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
@@ -127,6 +127,11 @@ public:
    CV_WRAP int singleFPConfig() const;
    CV_WRAP int halfFPConfig() const;

+    /// true if 'cl_khr_fp64' extension is available
+    CV_WRAP bool hasFP64() const;
+    /// true if 'cl_khr_fp16' extension is available
+    CV_WRAP bool hasFP16() const;
+
    CV_WRAP bool endianLittle() const;
    CV_WRAP bool errorCorrectionSupport() const;

@@ -141,13 +141,13 @@ static void dumpOpenCLInformation()
        DUMP_MESSAGE_STDOUT("    Max memory allocation size = " << maxMemAllocSizeStr);
        DUMP_CONFIG_PROPERTY("cv_ocl_current_maxMemAllocSize", device.maxMemAllocSize());

-        const char* doubleSupportStr = device.doubleFPConfig() > 0 ? "Yes" : "No";
+        const char* doubleSupportStr = device.hasFP64() ? "Yes" : "No";
        DUMP_MESSAGE_STDOUT("    Double support = " << doubleSupportStr);
-        DUMP_CONFIG_PROPERTY("cv_ocl_current_haveDoubleSupport", device.doubleFPConfig() > 0);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_haveDoubleSupport", device.hasFP64());

-        const char* halfSupportStr = device.halfFPConfig() > 0 ? "Yes" : "No";
+        const char* halfSupportStr = device.hasFP16() ? "Yes" : "No";
        DUMP_MESSAGE_STDOUT("    Half support = " << halfSupportStr);
-        DUMP_CONFIG_PROPERTY("cv_ocl_current_haveHalfSupport", device.halfFPConfig() > 0);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_haveHalfSupport", device.hasFP16());

        const char* isUnifiedMemoryStr = device.hostUnifiedMemory() ? "Yes" : "No";
        DUMP_MESSAGE_STDOUT("    Host unified memory = " << isUnifiedMemoryStr);
@@ -558,7 +558,7 @@ public:
    //! returns the minimal up-right integer rectangle containing the rotated rectangle
    CV_WRAP Rect boundingRect() const;
    //! returns the minimal (exact) floating point rectangle containing the rotated rectangle, not intended for use with images
-    Rect_<float> boundingRect2f() const;
+    CV_WRAP Rect2f boundingRect2f() const;
    //! returns the rectangle mass center
    CV_PROP_RW Point2f center;
    //! returns width and height of the rectangle
@@ -12,7 +12,7 @@

#if defined(_WIN32)
#include <windows.h>
-#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__)
+#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__) || defined(__EMSCRIPTEN__)
#include <dlfcn.h>
#endif

@@ -65,7 +65,7 @@ void* getSymbol_(LibHandle_t h, const char* symbolName)
{
#if defined(_WIN32)
    return (void*)GetProcAddress(h, symbolName);
-#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__)
+#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__) || defined(__EMSCRIPTEN__)
    return dlsym(h, symbolName);
#endif
}
@@ -79,7 +79,7 @@ LibHandle_t libraryLoad_(const FileSystemPath_t& filename)
# else
    return LoadLibraryW(filename.c_str());
#endif
-#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__)
+#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__) || defined(__EMSCRIPTEN__)
    void* handle = dlopen(filename.c_str(), RTLD_NOW);
    CV_LOG_IF_DEBUG(NULL, !handle, "dlopen() error: " << dlerror());
    return handle;
@@ -91,7 +91,7 @@ void libraryRelease_(LibHandle_t h)
{
#if defined(_WIN32)
    FreeLibrary(h);
-#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__)
+#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__) || defined(__EMSCRIPTEN__)
    dlclose(h);
#endif
}
@@ -80,6 +80,187 @@ OCL_PERF_TEST_P(ConvertToFixture, ConvertTo,
    SANITY_CHECK(dst);
}

//#define RUN_CONVERTFP16
static Size convertFP16_srcSize(4000, 4000);

OCL_PERF_TEST(Core, ConvertFP32FP16MatMat)
{
    const Size srcSize = convertFP16_srcSize;
    const int type = CV_32F;
    const int dtype = CV_16F;

    checkDeviceMaxMemoryAllocSize(srcSize, type);
    checkDeviceMaxMemoryAllocSize(srcSize, dtype);

    Mat src(srcSize, type);
    Mat dst(srcSize, dtype);
    declare.in(src, WARMUP_RNG).out(dst);

#ifdef RUN_CONVERTFP16
    OCL_TEST_CYCLE() convertFp16(src, dst);
#else
    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif

    SANITY_CHECK_NOTHING();
}

OCL_PERF_TEST(Core, ConvertFP32FP16MatUMat)
{
    const Size srcSize = convertFP16_srcSize;
    const int type = CV_32F;
    const int dtype = CV_16F;

    checkDeviceMaxMemoryAllocSize(srcSize, type);
    checkDeviceMaxMemoryAllocSize(srcSize, dtype);

    Mat src(srcSize, type);
    UMat dst(srcSize, dtype);
    declare.in(src, WARMUP_RNG).out(dst);

#ifdef RUN_CONVERTFP16
    OCL_TEST_CYCLE() convertFp16(src, dst);
#else
    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif

    SANITY_CHECK_NOTHING();
}

OCL_PERF_TEST(Core, ConvertFP32FP16UMatMat)
{
    const Size srcSize = convertFP16_srcSize;
    const int type = CV_32F;
    const int dtype = CV_16F;

    checkDeviceMaxMemoryAllocSize(srcSize, type);
    checkDeviceMaxMemoryAllocSize(srcSize, dtype);

    UMat src(srcSize, type);
    Mat dst(srcSize, dtype);
    declare.in(src, WARMUP_RNG).out(dst);

#ifdef RUN_CONVERTFP16
    OCL_TEST_CYCLE() convertFp16(src, dst);
#else
    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif

    SANITY_CHECK_NOTHING();
}

OCL_PERF_TEST(Core, ConvertFP32FP16UMatUMat)
{
    const Size srcSize = convertFP16_srcSize;
    const int type = CV_32F;
    const int dtype = CV_16F;

    checkDeviceMaxMemoryAllocSize(srcSize, type);
    checkDeviceMaxMemoryAllocSize(srcSize, dtype);

    UMat src(srcSize, type);
    UMat dst(srcSize, dtype);
    declare.in(src, WARMUP_RNG).out(dst);

#ifdef RUN_CONVERTFP16
    OCL_TEST_CYCLE() convertFp16(src, dst);
#else
    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif

    SANITY_CHECK_NOTHING();
}

OCL_PERF_TEST(Core, ConvertFP16FP32MatMat)
{
    const Size srcSize = convertFP16_srcSize;
    const int type = CV_16F;
    const int dtype = CV_32F;

    checkDeviceMaxMemoryAllocSize(srcSize, type);
    checkDeviceMaxMemoryAllocSize(srcSize, dtype);

    Mat src(srcSize, type);
    Mat dst(srcSize, dtype);
    declare.in(src, WARMUP_RNG).out(dst);

#ifdef RUN_CONVERTFP16
    OCL_TEST_CYCLE() convertFp16(src, dst);
#else
    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif

    SANITY_CHECK_NOTHING();
}

OCL_PERF_TEST(Core, ConvertFP16FP32MatUMat)
{
    const Size srcSize = convertFP16_srcSize;
    const int type = CV_16F;
    const int dtype = CV_32F;

    checkDeviceMaxMemoryAllocSize(srcSize, type);
    checkDeviceMaxMemoryAllocSize(srcSize, dtype);

    Mat src(srcSize, type);
    UMat dst(srcSize, dtype);
    declare.in(src, WARMUP_RNG).out(dst);

#ifdef RUN_CONVERTFP16
    OCL_TEST_CYCLE() convertFp16(src, dst);
#else
    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif

    SANITY_CHECK_NOTHING();
}

OCL_PERF_TEST(Core, ConvertFP16FP32UMatMat)
{
    const Size srcSize = convertFP16_srcSize;
    const int type = CV_16F;
    const int dtype = CV_32F;

    checkDeviceMaxMemoryAllocSize(srcSize, type);
    checkDeviceMaxMemoryAllocSize(srcSize, dtype);

    UMat src(srcSize, type);
    Mat dst(srcSize, dtype);
    declare.in(src, WARMUP_RNG).out(dst);

#ifdef RUN_CONVERTFP16
    OCL_TEST_CYCLE() convertFp16(src, dst);
#else
    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif

    SANITY_CHECK_NOTHING();
}

OCL_PERF_TEST(Core, ConvertFP16FP32UMatUMat)
{
    const Size srcSize = convertFP16_srcSize;
    const int type = CV_16F;
    const int dtype = CV_32F;

    checkDeviceMaxMemoryAllocSize(srcSize, type);
    checkDeviceMaxMemoryAllocSize(srcSize, dtype);

    UMat src(srcSize, type);
    UMat dst(srcSize, dtype);
    declare.in(src, WARMUP_RNG).out(dst);

#ifdef RUN_CONVERTFP16
    OCL_TEST_CYCLE() convertFp16(src, dst);
#else
    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif

    SANITY_CHECK_NOTHING();
}

///////////// CopyTo ////////////////////////

typedef Size_MatType CopyToFixture;
@@ -80,52 +80,133 @@ static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int sdepth, int
    size_t globalsize[2] = { (size_t)src.cols * cn / kercn, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
}
#endif

-void Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const
+static bool ocl_convertTo(InputArray src_, OutputArray dst_, int ddepth, bool noScale, double alpha, double beta)
{
-    CV_INSTRUMENT_REGION();
-
-    if( empty() )
+    CV_Assert(ddepth >= 0);
+
+    int stype = src_.type();
+    int sdepth = CV_MAT_DEPTH(stype);
+    int cn = CV_MAT_CN(stype);
+
+    int dtype = CV_MAKETYPE(ddepth, cn);
+
+    int wdepth = (sdepth == CV_64F) ? CV_64F : CV_32F;
+
+    bool needDouble = sdepth == CV_64F || ddepth == CV_64F;
+    bool doubleCheck = true;
+    if (needDouble)
    {
-        _dst.release();
-        return;
+        doubleCheck = ocl::Device::getDefault().hasFP64();
    }
+    bool halfCheck = true;
+    bool needHalf = sdepth == CV_16F || ddepth == CV_16F;
+    if (needHalf)
+    {
+        halfCheck = ocl::Device::getDefault().hasFP16();
+    }

-    bool noScale = fabs(alpha-1) < DBL_EPSILON && fabs(beta) < DBL_EPSILON;
+    if (!doubleCheck)
+        return false;
+    if (!halfCheck)
+        return false;

-    if( _type < 0 )
-        _type = _dst.fixedType() ? _dst.type() : type();
+    const int rowsPerWI = 4;

+    char cvt[2][50];
+    ocl::Kernel k("convertTo", ocl::core::convert_oclsrc,
+        format("-D srcT=%s -D WT=%s -D dstT=%s -D convertToWT=%s -D convertToDT=%s -D rowsPerWI=%d%s%s%s",
+            ocl::typeToStr(sdepth), ocl::typeToStr(wdepth), ocl::typeToStr(ddepth),
+            ocl::convertTypeStr(sdepth, wdepth, 1, cvt[0], sizeof(cvt[0])),
+            ocl::convertTypeStr(wdepth, ddepth, 1, cvt[1], sizeof(cvt[1])),
+            rowsPerWI,
+            needDouble ? " -D DOUBLE_SUPPORT" : "",
+            needHalf ? " -D HALF_SUPPORT" : "",
+            noScale ? " -D NO_SCALE" : ""
+        )
+    );
+
+    if (k.empty())
+        return false;
+
+    UMat src = src_.getUMat();
+    dst_.createSameSize(src_, dtype);
+    UMat dst = dst_.getUMat();
+
+    float alphaf = (float)alpha, betaf = (float)beta;
+
+    if (noScale)
+        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn));
+    else if (wdepth == CV_32F)
+        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn), alphaf, betaf);
    else
-        _type = CV_MAKETYPE(CV_MAT_DEPTH(_type), channels());
+        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn), alpha, beta);

-    int sdepth = depth(), ddepth = CV_MAT_DEPTH(_type);
-    if( sdepth == ddepth && noScale )
+    size_t globalsize[2] = {
+        (size_t)dst.cols * cn,
+        divUp((size_t)dst.rows, rowsPerWI)
+    };
+    if (!k.run(2, globalsize, NULL, false))
+        return false;
+
+    CV_IMPL_ADD(CV_IMPL_OCL);
+    return true;
}
+#endif
+
+void Mat::convertTo(OutputArray dst, int type_, double alpha, double beta) const
+{
+    CV_INSTRUMENT_REGION();
+
+    if (empty())
    {
-        copyTo(_dst);
+        dst.release();
        return;
    }
+
+    int stype = type();
+    int sdepth = CV_MAT_DEPTH(stype);
+
+    int ddepth = sdepth;
+    if (type_ >= 0)
+        ddepth = CV_MAT_DEPTH(type_);
+    else
+        ddepth = dst.fixedType() ? dst.depth() : sdepth;
+
+    bool noScale = std::fabs(alpha - 1) < DBL_EPSILON && std::fabs(beta) < DBL_EPSILON;
+    if (sdepth == ddepth && noScale)
+    {
+        copyTo(dst);
+        return;
+    }
+
+    CV_OCL_RUN(dims <= 2 && dst.isUMat(),
+               ocl_convertTo(*this, dst, ddepth, noScale, alpha, beta))
+
+    int cn = channels();
+    int dtype = CV_MAKETYPE(ddepth, cn);

    Mat src = *this;
    bool allowTransposed = dims == 1 ||
-        _dst.kind() == _InputArray::STD_VECTOR ||
-        (_dst.fixedSize() && _dst.dims() == 1);
-    _dst.create( dims, size, _type, -1, allowTransposed );
-    Mat dst = _dst.getMat();
+        dst.kind() == _InputArray::STD_VECTOR ||
+        (dst.fixedSize() && dst.dims() == 1);
+    dst.create( dims, size, dtype, -1, allowTransposed );
+    Mat dstMat = dst.getMat();

    BinaryFunc func = noScale ? getConvertFunc(sdepth, ddepth) : getConvertScaleFunc(sdepth, ddepth);
    double scale[] = {alpha, beta};
-    int cn = channels();
    CV_Assert( func != 0 );

    if( dims <= 2 )
    {
-        Size sz = getContinuousSize2D(src, dst, cn);
-        func( src.data, src.step, 0, 0, dst.data, dst.step, sz, scale );
+        Size sz = getContinuousSize2D(src, dstMat, cn);
+        func(src.data, src.step, 0, 0, dstMat.data, dstMat.step, sz, scale);
    }
    else
    {
-        const Mat* arrays[] = {&src, &dst, 0};
+        const Mat* arrays[] = {&src, &dstMat, 0};
        uchar* ptrs[2] = {};
        NAryMatIterator it(arrays, ptrs);
        Size sz((int)(it.size*cn), 1);
@@ -135,6 +216,44 @@ void Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const
    }
}

+void UMat::convertTo(OutputArray dst, int type_, double alpha, double beta) const
+{
+    CV_INSTRUMENT_REGION();
+
+    if (empty())
+    {
+        dst.release();
+        return;
+    }
+
+#ifdef HAVE_OPENCL
+    int stype = type();
+    int sdepth = CV_MAT_DEPTH(stype);
+
+    int ddepth = sdepth;
+    if (type_ >= 0)
+        ddepth = CV_MAT_DEPTH(type_);
+    else
+        ddepth = dst.fixedType() ? dst.depth() : sdepth;
+
+    bool noScale = std::fabs(alpha - 1) < DBL_EPSILON && std::fabs(beta) < DBL_EPSILON;
+    if (sdepth == ddepth && noScale)
+    {
+        copyTo(dst);
+        return;
+    }
+
+    CV_OCL_RUN(dims <= 2,
+               ocl_convertTo(*this, dst, ddepth, noScale, alpha, beta))
+#endif // HAVE_OPENCL
+
+    UMat src = *this;  // Fake reference to itself.
+                       // Resolves issue 8693 in case of src == dst.
+    Mat m = getMat(ACCESS_READ);
+    m.convertTo(dst, type_, alpha, beta);
+    (void)src;
+}
+
//==================================================================================================

void convertFp16(InputArray _src, OutputArray _dst)
@@ -1604,6 +1604,9 @@ struct Device::Impl
            pos = pos2 + 1;
        }

+        khr_fp64_support_ = isExtensionSupported("cl_khr_fp64");
+        khr_fp16_support_ = isExtensionSupported("cl_khr_fp16");
+
        intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups");

        vendorName_ = getStrProp(CL_DEVICE_VENDOR);
@@ -1692,7 +1695,9 @@ struct Device::Impl
    String version_;
    std::string extensions_;
    int doubleFPConfig_;
+    bool khr_fp64_support_;
    int halfFPConfig_;
+    bool khr_fp16_support_;
    bool hostUnifiedMemory_;
    int maxComputeUnits_;
    size_t maxWorkGroupSize_;
@@ -1844,6 +1849,11 @@ int Device::singleFPConfig() const
int Device::halfFPConfig() const
{ return p ? p->halfFPConfig_ : 0; }

+bool Device::hasFP64() const
+{ return p ? p->khr_fp64_support_ : false; }
+bool Device::hasFP16() const
+{ return p ? p->khr_fp16_support_ : false; }
+
bool Device::endianLittle() const
{ return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }

@@ -67,6 +67,9 @@ int Device::doubleFPConfig() const { OCL_NOT_AVAILABLE(); }
int Device::singleFPConfig() const { OCL_NOT_AVAILABLE(); }
int Device::halfFPConfig() const { OCL_NOT_AVAILABLE(); }

+bool Device::hasFP64() const { OCL_NOT_AVAILABLE(); }
+bool Device::hasFP16() const { OCL_NOT_AVAILABLE(); }
+
bool Device::endianLittle() const { OCL_NOT_AVAILABLE(); }
bool Device::errorCorrectionSupport() const { OCL_NOT_AVAILABLE(); }

@@ -49,14 +49,21 @@
#endif
#endif

+#ifdef HALF_SUPPORT
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16:enable
+#endif
+#endif
+

#define noconvert

__kernel void convertTo(__global const uchar * srcptr, int src_step, int src_offset,
-                        __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                        __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols
#ifndef NO_SCALE
-                        WT alpha, WT beta,
+                        , WT alpha, WT beta
#endif
-                        int rowsPerWI)
+                        )
{
    int x = get_global_id(0);
    int y0 = get_global_id(1) * rowsPerWI;
@@ -1268,70 +1268,10 @@ void UMat::copyTo(OutputArray _dst, InputArray _mask) const
    src.copyTo(_dst, _mask);
}

-void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const
-{
-    CV_INSTRUMENT_REGION();
-
-    bool noScale = std::fabs(alpha - 1) < DBL_EPSILON && std::fabs(beta) < DBL_EPSILON;
-    int stype = type(), cn = CV_MAT_CN(stype);
-
-    if( _type < 0 )
-        _type = _dst.fixedType() ? _dst.type() : stype;
-    else
-        _type = CV_MAKETYPE(CV_MAT_DEPTH(_type), cn);
-
-    int sdepth = CV_MAT_DEPTH(stype), ddepth = CV_MAT_DEPTH(_type);
-    if( sdepth == ddepth && noScale )
-    {
-        copyTo(_dst);
-        return;
-    }
-#ifdef HAVE_OPENCL
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
-    bool needDouble = sdepth == CV_64F || ddepth == CV_64F;
-    if( dims <= 2 && cn && _dst.isUMat() && ocl::useOpenCL() &&
-        ((needDouble && doubleSupport) || !needDouble) )
-    {
-        int wdepth = std::max(CV_32F, sdepth), rowsPerWI = 4;
-
-        char cvt[2][50];
-        ocl::Kernel k("convertTo", ocl::core::convert_oclsrc,
-                      format("-D srcT=%s -D WT=%s -D dstT=%s -D convertToWT=%s -D convertToDT=%s%s%s",
-                             ocl::typeToStr(sdepth), ocl::typeToStr(wdepth), ocl::typeToStr(ddepth),
-                             ocl::convertTypeStr(sdepth, wdepth, 1, cvt[0], sizeof(cvt[0])),
-                             ocl::convertTypeStr(wdepth, ddepth, 1, cvt[1], sizeof(cvt[1])),
-                             doubleSupport ? " -D DOUBLE_SUPPORT" : "", noScale ? " -D NO_SCALE" : ""));
-        if (!k.empty())
-        {
-            UMat src = *this;
-            _dst.create( size(), _type );
-            UMat dst = _dst.getUMat();
-
-            float alphaf = (float)alpha, betaf = (float)beta;
-            ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
-                    dstarg = ocl::KernelArg::WriteOnly(dst, cn);
-
-            if (noScale)
-                k.args(srcarg, dstarg, rowsPerWI);
-            else if (wdepth == CV_32F)
-                k.args(srcarg, dstarg, alphaf, betaf, rowsPerWI);
-            else
-                k.args(srcarg, dstarg, alpha, beta, rowsPerWI);
-
-            size_t globalsize[2] = { (size_t)dst.cols * cn, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
-            if (k.run(2, globalsize, NULL, false))
-            {
-                CV_IMPL_ADD(CV_IMPL_OCL);
-                return;
-            }
-        }
-    }
-#endif
-    UMat src = *this; // Fake reference to itself.
-                      // Resolves issue 8693 in case of src == dst.
-    Mat m = getMat(ACCESS_READ);
-    m.convertTo(_dst, _type, alpha, beta);
-}
+//
+// void UMat::convertTo moved to convert.dispatch.cpp
+//

UMat& UMat::setTo(InputArray _value, InputArray _mask)
{
@@ -34,7 +34,7 @@
#include <errno.h>
#include <io.h>
#include <stdio.h>
-#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __GNU__
+#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __GNU__ || defined __EMSCRIPTEN__
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
@@ -194,7 +194,7 @@ cv::String getcwd()
    sz = GetCurrentDirectoryA((DWORD)buf.size(), buf.data());
    return cv::String(buf.data(), (size_t)sz);
#endif
-#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__
+#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __EMSCRIPTEN__
    for(;;)
    {
        char* p = ::getcwd(buf.data(), buf.size());
@@ -228,7 +228,7 @@ bool createDirectory(const cv::String& path)
#else
    int result = _mkdir(path.c_str());
#endif
-#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__
+#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __EMSCRIPTEN__
    int result = mkdir(path.c_str(), 0777);
#else
    int result = -1;
@@ -343,7 +343,7 @@ private:
    Impl& operator=(const Impl&); // disabled
};

-#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __GNU__
+#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __GNU__ || defined __EMSCRIPTEN__

struct FileLock::Impl
{
@@ -848,6 +848,9 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_NaryEltwise, testing::Values(std::make_tuple
#ifdef HAVE_CUDA
INSTANTIATE_TEST_CASE_P(CUDA, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_CUDA, DNN_TARGET_CUDA)));
#endif
+#ifdef HAVE_VULKAN
+INSTANTIATE_TEST_CASE_P(VULKAN, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_VKCOM, DNN_TARGET_VULKAN)));
+#endif
INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNormExpanded, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
INSTANTIATE_TEST_CASE_P(/**/, Layer_GatherElements, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
@@ -33,7 +33,7 @@ public:
        weights = findDataFile(weights, false);
        if (!proto.empty())
            proto = findDataFile(proto);
-        net = readNet(proto, weights);
+        net = readNet(weights, proto);
        // Set multiple inputs
        for(auto &inp: inputs){
            net.setInput(std::get<0>(inp), std::get<1>(inp));
@@ -214,7 +214,7 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv5) {
    applyTestTag(CV_TEST_TAG_MEMORY_512MB);
    Mat sample = imread(findDataFile("dnn/dog416.png"));
    Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(640, 640), Scalar(), true);
-    processNet("", "dnn/yolov5n.onnx", inp);
+    processNet("dnn/yolov5n.onnx", "", inp);
}

PERF_TEST_P_(DNNTestNetwork, YOLOv8)
@@ -226,7 +226,7 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv8)

    Mat sample = imread(findDataFile("dnn/dog416.png"));
    Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(640, 640), Scalar(), true);
-    processNet("", "dnn/yolov8n.onnx", inp);
+    processNet("dnn/yolov8n.onnx", "", inp);
}

PERF_TEST_P_(DNNTestNetwork, YOLOX) {
@@ -236,7 +236,7 @@ PERF_TEST_P_(DNNTestNetwork, YOLOX) {
    );
    Mat sample = imread(findDataFile("dnn/dog416.png"));
    Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(640, 640), Scalar(), true);
-    processNet("", "dnn/yolox_s.onnx", inp);
+    processNet("dnn/yolox_s.onnx", "", inp);
}

PERF_TEST_P_(DNNTestNetwork, EAST_text_detection)
@@ -250,7 +250,7 @@ PERF_TEST_P_(DNNTestNetwork, FastNeuralStyle_eccv16)
{
    applyTestTag(CV_TEST_TAG_DEBUG_VERYLONG);

-    processNet("", "dnn/mosaic-9.onnx", cv::Size(224, 224));
+    processNet("dnn/mosaic-9.onnx", "", cv::Size(224, 224));
}

PERF_TEST_P_(DNNTestNetwork, Inception_v2_Faster_RCNN)
@@ -291,15 +291,15 @@ PERF_TEST_P_(DNNTestNetwork, EfficientNet)
    Mat sample = imread(findDataFile("dnn/dog416.png"));
    Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(224, 224), Scalar(), true);
    transposeND(inp, {0, 2, 3, 1}, inp);
-    processNet("", "dnn/efficientnet-lite4.onnx", inp);
+    processNet("dnn/efficientnet-lite4.onnx", "", inp);
}

PERF_TEST_P_(DNNTestNetwork, YuNet) {
-    processNet("", "dnn/onnx/models/yunet-202303.onnx", cv::Size(640, 640));
+    processNet("dnn/onnx/models/yunet-202303.onnx", "", cv::Size(640, 640));
}

PERF_TEST_P_(DNNTestNetwork, SFace) {
-    processNet("", "dnn/face_recognition_sface_2021dec.onnx", cv::Size(112, 112));
+    processNet("dnn/face_recognition_sface_2021dec.onnx", "", cv::Size(112, 112));
}

PERF_TEST_P_(DNNTestNetwork, MPPalm) {
@@ -307,7 +307,7 @@ PERF_TEST_P_(DNNTestNetwork, MPPalm) {
    randu(inp, 0.0f, 1.0f);
    inp = blobFromImage(inp, 1.0, Size(), Scalar(), false);
    transposeND(inp, {0, 2, 3, 1}, inp);
-    processNet("", "dnn/palm_detection_mediapipe_2023feb.onnx", inp);
+    processNet("dnn/palm_detection_mediapipe_2023feb.onnx", "", inp);
}

PERF_TEST_P_(DNNTestNetwork, MPHand) {
@@ -315,7 +315,7 @@ PERF_TEST_P_(DNNTestNetwork, MPHand) {
    randu(inp, 0.0f, 1.0f);
    inp = blobFromImage(inp, 1.0, Size(), Scalar(), false);
    transposeND(inp, {0, 2, 3, 1}, inp);
-    processNet("", "dnn/handpose_estimation_mediapipe_2023feb.onnx", inp);
+    processNet("dnn/handpose_estimation_mediapipe_2023feb.onnx", "", inp);
}

PERF_TEST_P_(DNNTestNetwork, MPPose) {
@@ -323,23 +323,23 @@ PERF_TEST_P_(DNNTestNetwork, MPPose) {
    randu(inp, 0.0f, 1.0f);
    inp = blobFromImage(inp, 1.0, Size(), Scalar(), false);
    transposeND(inp, {0, 2, 3, 1}, inp);
-    processNet("", "dnn/pose_estimation_mediapipe_2023mar.onnx", inp);
+    processNet("dnn/pose_estimation_mediapipe_2023mar.onnx", "", inp);
}

PERF_TEST_P_(DNNTestNetwork, PPOCRv3) {
    applyTestTag(CV_TEST_TAG_MEMORY_512MB);
-    processNet("", "dnn/onnx/models/PP_OCRv3_DB_text_det.onnx", cv::Size(736, 736));
+    processNet("dnn/onnx/models/PP_OCRv3_DB_text_det.onnx", "", cv::Size(736, 736));
}

PERF_TEST_P_(DNNTestNetwork, PPHumanSeg) {
-    processNet("", "dnn/human_segmentation_pphumanseg_2023mar.onnx", cv::Size(192, 192));
+    processNet("dnn/human_segmentation_pphumanseg_2023mar.onnx", "", cv::Size(192, 192));
}

PERF_TEST_P_(DNNTestNetwork, CRNN) {
    Mat inp(cv::Size(100, 32), CV_32FC1);
    randu(inp, 0.0f, 1.0f);
    inp = blobFromImage(inp, 1.0, Size(), Scalar(), false);
-    processNet("", "dnn/text_recognition_CRNN_EN_2021sep.onnx", inp);
+    processNet("dnn/text_recognition_CRNN_EN_2021sep.onnx", "", inp);
}

PERF_TEST_P_(DNNTestNetwork, VitTrack) {
@@ -349,7 +349,7 @@ PERF_TEST_P_(DNNTestNetwork, VitTrack) {
    randu(inp2, 0.0f, 1.0f);
    inp1 = blobFromImage(inp1, 1.0, Size(), Scalar(), false);
    inp2 = blobFromImage(inp2, 1.0, Size(), Scalar(), false);
-    processNet("", "dnn/onnx/models/object_tracking_vittrack_2023sep.onnx", {std::make_tuple(inp1, "template"), std::make_tuple(inp2, "search")});
+    processNet("dnn/onnx/models/object_tracking_vittrack_2023sep.onnx", "", {std::make_tuple(inp1, "template"), std::make_tuple(inp2, "search")});
}

PERF_TEST_P_(DNNTestNetwork, EfficientDet_int8)
@@ -360,14 +360,14 @@ PERF_TEST_P_(DNNTestNetwork, EfficientDet_int8)
    }
    Mat inp = imread(findDataFile("dnn/dog416.png"));
    inp = blobFromImage(inp, 1.0 / 255.0, Size(320, 320), Scalar(), true);
-    processNet("", "dnn/tflite/coco_efficientdet_lite0_v1_1.0_quant_2021_09_06.tflite", inp);
+    processNet("dnn/tflite/coco_efficientdet_lite0_v1_1.0_quant_2021_09_06.tflite", "", inp);
}

PERF_TEST_P_(DNNTestNetwork, VIT_B_32)
{
    applyTestTag(CV_TEST_TAG_DEBUG_VERYLONG);

-    processNet("", "dnn/onnx/models/vit_b_32.onnx", cv::Size(224, 224));
+    processNet("dnn/onnx/models/vit_b_32.onnx", "", cv::Size(224, 224));
}

INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, dnnBackendsAndTargets());
@@ -279,8 +279,8 @@ public:
            // Half precision floats.
            CV_Assert(raw_data.size() / 2 == (int)dstBlob.total());

-            Mat halfs((int)shape.size(), &shape[0], CV_16SC1, (void*)raw_data.c_str());
-            convertFp16(halfs, dstBlob);
+            Mat halfs((int)shape.size(), &shape[0], CV_16FC1, (void*)raw_data.c_str());
+            halfs.convertTo(dstBlob, CV_32F);
        }
        else if (pbBlob.raw_data_type() == caffe::FLOAT)
        {
@ -44,8 +44,8 @@ void shrinkCaffeModel(const String& src, const String& dst, const std::vector<St
|
||||
CV_Assert(blob->data_size() != 0); // float32 array.
|
||||
|
||||
Mat floats(1, blob->data_size(), CV_32FC1, (void*)blob->data().data());
|
||||
Mat halfs(1, blob->data_size(), CV_16SC1);
|
||||
convertFp16(floats, halfs); // Convert to float16.
|
||||
Mat halfs(1, blob->data_size(), CV_16FC1);
|
||||
floats.convertTo(halfs, CV_16F); // Convert to float16.
|
||||
|
||||
blob->clear_data(); // Clear float32 data.
|
||||
|
||||
|
@ -502,7 +502,7 @@ void InfEngineNgraphNet::init(Target targetId)
|
||||
size_t total = ngraph::shape_size(constant->get_shape());
|
||||
Mat floats(1, total, CV_32F, (void*)floatsData);
|
||||
Mat halfs;
|
||||
cv::convertFp16(floats, halfs);
|
||||
floats.convertTo(halfs, CV_16F);
|
||||
|
||||
auto new_const = std::make_shared<ngraph::op::Constant>(ngraph::element::f16, constant->get_shape(), halfs.data);
|
||||
new_const->set_friendly_name(constant->get_friendly_name());
|
||||
|
@ -135,10 +135,10 @@ public:
|
||||
inputs_.getUMatVector(inputs);
|
||||
outputs_.getUMatVector(outputs);
|
||||
|
||||
if (inputs_.depth() == CV_16S)
|
||||
if (inputs_.depth() == CV_16F)
|
||||
{
|
||||
UMat inputFp32;
|
||||
convertFp16(inputs[0], inputFp32);
|
||||
inputs[0].convertTo(inputFp32, CV_32F);
|
||||
inputs[0] = inputFp32; // replace
|
||||
}
|
||||
|
||||
@ -264,10 +264,7 @@ public:
|
||||
UMat outputFp32;
|
||||
inputs[0].convertTo(outputFp32, CV_32F, scales[0], -(scales[0]*zeropoints[0]));
|
||||
|
||||
if (outputs_.depth() == CV_16S)
|
||||
convertFp16(outputFp32, outputs[0]);
|
||||
else
|
||||
outputFp32.copyTo(outputs[0]);
|
||||
outputFp32.convertTo(outputs[0], outputs_.depth());
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
@ -165,7 +165,7 @@ void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

if (preferableTarget == DNN_TARGET_OPENCL_FP16 && inputs_arr.depth() == CV_16S)
if (preferableTarget == DNN_TARGET_OPENCL_FP16 && inputs_arr.depth() == CV_16F)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
@ -181,7 +181,7 @@ void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays

inputs.resize(orig_inputs.size());
for (size_t i = 0; i < orig_inputs.size(); i++)
convertFp16(orig_inputs[i], inputs[i]);
orig_inputs[i].convertTo(inputs[i], CV_32F);

outputs.resize(orig_outputs.size());
for (size_t i = 0; i < orig_outputs.size(); i++)
@ -194,7 +194,7 @@ void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays
forward(inputs, outputs, internals);

for (size_t i = 0; i < outputs.size(); i++)
convertFp16(outputs[i], orig_outputs[i]);
outputs[i].convertTo(orig_outputs[i], CV_16F);

// sync results back
outputs_arr.assign(orig_outputs);
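The forward_fallback hunks above preserve the long-standing OpenCL FP16 strategy: convert FP16 inputs up to FP32, run the generic FP32 path, then convert the results back down. Sketched in isolation under the same CV_16F conventions (the function names here are illustrative, not the actual dnn internals):

```cpp
#include <opencv2/core.hpp>
#include <vector>

// Hypothetical stand-in for a layer's FP32-only forward pass.
static void forwardFp32(const std::vector<cv::UMat>& in, std::vector<cv::UMat>& out)
{
    for (size_t i = 0; i < in.size(); i++)
        cv::add(in[i], in[i], out[i]); // placeholder computation
}

static void forwardWithFp16Fallback(const std::vector<cv::UMat>& half_in,
                                    std::vector<cv::UMat>& half_out)
{
    std::vector<cv::UMat> in(half_in.size()), out(half_out.size());
    for (size_t i = 0; i < half_in.size(); i++)
        half_in[i].convertTo(in[i], CV_32F);   // FP16 -> FP32
    forwardFp32(in, out);                      // compute in full precision
    for (size_t i = 0; i < out.size(); i++)
        out[i].convertTo(half_out[i], CV_16F); // FP32 -> FP16
}
```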
@ -146,7 +146,7 @@ struct DataLayer : public Layer
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

bool isFP16 = outputs_arr.depth() == CV_16S;
bool isFP16 = outputs_arr.depth() == CV_16F;

std::vector<Mat> outputs, internals;
outputs_arr.getMatVector(outputs);
@ -159,7 +159,7 @@ struct DataLayer : public Layer

CV_Assert(mean == Scalar() || inputsData[i].size[1] <= 4);
if (isFP16)
CV_CheckTypeEQ(outputs[i].type(), CV_16SC1, "");
CV_CheckTypeEQ(outputs[i].type(), CV_16FC1, "");
else
CV_CheckTypeEQ(outputs[i].type(), CV_32FC1, "");

@ -175,7 +175,7 @@ struct DataLayer : public Layer
{
Mat input_f32;
inputsData[i].convertTo(input_f32, CV_32F, scale, -mean[0] * scale);
convertFp16(input_f32, outputs[i]);
input_f32.convertTo(outputs[i], CV_16F);
}
else
{
@ -194,7 +194,7 @@ struct DataLayer : public Layer
{
Mat input_f32;
inp.convertTo(input_f32, CV_32F, scale, -mean[c] * scale);
convertFp16(input_f32, out);
input_f32.convertTo(out, CV_16F);
}
else
{
@ -209,7 +209,7 @@ struct DataLayer : public Layer
#ifdef HAVE_OPENCL
bool forward_ocl(InputArrayOfArrays, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
{
bool isFP16 = outputs_.depth() == CV_16S;
bool isFP16 = outputs_.depth() == CV_16F;

std::vector<UMat> outputs;
outputs_.getUMatVector(outputs);
@ -223,7 +223,7 @@ struct DataLayer : public Layer

CV_Assert(mean == Scalar() || inputData.size[1] <= 4);
if (isFP16)
CV_CheckTypeEQ(outputs[i].type(), CV_16SC1, "");
CV_CheckTypeEQ(outputs[i].type(), CV_16FC1, "");
else
CV_CheckTypeEQ(outputs[i].type(), CV_32FC1, "");

@ -239,7 +239,7 @@ struct DataLayer : public Layer
{
UMat input_i;
inputData.convertTo(input_i, CV_32F, scale, -mean[0] * scale);
convertFp16(input_i, outputs[i]);
input_i.convertTo(outputs[i], CV_16F);
}
else
{
@ -263,7 +263,7 @@ struct DataLayer : public Layer
{
UMat input_i;
inp.convertTo(input_i, CV_32F, scale, -mean[c] * scale);
convertFp16(input_i, out);
input_i.convertTo(out, CV_16F);
}
else
{
@ -106,7 +106,7 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
@ -190,7 +190,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inputs_.depth() == CV_16S);
bool use_half = (inputs_.depth() == CV_16F);
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);

@ -264,7 +264,7 @@ public:
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
@ -163,14 +163,14 @@ public:
for( i = 0; i < ninputs; i++ )
{
Mat& inp = inputs[i];
CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S || inp.type() == CV_8S) &&
CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16F || inp.type() == CV_8S) &&
inp.dims == 4 && inp.size[0] == output.size[0] &&
inp.size[2] == output.size[2] &&
inp.size[3] == output.size[3] );
nchannels += inp.size[1];
}
CV_Assert( nchannels == output.size[1] );
CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S || output.type() == CV_8S) );
CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16F || output.type() == CV_8S) );

cc.chptrs.resize(nchannels*batchsz);

@ -221,7 +221,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inps.depth() == CV_16S);
bool use_half = (inps.depth() == CV_16F);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
@ -62,12 +62,12 @@ public:
{
std::vector<UMat> outputs;
outs.getUMatVector(outputs);
if (outs.depth() == CV_16S) {
if (outs.depth() == CV_16F) {
auto blob = blobs[0];
if (blob.type() != CV_32F) {
blob.convertTo(blob, CV_32F);
}
convertFp16(blob, outputs[0]);
blob.convertTo(outputs[0], CV_16F);
}
else
blobs[0].convertTo(outputs[0], outputs[0].type());

@ -139,7 +139,7 @@ public:
}

const Mat &input = inputs[0];
CV_Assert(((input.dims == 3 && kernel_size.size() == 1) || input.dims == 4 || input.dims == 5) && (input.type() == CV_32F || input.type() == CV_16S));
CV_Assert(((input.dims == 3 && kernel_size.size() == 1) || input.dims == 4 || input.dims == 5) && (input.type() == CV_32F || input.type() == CV_16F));
for (size_t i = 0; i < outputs.size(); i++)
{
CV_Assert(inputs[i].type() == input.type());
@ -929,7 +929,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inps.depth() == CV_16S);
bool use_half = (inps.depth() == CV_16F);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);

@ -943,6 +943,7 @@ public:
umat_blobs.resize(n);
for (size_t i = 0; i < n; i++)
{
CV_Assert(!use_half); // TODO: not implemented
inputs[i + 1].copyTo(umat_blobs[i]);
}
inputs.resize(1);
@ -955,7 +956,7 @@ public:
for (size_t i = 0; i < n; i++)
{
if (use_half)
convertFp16(blobs[i], umat_blobs[i]);
blobs[i].convertTo(umat_blobs[i], CV_16F);
else
blobs[i].copyTo(umat_blobs[i]);
}
@ -1036,7 +1037,7 @@ public:
if (fusedWeights)
{
if (use_half)
convertFp16(weightsMat, umat_blobs[0]);
weightsMat.convertTo(umat_blobs[0], CV_16F);
else
weightsMat.copyTo(umat_blobs[0]);
fusedWeights = false;
@ -1046,7 +1047,7 @@ public:
if ( umat_blobs.size() < 2 )
umat_blobs.resize(2);
if (use_half)
convertFp16(Mat(biasvec, true), umat_blobs[1]);
Mat(biasvec, true).convertTo(umat_blobs[1], CV_16F);
else
Mat(biasvec, true).copyTo(umat_blobs[1]);
convolutionOp->setBias(true);
@ -1109,7 +1110,7 @@ public:
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
@ -1789,7 +1790,7 @@ public:
std::vector<UMat> outputs;
std::vector<UMat> internals;

if (inputs_.depth() == CV_16S)
if (inputs_.depth() == CV_16F)
return false;

inputs_.getUMatVector(inputs);
@ -1896,7 +1897,7 @@ public:
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr));

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
@ -313,8 +313,7 @@ static inline void fast_gemm12x16_f32(int k, const char *a_, const char *b_, cha
const float* b = (const float*)b_;
float* c = (float*)c_;

__m256i dummy;
__m256 s00 = (__m256)__lasx_xvxor_v(dummy, dummy), s01 = s00,
__m256 s00 = _v256_setall_ps(0), s01 = s00,
s10 = s00, s11 = s00,
s20 = s00, s21 = s00,
s30 = s00, s31 = s00,
@ -55,7 +55,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;

@ -37,7 +37,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
@ -337,7 +337,7 @@ public:
std::vector<UMat> outputs;
outs.getUMatVector(outputs);

bool use_half = (inps.depth() == CV_16S);
bool use_half = (inps.depth() == CV_16F);
if (use_half)
{
std::vector<UMat> orig_inputs;
@ -345,7 +345,7 @@ public:

inputs.resize(orig_inputs.size());
for (size_t i = 0; i < orig_inputs.size(); i++)
convertFp16(orig_inputs[i], inputs[i]);
orig_inputs[i].convertTo(inputs[i], CV_32F);
}
else
{
@ -410,7 +410,7 @@ public:
if (use_half)
{
UMat half_umat;
convertFp16(umat, half_umat);
umat.convertTo(half_umat, CV_16F);
outs.assign(std::vector<UMat>(1, half_umat));
}

@ -428,7 +428,7 @@ public:
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
}
if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;

@ -454,7 +454,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
@ -210,7 +210,7 @@ public:
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(this->preferableTarget),
func.applyOCL(inputs_arr, outputs_arr, internals_arr))

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;

@ -587,7 +587,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

if ((inputs_.depth() == CV_16S && op != SUM) || (channelsMode != ELTWISE_CHANNNELS_SAME))
if ((inputs_.depth() == CV_16F && op != SUM) || (channelsMode != ELTWISE_CHANNNELS_SAME))
return false;

if (hasVecInput)
@ -607,7 +607,7 @@ public:
size_t localsize[] = { 128 };
size_t globalsize[] = { (size_t)channels / 4 * localsize[0] };
String opts;
if (inputs_.depth() == CV_16S)
if (inputs_.depth() == CV_16F)
opts = " -DDtype=half -DDtype4=half4 -DDtype8=half8";
else
opts = " -DDtype=float -DDtype4=float4 -DDtype8=float8";
@ -633,7 +633,7 @@ public:
}
else
{
if (inputs_.depth() == CV_16S)
if (inputs_.depth() == CV_16F)
return false;

float coeff1 = coeffs.empty() ? 1.f : coeffs[0];
@ -686,7 +686,7 @@ public:
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
@ -105,7 +105,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;

@ -355,7 +355,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inps.depth() == CV_16S);
bool use_half = (inps.depth() == CV_16F);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);

@ -383,9 +383,9 @@ public:

if (use_half)
{
convertFp16(A, A_fp32);
convertFp16(B, B_fp32);
convertFp16(C, C_fp32);
A.convertTo(A_fp32, CV_32F);
B.convertTo(B_fp32, CV_32F);
C.convertTo(C_fp32, CV_32F);
}
else
{
@ -396,9 +396,9 @@ public:
cv::gemm(A_fp32, B_fp32, 1, noArray(), 0, C_fp32);
if (use_half)
{
convertFp16(A_fp32, A);
convertFp16(B_fp32, B);
convertFp16(C_fp32, C);
A_fp32.convertTo(A, CV_16F);
B_fp32.convertTo(B, CV_16F);
C_fp32.convertTo(C, CV_16F);
}
}
return true;
@ -429,7 +429,7 @@ public:
for (int i = 0; i < umat_blobs.size(); i++)
{
if (!umat_blobs[i].empty())
convertFp16(umat_blobs[i], half_blobs[i]);
umat_blobs[i].convertTo(half_blobs[i], CV_16F);
}
}

@ -470,8 +470,8 @@ public:

if (use_half)
{
convertFp16(srcMat, srcMat_fp32);
convertFp16(dstMat, dstMat_fp32);
srcMat.convertTo(srcMat_fp32, CV_32F);
dstMat.convertTo(dstMat_fp32, CV_32F);
}
else
{
@ -489,8 +489,8 @@ public:
}
if (use_half)
{
convertFp16(srcMat_fp32, srcMat);
convertFp16(dstMat_fp32, dstMat);
srcMat_fp32.convertTo(srcMat, CV_16F);
dstMat_fp32.convertTo(dstMat, CV_16F);
}
}

@ -506,7 +506,7 @@ public:
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && !isMatMul,
forward_ocl(inputs_arr, outputs_arr, internals_arr))

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
@ -70,7 +70,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;

@ -57,12 +57,12 @@ public:
const Mat& inp = inputs[0];

int indicesType = inputs[1].type();
CV_CheckType(indicesType, indicesType == CV_32FC1 || indicesType == CV_16SC1, "");
CV_CheckType(indicesType, indicesType == CV_32FC1 || indicesType == CV_16FC1, "");
Mat indices32S;
if (indicesType == CV_16S/*FP16*/)
if (indicesType == CV_16F/*FP16*/)
{
Mat indicesF32;
convertFp16(inputs[1], indicesF32);
inputs[1].convertTo(indicesF32, CV_32F);
indicesF32.convertTo(indices32S, CV_32S);
}
else
@ -172,7 +172,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;

@ -60,7 +60,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

if (inputs_arr.depth() == CV_16S) {
if (inputs_arr.depth() == CV_16F) {
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
@ -95,7 +95,7 @@ public:
float inv_norm_size = 1.f / norm_size;

// no fp16 support
if (input.depth() == CV_16S) {
if (input.depth() == CV_16F) {
return false;
}
@ -73,7 +73,7 @@ public:
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
@ -107,7 +107,7 @@ public:
float inv_norm_size = 1.f / norm_size;

// no fp16 support
if (input.depth() == CV_16S) {
if (input.depth() == CV_16F) {
return false;
}

@ -99,7 +99,7 @@ public:
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
@ -140,7 +140,7 @@ public:
const auto &bias = inputs.size() == 3 ? inputs[2] : UMat::zeros(norm_size, 1, CV_32F);

// no fp16 support
if (input.depth() == CV_16S) {
if (input.depth() == CV_16F) {
return false;
}
@ -119,7 +119,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inps.depth() == CV_16S);
bool use_half = (inps.depth() == CV_16F);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);

@ -164,7 +164,7 @@ public:
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
@ -119,7 +119,7 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
@ -154,7 +154,7 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inputs_arr.depth() == CV_16S);
bool use_half = (inputs_arr.depth() == CV_16F);
inputs_arr.getUMatVector(inputs);
outputs_arr.getUMatVector(outputs);

@ -192,9 +192,9 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
}

if (use_half) {
convertFp16(A, A_fp32);
convertFp16(B, B_fp32);
convertFp16(C, C_fp32);
A.convertTo(A_fp32, CV_32F);
B.convertTo(B_fp32, CV_32F);
C.convertTo(C_fp32, CV_32F);
} else {
A_fp32 = A;
B_fp32 = B;
@ -203,9 +203,9 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {

cv::gemm(A_fp32, B_fp32, 1.f, noArray(), 0.f, C_fp32);
if (use_half) {
convertFp16(A_fp32, A);
convertFp16(B_fp32, B);
convertFp16(C_fp32, C);
A_fp32.convertTo(A, CV_16F);
B_fp32.convertTo(B, CV_16F);
C_fp32.convertTo(C, CV_16F);
}
}
return true;
|
@ -73,7 +73,7 @@ public:
|
||||
CV_TRACE_FUNCTION();
|
||||
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
|
||||
|
||||
if (inputs_arr.depth() == CV_16S)
|
||||
if (inputs_arr.depth() == CV_16F)
|
||||
{
|
||||
forward_fallback(inputs_arr, outputs_arr, internals_arr);
|
||||
return;
|
||||
|
@ -149,7 +149,7 @@ public:
|
||||
UMat& bnorm_bias = umat_shift;
|
||||
|
||||
const unsigned LOCAL_SIZE = 128;
|
||||
bool use_half = (inputs[0].depth() == CV_16S);
|
||||
bool use_half = (inputs[0].depth() == CV_16F);
|
||||
String opts = format(" -DT=%s -DT4=%s -Dconvert_T=%s -DLOCAL_SIZE=%u", use_half ? "half" : "float",
|
||||
use_half ? "half4" : "float4", use_half ? "convert_half4" : "convert_float4",
|
||||
LOCAL_SIZE
|
||||
@ -164,7 +164,7 @@ public:
|
||||
CV_Assert(newRows != 0);
|
||||
|
||||
MatShape s = shape(newRows, inpMat.total() / newRows);
|
||||
UMat meanMat = UMat(s[0], 1, (use_half) ? CV_16S : CV_32F);
|
||||
UMat meanMat = UMat(s[0], 1, (use_half) ? CV_16F : CV_32F);
|
||||
UMat tmpMat = UMat(s[0], s[1], CV_32F);
|
||||
float alpha = 1.0f / s[1];
|
||||
|
||||
@ -226,7 +226,7 @@ public:
|
||||
if (normVariance && (row_size % 4 == 0) && (plane_size % 4 == 0))
|
||||
return fast_forward_ocl(inputs, outputs);
|
||||
|
||||
if (inputs[0].depth() == CV_16S)
|
||||
if (inputs[0].depth() == CV_16F)
|
||||
return false;
|
||||
|
||||
String opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4");
|
||||
@ -309,7 +309,7 @@ public:
|
||||
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
|
||||
forward_ocl(inputs_arr, outputs_arr, internals_arr))
|
||||
|
||||
if (inputs_arr.depth() == CV_16S)
|
||||
if (inputs_arr.depth() == CV_16F)
|
||||
{
|
||||
forward_fallback(inputs_arr, outputs_arr, internals_arr);
|
||||
return;
|
||||
|
@ -7,6 +7,7 @@
#include "../op_cuda.hpp"
#include "../op_cann.hpp"
#include "../ie_ngraph.hpp"
#include "../op_vkcom.hpp"

#include <opencv2/dnn/shape_utils.hpp>
@ -34,8 +35,163 @@ static int _mod(int x, int y) {
}
}

class NaryEltwiseHelper CV_FINAL
{
public:
int ninputs;
int narrays;
int max_ndims;
std::vector<int> all_ndims;
std::vector<std::vector<int>> orig_shapes;
std::vector<std::vector<size_t>> orig_steps;
std::vector<char*> ptrs;
std::vector<std::vector<int>> shapes;
std::vector<std::vector<size_t>> steps;
std::vector<size_t> elemsize;

NaryEltwiseHelper() {
}

void init(const std::vector<Mat>& inputs, const std::vector<Mat>& outputs)
{
narrays = 0;
max_ndims = 0;
all_ndims.clear();
orig_shapes.clear();
orig_steps.clear();
ptrs.clear();
shapes.clear();
steps.clear();
elemsize.clear();

ninputs = inputs.size();
narrays = ninputs + 1;

// collect ndims
std::vector<int> v_inp_dims;
std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_dims), [] (const Mat& m) { return m.dims; });
const int* inp_ndims = v_inp_dims.data();
int out_ndims = outputs[0].dims;

// find max ndims for broadcasting
int i;
max_ndims = out_ndims > 2 ? out_ndims : 2;
for(i = 0; i < ninputs; i++)
max_ndims = max_ndims > inp_ndims[i] ? max_ndims : inp_ndims[i];

shapes = std::vector<std::vector<int>>(narrays, std::vector<int>(max_ndims, 0));
steps = std::vector<std::vector<size_t>>(narrays, std::vector<size_t>(max_ndims, 0));
ptrs = std::vector<char*>(narrays, nullptr);

for(i = 0; i <= ninputs; i++) {
all_ndims.push_back(i == 0 ? out_ndims : inp_ndims[i-1]);
std::vector<int> _size;
std::vector<size_t> _step;
if (!i) {
std::transform(outputs[0].size.p, outputs[0].size.p + outputs[0].dims, std::back_inserter(_size), [](int s) { return s; });
std::transform(outputs[0].step.p, outputs[0].step.p + outputs[0].dims, std::back_inserter(_step), [](size_t s) { return s; });
}
else {
std::transform(inputs[i-1].size.p, inputs[i-1].size.p + inputs[i-1].dims, std::back_inserter(_size), [](int s) { return s; });
std::transform(inputs[i-1].step.p, inputs[i-1].step.p + inputs[i-1].dims, std::back_inserter(_step), [](size_t s) { return s; });
}
orig_shapes.push_back(_size);
orig_steps.push_back(_step);

int esz = i == 0 ? outputs[0].elemSize() : inputs[i - 1].elemSize();
elemsize.push_back(esz);
}
}

void reInit(size_t newElemSize) {
std::vector<size_t> newElemSizes(elemsize.size(), newElemSize);
reInit(newElemSizes);
}

void reInit(std::vector<size_t> newElemSizes) {
for (size_t array_index = 0; array_index < orig_steps.size(); array_index++) {
auto &step = orig_steps[array_index];
int esz = elemsize[array_index];
int new_esz = newElemSizes[array_index];
for (size_t step_index = 0; step_index < step.size(); step_index++) {
step[step_index] = static_cast<size_t>(step[step_index] / esz * new_esz);
}
elemsize[array_index] = newElemSizes[array_index];
}
prepare_for_broadcast_op();
}

bool prepare_for_broadcast_op()
{
int i, j, k;

// step 1.
// * make all inputs and the output max_ndims-dimensional.
// ** prepend dimension 1 to the mat of less dims
// * compute proper step's
for (i = this->max_ndims-1; i >= 0; i--) {
for (k = 0; k < this->narrays; k++) {
j = this->all_ndims[k] - (this->max_ndims - i);
int sz_i = j >= 0 ? this->orig_shapes[k][j] : 1;
size_t st_i = j >= 0 && this->orig_steps[k][j] > 0 ? this->orig_steps[k][j] :
i == this->max_ndims-1 ? elemsize[k] : this->steps[k][i+1]*this->shapes[k][i+1];
assert(st_i % elemsize[k] == 0);
this->shapes[k][i] = sz_i;
this->steps[k][i] = st_i;
if (this->shapes[k][i] == 0)
return false;
}
}

// step 3. Let's do the flattening first,
// since we'd need proper values of steps to check continuity.
// this loop is probably the most tricky part
// in the whole implementation of broadcasting.
j = this->max_ndims > 0 ? this->max_ndims-1 : 0;
for (i = j - 1; i >= 0; i--) {
bool all_contiguous = true, all_scalars = true, all_consistent = true;
for(k = 0; k < this->narrays; k++) {
size_t st = this->steps[k][j]*this->shapes[k][j];
bool prev_scalar = this->shapes[k][j] == 1;
bool scalar = this->shapes[k][i] == 1;
all_contiguous = all_contiguous && (st == this->steps[k][i]);
all_scalars = all_scalars && scalar;
all_consistent = all_consistent && (scalar == prev_scalar);
}
if (all_contiguous && (all_consistent || all_scalars)) {
for(k = 0; k < this->narrays; k++)
this->shapes[k][j] *= this->shapes[k][i];
} else {
j--;
if (i < j) {
for(k = 0; k < this->narrays; k++) {
this->shapes[k][j] = this->shapes[k][i];
this->steps[k][j] = this->steps[k][i];
}
}
}
}

// step 2. Set some step's to 0's.
for (i = this->max_ndims-1; i >= j; i--) {
for (k = 0; k < this->narrays; k++)
this->steps[k][i] = this->shapes[k][i] == 1 ? 0 : this->steps[k][i];
}
if (this->max_ndims == 0)
i = 0;
for (; i >= 0; i--) {
for (k = 0; k < this->narrays; k++) {
this->steps[k][i] = 0;
this->shapes[k][i] = 1;
}
}
return true;
}
};
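For orientation: the added NaryEltwiseHelper normalizes every operand to the same rank by prepending size-1 axes, marks broadcast axes by zeroing their step so the same bytes are re-read along that axis, and merges adjacent compatible axes. A toy illustration of the first two ideas, outside the dnn code (the function and its assumptions are made up for this sketch and are not part of the commit):

```cpp
#include <cstddef>
#include <vector>

// Align a shape/step pair to 'max_ndims' by prepending size-1 axes,
// then zero the step of every size-1 axis so it broadcasts.
static void alignForBroadcast(std::vector<int>& shape, std::vector<size_t>& step,
                              int max_ndims, size_t elemsize)
{
    while ((int)shape.size() < max_ndims) {
        shape.insert(shape.begin(), 1);
        step.insert(step.begin(), 0); // placeholder, recomputed below
    }
    // Recompute steps right-to-left for the prepended axes
    // (compare NaryEltwiseHelper::prepare_for_broadcast_op, step 1).
    for (int i = max_ndims - 1; i >= 0; i--) {
        if (step[i] == 0)
            step[i] = (i == max_ndims - 1) ? elemsize : step[i + 1] * shape[i + 1];
    }
    // Then zero the step of broadcast (size-1) axes (step 2).
    for (int i = 0; i < max_ndims; i++)
        if (shape[i] == 1)
            step[i] = 0;
}

// e.g. a contiguous shape {3,1} with steps {4,4} and elemsize 4, aligned to
// 4 dims, becomes shape {1,1,3,1} with steps {0,0,4,0}.
```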
class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
{
NaryEltwiseHelper helper;
public:
enum class OPERATION
{
@ -130,6 +286,13 @@ public:
op == OPERATION::MOD ||
op == OPERATION::FMOD
);

#ifdef HAVE_VULKAN
if (backendId == DNN_BACKEND_VKCOM)
return op == OPERATION::ADD || op == OPERATION::PROD || op == OPERATION::SUB ||
op == OPERATION::DIV ;
#endif

if (backendId == DNN_BACKEND_CUDA) {
return op == OPERATION::MAX || op == OPERATION::MIN || op == OPERATION::SUM ||
op == OPERATION::PROD || op == OPERATION::DIV || op == OPERATION::ADD ||
@ -166,74 +329,14 @@ public:
return outShape;
}

static bool prepare_for_broadcast_op(
int narrays, int max_ndims, const size_t* elemsize,
const int* ndims, const int** shape_, const size_t** step_,
int** shape, size_t** step)
{
int i, j, k;

// step 1.
// * make all inputs and the output max_ndims-dimensional.
// ** prepend dimension 1 to the mat of less dims
// * compute proper step's
for (i = max_ndims-1; i >= 0; i-- ) {
for (k = 0; k < narrays; k++) {
j = ndims[k] - (max_ndims - i);
int sz_i = j >= 0 ? shape_[k][j] : 1;
size_t st_i = j >= 0 && step_ && step_[k] && step_[k][j] > 0 ? step_[k][j] :
i == max_ndims-1 ? elemsize[k] : step[k][i+1]*shape[k][i+1];
assert(st_i % elemsize[k] == 0);
shape[k][i] = sz_i;
step[k][i] = st_i;
if (shape[k][i] == 0)
return false;
}
}
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE {
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);

// step 3. Let's do the flattening first,
// since we'd need proper values of steps to check continuity.
// this loop is probably the most tricky part
// in the whole implementation of broadcasting.
j = max_ndims > 0 ? max_ndims-1 : 0;
for (i = j - 1; i >= 0; i--) {
bool all_contiguous = true, all_scalars = true, all_consistent = true;
for(k = 0; k < narrays; k++) {
size_t st = step[k][j]*shape[k][j];
bool prev_scalar = shape[k][j] == 1;
bool scalar = shape[k][i] == 1;
all_contiguous = all_contiguous && (st == step[k][i]);
all_scalars = all_scalars && scalar;
all_consistent = all_consistent && (scalar == prev_scalar);
}
if (all_contiguous && (all_consistent || all_scalars)) {
for(k = 0; k < narrays; k++)
shape[k][j] *= shape[k][i];
} else {
j--;
if (i < j) {
for(k = 0; k < narrays; k++) {
shape[k][j] = shape[k][i];
step[k][j] = step[k][i];
}
}
}
}

// step 2. Set some step's to 0's.
for (i = max_ndims-1; i >= j; i--) {
for (k = 0; k < narrays; k++)
step[k][i] = shape[k][i] == 1 ? 0 : step[k][i];
}
if (max_ndims == 0)
i = 0;
for (; i >= 0; i--) {
for (k = 0; k < narrays; k++) {
step[k][i] = 0;
shape[k][i] = 1;
}
}
return true;
helper.init(inputs, outputs);
CV_Assert(helper.prepare_for_broadcast_op());
}

bool getMemoryShapes(const std::vector<MatShape> &inputs,
@ -248,10 +351,10 @@ public:

template <typename T, typename Functor>
void binary_forward_impl(
int ndims, const int* shape,
const char* data1, const size_t* step1,
const char* data2, const size_t* step2,
char* data, const size_t* step,
int ndims, const std::vector<int>& shape,
const char* data1, const std::vector<size_t>& step1,
const char* data2, const std::vector<size_t>& step2,
char* data, const std::vector<size_t>& step,
const Functor& op)
{
size_t dp1 = 0, dp2 = 0, dp = 0;
@ -320,52 +423,18 @@ public:
const Mat& a = inputs[0];
const Mat& b = inputs[1];
Mat& out = outputs[0];

// collect info of inputs and output
const int* in_shape[] = {a.size.p, b.size.p};
const size_t* in_step[] = {a.step.p, b.step.p};
const int* out_shape = out.size.p;
const size_t* out_step = out.step.p;
const int in_ndims[] = {a.dims, b.dims};
int out_ndims = out.dims;

int max_ndims = std::max(a.dims, std::max(b.dims, out.dims));

const int* orig_shapes[3];
int shapes_[3][CV_MAX_DIM];
int* shapes[] = {shapes_[0], shapes_[1], shapes_[2]};
const size_t* orig_steps[3];
size_t steps_[3][CV_MAX_DIM];
size_t* steps[] = {steps_[0], steps_[1], steps_[2]};
int all_ndims[3];
size_t all_type_sizes[3];

// assign orig_shapes, shapes, orig_steps, steps, all_ndims, all_type_sizes
for (int i = 0; i < 3; i++)
{
orig_shapes[i] = (const int*)(i == 0 ? out_shape : in_shape[i-1]);
orig_steps[i] = (size_t*)(i == 0 ? out_step : in_step[i-1]);
all_ndims[i] = i == 0 ? out_ndims : in_ndims[i-1];
all_type_sizes[i] = sizeof(T);
}

if (!prepare_for_broadcast_op(3, max_ndims, all_type_sizes,
all_ndims, (const int**)orig_shapes,
(const size_t**)orig_steps,
shapes, steps))
return;

CV_Assert(helper.shapes.size() == 3 && helper.steps.size() == 3);
binary_forward_impl<T, Functor>(
max_ndims, shapes[0], a.ptr<char>(), steps[1],
b.ptr<char>(), steps[2], out.ptr<char>(), steps[0],
helper.max_ndims, helper.shapes[0], a.ptr<char>(), helper.steps[1],
b.ptr<char>(), helper.steps[2], out.ptr<char>(), helper.steps[0],
f);
}

template<typename T, typename Functor>
void nary_forward_impl(
const Functor& f, const T scale, int ninputs, int ndims, const int* shape,
const Functor& f, const T scale, int ninputs, int ndims, const std::vector<int>& shape,
const char** inp, char* out,
const size_t** steps, char** ptrs)
const std::vector<std::vector<size_t>>& steps, std::vector<char*>& ptrs)
{
CV_Assert(ndims >= 2);
size_t dp = steps[0][ndims-1]/sizeof(T);
@ -450,77 +519,16 @@ public:
const std::vector<Mat>& inputs, std::vector<Mat>& outputs
)
{
int ninputs = inputs.size();

// collect all input
// collect all input info
std::vector<const char*> v_inp;
std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp), [] (const Mat& m) { return m.template ptr<const char>(); });
const char** inp = v_inp.data();

// collect ndims of all input
std::vector<int> v_inp_dims;
std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_dims), [] (const Mat& m) { return m.dims; });
const int* inp_ndims = v_inp_dims.data();

// collect shapes of all input
std::vector<const int*> v_inp_shape;
std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_shape), [] (const Mat& m) { return m.size.p; });
const int** inp_shape = v_inp_shape.data();

// collect steps of all input
std::vector<const size_t*> v_inp_step;
std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_step), [] (const Mat& m) { return m.step.p; });
const size_t** inp_step = v_inp_step.data();

// collect info of output (ndims, shape, step)
// collect output info
char* out = outputs[0].ptr<char>();
int out_ndims = outputs[0].dims;
const int* out_shape = outputs[0].size.p;
const size_t* out_step = outputs[0].step.p;

// find max ndims for broadcasting
int i, max_ndims = out_ndims > 2 ? out_ndims : 2;
for(i = 0; i < ninputs; i++)
max_ndims = max_ndims > inp_ndims[i] ? max_ndims : inp_ndims[i];

// buf holds the following buffers for inputs & output:
// * orig_shapes, shapes (result_shape), orig_steps, steps (result_step), (ninputs+1)*4 elements in total
// * ptrs, (ninputs+1)*1 elements in total
// * shape_buf & step_buf, (ninputs+1)*2*max_ndims elements in total
// * all_ndims, (ninputs+1)*1 elements in total
// * all_type_sizes, (ninputs+1)*1 elements in total
AutoBuffer<size_t> buf((ninputs + 1) * (2 * max_ndims + 7));

int** orig_shapes = (int**)buf.data();
int** shapes = orig_shapes + ninputs + 1;
size_t** orig_steps = (size_t**)(shapes + ninputs + 1);
size_t** steps = orig_steps + ninputs + 1;

char** ptrs = (char**)(steps + ninputs + 1);

size_t* step_buf = (size_t*)(ptrs + ninputs + 1);
int* shape_buf = (int*)(step_buf + (ninputs + 1)*max_ndims);

int* all_ndims = shape_buf + (ninputs + 1)*max_ndims;
size_t* all_type_sizes = (size_t*)(all_ndims + ninputs + 1);

for(i = 0; i <= ninputs; i++) {
all_ndims[i] = i == 0 ? out_ndims : inp_ndims[i-1];
all_type_sizes[i] = sizeof(T);
orig_shapes[i] = (int*)(i == 0 ? out_shape : inp_shape ? inp_shape[i-1] : 0);
orig_steps[i] = (size_t*)(i == 0 ? out_step : inp_step ? inp_step[i-1] : 0);
shapes[i] = shape_buf + max_ndims*i;
steps[i] = step_buf + max_ndims*i;
}

if (!prepare_for_broadcast_op(ninputs + 1, max_ndims, all_type_sizes,
all_ndims, (const int**)orig_shapes,
(const size_t**)orig_steps,
shapes, steps))
return;

nary_forward_impl<T>(
f, scale, ninputs, max_ndims, shapes[0], inp, out, (const size_t **) steps, ptrs);
f, scale, helper.ninputs, helper.max_ndims, helper.shapes[0], inp, out, helper.steps, helper.ptrs);
}

template <typename T, typename Functor>
@ -531,59 +539,21 @@ public:
const Mat& c = inputs[2];
Mat& out = outputs[0];

// collect info of inputs and output
const int* in_shape[] = {a.size.p, b.size.p, c.size.p};
const size_t* in_step[] = {a.step.p, b.step.p, c.step.p};
const int* out_shape = out.size.p;
const size_t* out_step = out.step.p;
const int in_ndims[] = {a.dims, b.dims, c.dims};
int out_ndims = out.dims;

int max_ndims = std::max(a.dims, std::max(b.dims, std::max(c.dims, out.dims)));

AutoBuffer<size_t> buf(4 * (2 * max_ndims + 6));

int** orig_shapes = (int**)(buf.data());
int** shapes = orig_shapes + 4;
size_t** orig_steps = (size_t**)(shapes + 4);
size_t** steps = orig_steps + 4;

int* shape_buf = (int*)(steps + 4);
size_t* step_buf = (size_t*)(shape_buf + 4 * max_ndims);

int* all_ndims = (int*)(step_buf + 4 * max_ndims);
size_t* all_type_sizes = (size_t*)(all_ndims + 4);

// assign orig_shapes, shapes, orig_steps, steps, all_ndims, all_type_sizes
for (int i = 0; i < 4; i++)
{
orig_shapes[i] = (int*)(i == 0 ? out_shape : in_shape[i-1]);
orig_steps[i] = (size_t*)(i == 0 ? out_step : in_step[i-1]);
shapes[i] = shape_buf + i * max_ndims;
steps[i] = step_buf + i * max_ndims;
all_ndims[i] = i == 0 ? out_ndims : in_ndims[i-1];
all_type_sizes[i] = sizeof(T);
}

if (!prepare_for_broadcast_op(4, max_ndims, all_type_sizes,
all_ndims, (const int**)orig_shapes,
(const size_t**)orig_steps,
shapes, steps))
return;
CV_Assert(helper.shapes.size() == 4 && helper.steps.size() == 4);

trinary_forward_impl<T, Functor>(
max_ndims, shapes[0], a.ptr<char>(), steps[1], b.ptr<char>(), steps[2],
c.ptr<char>(), steps[3], out.ptr<char>(), steps[0],
helper.max_ndims, helper.shapes[0], a.ptr<char>(), helper.steps[1], b.ptr<char>(), helper.steps[2],
c.ptr<char>(), helper.steps[3], out.ptr<char>(), helper.steps[0],
f);
}

template <typename T, typename Functor>
void trinary_forward_impl(
int ndims, const int* shape,
const char* data1, const size_t* step1,
const char* data2, const size_t* step2,
const char* data3, const size_t* step3,
char* data, const size_t* step,
int ndims, const std::vector<int>& shape,
const char* data1, const std::vector<size_t>& step1,
const char* data2, const std::vector<size_t>& step2,
const char* data3, const std::vector<size_t>& step3,
char* data, const std::vector<size_t>& step,
const Functor& op)
{
assert(ndims >= 2);
@ -642,8 +612,9 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
helper.reInit(sizeof(float));
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
@ -798,9 +769,13 @@ public:
switch (type)
{
case CV_8U:
// TODO: integrate with type inference
helper.reInit(sizeof(uint8_t));
opDispatch<uint8_t>(std::forward<Args>(args)...);
break;
case CV_32S:
// TODO: integrate with type inference
helper.reInit(sizeof(int32_t));
opDispatch<int32_t>(std::forward<Args>(args)...);
break;
case CV_32F:
@ -958,6 +933,16 @@ public:
return Ptr<BackendNode>(new InfEngineNgraphNode(node));
}
#endif

#ifdef HAVE_VULKAN
virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs,
std::vector<Ptr<BackendWrapper> > &outputs) CV_OVERRIDE
{
Ptr<vkcom::OpBase> op = makePtr<vkcom::OpNary>((vkcom::OpNary::OPERATION) this->op, helper.ninputs, helper.max_ndims, helper.shapes, helper.steps);
return Ptr<BackendNode>(makePtr<VkComBackendNode>(inputs, op, outputs));
}
#endif

};

Ptr<NaryEltwiseLayer> NaryEltwiseLayer::create(const LayerParams& params)
@ -112,7 +112,7 @@ public:
std::vector<UMat> outputs;
std::vector<UMat> internals;

if (inputs_.depth() == CV_16S)
if (inputs_.depth() == CV_16F)
return false;

inputs_.getUMatVector(inputs);
@ -193,7 +193,7 @@ public:
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
@ -127,17 +127,7 @@ public:

if (paddingType == "constant")
{
if (inputs_arr.depth() == CV_16S)
{
std::vector<float> paddingValue_fp32(1, paddingValue);
std::vector<int16_t> paddingValue_fp16(1);
cv::convertFp16(paddingValue_fp32, paddingValue_fp16);
outputs[0].setTo(paddingValue_fp16[0]);
}
else if (inputs_arr.depth() == CV_8S)
outputs[0].setTo(saturate_cast<int8_t>(paddingValue));
else
outputs[0].setTo(paddingValue);
outputs[0].setTo(paddingValue);
inputs[0].copyTo(outputs[0](dstRanges));
}
else if (paddingType == "reflect" || paddingType == "edge")
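The padding hunk above can drop the whole FP16/INT8 special case because Mat::setTo saturates the scalar to the destination depth itself, CV_16F included, once FP16 is a first-class type. A hedged standalone check (values are illustrative only):

```cpp
#include <opencv2/core.hpp>
#include <cassert>

int main()
{
    // One setTo() call now covers all depths the padding layer sees.
    cv::Mat pad16(2, 2, CV_16FC1), pad8(2, 2, CV_8SC1), pad32(2, 2, CV_32FC1);
    const double paddingValue = 1.5;
    pad16.setTo(paddingValue); // scalar converted to FP16 internally
    pad8.setTo(paddingValue);  // saturated/rounded to int8
    pad32.setTo(paddingValue);

    cv::Mat check;
    pad16.convertTo(check, CV_32F);
    assert(check.at<float>(0, 0) == 1.5f); // 1.5 is exactly representable in FP16
    return 0;
}
```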
@ -319,7 +319,7 @@ public:
mnew_stride.copyTo(unew_stride);
}

bool use_half = (inps.depth() == CV_16S);
bool use_half = (inps.depth() == CV_16F);
String opts = format("-DDtype=%s", use_half ? "half" : "float");
for (size_t i = 0; i < inputs.size(); i++)
{
@ -350,7 +350,7 @@ public:
inputs_arr.depth() != CV_8S,
forward_ocl(inputs_arr, outputs_arr, internals_arr))

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;

@ -278,7 +278,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inps.depth() == CV_16S);
bool use_half = (inps.depth() == CV_16F);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);

@ -338,7 +338,7 @@ public:
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
}
if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;

@ -346,7 +346,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inps.depth() == CV_16S);
bool use_half = (inps.depth() == CV_16F);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);

@ -431,7 +431,7 @@ public:
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
@ -186,7 +186,7 @@ public:
std::vector<UMat> outputs;
std::vector<UMat> internals;

if (inputs_.depth() == CV_16S)
if (inputs_.depth() == CV_16F)
return false;

inputs_.getUMatVector(inputs);
@ -269,7 +269,7 @@ public:
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;

@ -390,7 +390,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
@ -906,7 +906,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
@ -1066,7 +1066,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;

@ -456,7 +456,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
@ -161,7 +161,7 @@ public:
std::vector<UMat> outputs;

// TODO: implement a logistic activation to classification scores.
if (useLogistic || inps.depth() == CV_16S)
if (useLogistic || inps.depth() == CV_16F)
return false;

inps.getUMatVector(inputs);
@ -232,7 +232,7 @@ public:
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;

@ -184,7 +184,7 @@ public:
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;

@ -115,7 +115,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;

@ -105,7 +105,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;

@ -74,7 +74,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

if (inputs_arr.depth() == CV_16S) {
if (inputs_arr.depth() == CV_16F) {
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}

@ -68,7 +68,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

if (inputs_arr.depth() == CV_16S) {
if (inputs_arr.depth() == CV_16F) {
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}

@ -107,7 +107,7 @@ public:
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
@ -621,7 +621,7 @@ public:
{
std::vector<int> inpIdx(dimsNum, 0);
std::vector<int> outIdx(dimsNum, 0);
if (inpMat.type() == CV_16S)
if (inpMat.type() == CV_16F)
getSliceRecursive<int16_t>(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx);
else if (inpMat.type() == CV_8S)
getSliceRecursive<int8_t>(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx);

@ -130,7 +130,7 @@ public:
std::vector<UMat> outputs;
std::vector<UMat> internals;

bool use_half = (inputs_.depth() == CV_16S);
bool use_half = (inputs_.depth() == CV_16F);
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals);
@ -215,7 +215,7 @@ public:
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

if (inputs_arr.depth() == CV_16S)
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
@ -501,7 +501,7 @@ void Net::Impl::allocateLayer(int lid, const LayersShapesMap& layersShapes)
CV_Assert(layerShapesIt != layersShapes.end());

if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_OPENCL_FP16 && ld.dtype == CV_32F)
ld.dtype = CV_16S;
ld.dtype = CV_16F;

std::vector<LayerPin> pinsForInternalBlobs;
blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs);
@ -559,7 +559,7 @@ void Net::Impl::allocateLayers(const std::vector<LayerPin>& blobsToKeep_)
preferableTarget == DNN_TARGET_OPENCL_FP16 &&
layers[0].dtype == CV_32F)
{
layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16S);
layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16F);
}
inputShapes.push_back(shape(inp));
}
@ -643,8 +643,8 @@ void Net::Impl::forwardLayer(LayerData& ld)
{
UMat& u = umat_outputBlobs[i];
Mat m;
if (u.depth() == CV_16S) // FP16
convertFp16(u, m);
if (u.depth() == CV_16F) // FP16
u.convertTo(m, CV_32F);
else
m = u.getMat(ACCESS_READ);
if (!checkRange(m))
@ -666,8 +666,8 @@ void Net::Impl::forwardLayer(LayerData& ld)
{
UMat& u = umat_inputBlobs[i];
Mat m;
if (u.depth() == CV_16S) // FP16
convertFp16(u, m);
if (u.depth() == CV_16F) // FP16
u.convertTo(m, CV_32F);
else
m = u.getMat(ACCESS_READ);
std::cout << "INPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
@ -677,8 +677,8 @@ void Net::Impl::forwardLayer(LayerData& ld)
{
UMat& u = umat_outputBlobs[i];
Mat m;
if (u.depth() == CV_16S) // FP16
convertFp16(u, m);
if (u.depth() == CV_16F) // FP16
u.convertTo(m, CV_32F);
else
m = u.getMat(ACCESS_READ);
std::cout << "OUTPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
@ -688,8 +688,8 @@ void Net::Impl::forwardLayer(LayerData& ld)
{
UMat& u = umat_internalBlobs[i];
Mat m;
if (u.depth() == CV_16S) // FP16
convertFp16(u, m);
if (u.depth() == CV_16F) // FP16
u.convertTo(m, CV_32F);
else
m = u.getMat(ACCESS_READ);
std::cout << "INTERNAL " << i << " " << shape(m) << std::endl;
@ -964,12 +964,12 @@ void Net::Impl::forward(OutputArrayOfArrays outputBlobs, const String& outputNam
ld.outputBlobsWrappers[i]->copyToHost();
}
}
if (ld.outputBlobs[0].depth() == CV_16S)
if (ld.outputBlobs[0].depth() == CV_16F)
{
std::vector<Mat>& outputvec = *(std::vector<Mat>*)outputBlobs.getObj();
outputvec.resize(ld.outputBlobs.size());
for (int i = 0; i < outputvec.size(); i++)
convertFp16(ld.outputBlobs[i], outputvec[i]);
ld.outputBlobs[i].convertTo(outputvec[i], CV_32F);
}
else
{
@ -992,7 +992,7 @@ void Net::Impl::forward(OutputArrayOfArrays outputBlobs, const String& outputNam
std::vector<UMat> out_vec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
outputvec.resize(out_vec.size());
for (int i = 0; i < out_vec.size(); i++)
convertFp16(out_vec[i], outputvec[i]);
out_vec[i].convertTo(outputvec[i], CV_32F);
}
}
else
@ -1258,7 +1258,7 @@ void Net::Impl::updateLayersShapes()
preferableTarget == DNN_TARGET_OPENCL_FP16 &&
inputLayerData.dtype == CV_32F)
{
inp.create(inp.dims, inp.size, CV_16S);
inp.create(inp.dims, inp.size, CV_16F);
}
inputShapes.push_back(shape(inp));
}
@ -1327,10 +1327,10 @@ Mat Net::Impl::getBlob(const LayerPin& pin) const
ld.outputBlobsWrappers[pin.oid]->copyToHost();
}

if (ld.outputBlobs[pin.oid].depth() == CV_16S)
if (ld.outputBlobs[pin.oid].depth() == CV_16F)
{
Mat output_blob;
convertFp16(ld.outputBlobs[pin.oid], output_blob);
ld.outputBlobs[pin.oid].convertTo(output_blob, CV_32F);
return output_blob;
}
else
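The net_impl hunks above keep the user-facing contract intact: with DNN_TARGET_OPENCL_FP16 the blobs are stored as CV_16F internally, but Net::forward() and getBlob() still hand back CV_32F Mats. A hedged usage sketch (the model path is a placeholder, not part of the commit):

```cpp
#include <opencv2/dnn.hpp>
#include <opencv2/core.hpp>

int main()
{
    // "model.onnx" is a placeholder path for any supported model.
    cv::dnn::Net net = cv::dnn::readNet("model.onnx");
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL_FP16);

    cv::Mat blob = cv::dnn::blobFromImage(cv::Mat::zeros(224, 224, CV_8UC3));
    net.setInput(blob);

    // Internally the layer blobs are CV_16F now; forward()/getBlob()
    // convert them back, so the returned result is still CV_32F.
    cv::Mat out = net.forward();
    CV_Assert(out.depth() == CV_32F);
    return 0;
}
```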
@ -156,7 +156,7 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
    CHECK_EQ(gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_32_2 ||
             gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE, true) << "Invalid fast image gemm type." << std::endl;

    bool halfPrecisionMode = (A.depth() == CV_16S);
    bool halfPrecisionMode = (A.depth() == CV_16F);

    if (is_image_a)
    {
@ -439,7 +439,7 @@ static bool ocl4dnnFastBufferGEMM(const CBLAS_TRANSPOSE TransA,
    CHECK_EQ(gemm_type == GEMM_TYPE_FAST_BUFFER, true)
             << "Invalid fast buffer gemm type." << std::endl;

    bool halfPrecisionMode = (A.depth() == CV_16S);
    bool halfPrecisionMode = (A.depth() == CV_16F);

    size_t sub_group_size = 8;
    bool is_small_batch = (M == 2 || M == 4 || M == 8);
@ -544,7 +544,7 @@ bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB,
                       const UMat B_image, UMat C,
                       const size_t max_image_size)
{
    bool halfPrecisionMode = (A.depth() == CV_16S);
    bool halfPrecisionMode = (A.depth() == CV_16F);
    gemm_type_t gemm_type = halfPrecisionMode ? GEMM_TYPE_FAST_BUFFER : GEMM_TYPE_FAST_IMAGE_32_1;

    if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 ||
@ -594,7 +594,7 @@ bool ocl4dnnGEMV<float>(const CBLAS_TRANSPOSE TransA,
                        const int32_t offy)
{
    bool ret = false;
    bool use_half = (A.depth() == CV_16S);
    bool use_half = (A.depth() == CV_16F);
    String opts;
    if (use_half)
        opts = format("-DDtype=%s -DDtype4=%s -Dconvert_Dtype=convert_%s", "half", "half4", "half");
@ -665,7 +665,7 @@ bool ocl4dnnAXPY(const int32_t N, const Dtype alpha,
                 const UMat X, const int32_t offX, UMat Y,
                 const int32_t offY)
{
    bool use_half = (X.depth() == CV_16S);
    bool use_half = (X.depth() == CV_16F);
    String opts;
    if (use_half)
        opts = "-DDtype=half -DDtype4=half4 -Dconvert_Dtype=convert_half";
@ -582,10 +582,10 @@ bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
    }

    if (use_half_ && !bias.empty())
        CV_CheckTypeEQ(bias.type(), CV_16SC1, "");
        CV_CheckTypeEQ(bias.type(), CV_16FC1, "");

    if (use_half_)
        CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
        CV_CheckTypeEQ(weight.type(), CV_16FC1, "");

    prepareKernel(bottom, top, weight, bias, numImages);
    if (bestKernelConfig.empty())
@ -740,7 +740,7 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
    if (swizzled_weights_umat.empty())
        swizzled_weights_umat.create(1, (int)alignSize(num_output_, 16) * channels_ *
                                     kernel_h_ * (int)alignSize(kernel_w_, 2),
                                     (use_half_) ? CV_16SC1 : CV_32FC1);
                                     (use_half_) ? CV_16FC1 : CV_32FC1);

    if (!interleave) {
        int32_t channels = channels_ / group_;
@ -777,8 +777,8 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
        UMat weight_tmp; // FP32 in half mode, TODO implement FP16 repack
        if (use_half_)
        {
            CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
            convertFp16(weight, weight_tmp);
            CV_CheckTypeEQ(weight.type(), CV_16FC1, "");
            weight.convertTo(weight_tmp, CV_32F);
            weightMat = weight_tmp.getMat(ACCESS_READ);
            swizzledWeightMat.create(shape(swizzled_weights_umat), CV_32F);
        }
@ -817,7 +817,7 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
        weightMat.release();

        if (use_half_)
            convertFp16(swizzledWeightMat, swizzled_weights_umat);
            swizzledWeightMat.convertTo(swizzled_weights_umat, CV_16F);
    }

    return true;
@ -1140,7 +1140,7 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,

    //int32_t sz[4] = {numImages, num_output_, output_h_, output_w_};
    CV_CheckEQ(top.total(), (size_t)numImages * num_output_ * output_h_ * output_w_, "");
    CV_CheckTypeEQ(top.type(), (use_half_) ? CV_16SC1 : CV_32FC1, "");
    CV_CheckTypeEQ(top.type(), (use_half_) ? CV_16FC1 : CV_32FC1, "");
    top.setTo(Scalar::all(0));

    bool saved_tuned = tuned_;
@ -1154,8 +1154,8 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
    Mat mat_top, mat_verify_top;
    if (use_half_)
    {
        convertFp16(top, new_top);
        convertFp16(verifyTop, new_verify_top);
        top.convertTo(new_top, CV_32F);
        verifyTop.convertTo(new_verify_top, CV_32F);

        mat_top = new_top.getMat(ACCESS_READ);
        mat_verify_top = new_verify_top.getMat(ACCESS_READ);
@ -1827,7 +1827,7 @@ void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
    if (loadTunedConfig()) // check external storage
        return;

    UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1);
    UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16FC1 : CV_32FC1);

    calculateBenchmark(bottom, benchData, weight, bias, numImages);
@ -102,10 +102,10 @@ bool OCL4DNNInnerProduct<Dtype>::Forward(const UMat& bottom,
            UMat biasOneMat = UMat::ones(M_, 1, CV_32F);
            UMat newbias, tmpTop;

            convertFp16(bias, newbias);
            convertFp16(top, tmpTop);
            bias.convertTo(newbias, CV_32F);
            top.convertTo(tmpTop, CV_32F);
            cv::gemm(biasOneMat, newbias, 1, tmpTop, 1, tmpTop, 0);
            convertFp16(tmpTop, top);
            tmpTop.convertTo(top, CV_16F);
        } else {
            UMat biasOnesMat = UMat::ones(M_, 1, CV_32F);
            cv::gemm(biasOnesMat, bias, 1, top, 1, top, 0);
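The inner-product hunk above adds the bias by multiplying a column of ones with the bias row through cv::gemm, which broadcasts the bias over every output row. A standalone sketch of that trick using only public OpenCV API (the sizes and values here are illustrative, not from the patch):

#include <opencv2/core.hpp>

int main()
{
    const int M = 4, N = 3;                          // batch size, output channels
    cv::Mat top  = cv::Mat::zeros(M, N, CV_32F);     // GEMM result without bias
    cv::Mat bias = (cv::Mat_<float>(1, N) << 0.5f, -1.f, 2.f);
    cv::Mat ones = cv::Mat::ones(M, 1, CV_32F);
    // top = 1*ones*bias + 1*top  -- the same call shape as in the hunk above
    cv::gemm(ones, bias, 1, top, 1, top, 0);
    CV_Assert(top.at<float>(2, 1) == -1.f);          // every row received the bias
    return 0;
}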
@ -2447,7 +2447,7 @@ void ONNXImporter::parseCast(LayerParams& layerParams, const opencv_onnx::NodePr
            case opencv_onnx::TensorProto_DataType_FLOAT: type = CV_32F; break;
            case opencv_onnx::TensorProto_DataType_UINT8: type = CV_8U; break;
            case opencv_onnx::TensorProto_DataType_UINT16: type = CV_16U; break;
            case opencv_onnx::TensorProto_DataType_FLOAT16: type = CV_16S; break;
            case opencv_onnx::TensorProto_DataType_FLOAT16: type = CV_16F; break;
            case opencv_onnx::TensorProto_DataType_INT8:
            case opencv_onnx::TensorProto_DataType_INT16:
            case opencv_onnx::TensorProto_DataType_INT32:
@ -915,22 +915,22 @@ Mat getTensorContentRef_(const tensorflow::TensorProto& tensor)
    }
    case tensorflow::DT_HALF:
    {
        Mat halfs;
        if (!content.empty())
        {
            static const int kHalfSize = 2;
            halfs = Mat(1, content.size() / kHalfSize, CV_16UC1, (void*)content.c_str());
            Mat halfs(1, content.size() / kHalfSize, CV_16FC1, (void*)content.c_str());
            halfs.convertTo(m, CV_32F);
        }
        else
        {
            const RepeatedField<int32_t>& field = tensor.half_val();
            CV_Assert(!field.empty());
            Mat ints(1, field.size(), CV_32SC1, (void*)field.data());
            Mat halfs;
            ints.convertTo(halfs, CV_16UC1);
            Mat halfsSigned(halfs.size(), CV_16FC1, halfs.data);
            halfsSigned.convertTo(m, CV_32F);
        }
        // Reinterpret as a signed shorts just for a convertFp16 call.
        Mat halfsSigned(halfs.size(), CV_16SC1, halfs.data);
        convertFp16(halfsSigned, m);
        break;
    }
    case tensorflow::DT_QUINT8:
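The new DT_HALF path simply wraps the raw 16-bit payload in a CV_16F header and converts, instead of routing through a CV_16S reinterpretation and convertFp16(). A minimal sketch of that wrap-then-convert pattern; the three half bit patterns here are illustrative:

#include <opencv2/core.hpp>
#include <cstdint>

int main()
{
    uint16_t raw[] = {0x3C00, 0x4000, 0x3800};     // IEEE-754 halves: 1.0, 2.0, 0.5
    cv::Mat halfs(1, 3, CV_16FC1, raw);            // header over existing storage, no copy
    cv::Mat floats;
    halfs.convertTo(floats, CV_32F);               // widen to FP32
    CV_Assert(floats.at<float>(0, 1) == 2.0f);
    return 0;
}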
@ -101,7 +101,7 @@ Mat TFLiteImporter::parseTensor(const Tensor& tensor)
        dtype = CV_32S;
        break;
    case TensorType_FLOAT16:
        dtype = CV_16S;
        dtype = CV_16F;
        break;
    case TensorType_INT8:
        dtype = CV_8S;
@ -227,7 +227,7 @@ void TFLiteImporter::populateNet()
            if (!data.empty()) {
                // Dequantize a buffer
                Mat dataFP32;
                convertFp16(data, dataFP32);
                data.convertTo(dataFP32, CV_32F);
                allTensors[op_outputs->Get(0)] = dataFP32;
                continue;
            }
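All of the FP16 hunks above follow one pattern: half-precision data stops being smuggled through the CV_16S placeholder type with explicit convertFp16() calls and becomes a first-class CV_16F Mat converted with the ordinary Mat::convertTo. A before/after sketch, assuming an OpenCV 4.x build where CV_16F is available:

#include <opencv2/core.hpp>

void widenToFp32(const cv::Mat& blob, cv::Mat& out)
{
    if (blob.depth() == CV_16F)            // was: blob.depth() == CV_16S
        blob.convertTo(out, CV_32F);       // was: convertFp16(blob, out);
    else
        out = blob;
}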
87
modules/dnn/src/vkcom/include/op_naryeltwise.hpp
Normal file
@ -0,0 +1,87 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_OP_NARY_HPP
#define OPENCV_OP_NARY_HPP

#include "vkcom.hpp"
#include "op_base.hpp"

namespace cv { namespace dnn { namespace vkcom {

#ifdef HAVE_VULKAN

enum NaryShaderType
{
    kNaryShaderTypeBinary,
    kNaryShaderTypeTrinary,
    kNaryShaderTypeNary,
    kNaryShaderTest,
};

struct NaryShaderConfig
{
    int local_size_x;
    int local_size_y;
    int local_size_z;
};


class OpNary : public OpBase
{
public:
    // Copied from nary_eltwise_layers.cpp
    enum class OPERATION
    {
        AND = 0,
        EQUAL,
        GREATER,
        GREATER_EQUAL,
        LESS,
        LESS_EQUAL,
        OR,
        POW,
        XOR,
        BITSHIFT,
        MAX,
        MEAN,
        MIN,
        MOD,
        PROD,
        SUB,
        SUM,
        ADD,
        DIV,
        WHERE,
    };

    OpNary(const OPERATION naryOpType, int ninputs, int max_ndims, const std::vector<std::vector<int>> shapes, const std::vector<std::vector<size_t>> steps);

    void firstForward(); // Execute only in the first forward.
    virtual bool forward(std::vector<Tensor>& ins, std::vector<Tensor>& outs) CV_OVERRIDE;
    Ptr<Tensor> weightTensorPtr;
private:
    bool computeGroupCount();
    bool binaryForward(std::vector<Tensor>& ins, std::vector<Tensor>& outs);
    bool trinaryForward(std::vector<Tensor>& ins, std::vector<Tensor>& outs);
    bool naryForward(std::vector<Tensor>& ins, std::vector<Tensor>& outs);

    const OPERATION naryOpType;
    NaryShaderType shaderType;
    NaryShaderConfig config;
    int ninputs;
    int max_ndims;
    AutoBuffer<int32_t> shapesBuf;
    AutoBuffer<int32_t> stepsBuf;
    int nplanes; // number of planes computations are to be performed on
    int N2; // value of shape[ndims - 2]
    int N1; // value of shape[ndims - 1]

    bool firstForwardFinsh = false;
};

#endif // HAVE_VULKAN

}}} // namespace cv::dnn::vkcom
#endif // OPENCV_OP_NARY_HPP
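A minimal sketch (not from the patch) of how OpNary is parameterized: shapes and steps carry one row for the output plus one per input, each padded to max_ndims, with steps in bytes. makeTensor-style plumbing for the vkcom Tensor objects is elided, and the shapes/strides below are illustrative:

using namespace cv::dnn::vkcom;

void addTwoTensors(Tensor& a, Tensor& b, Tensor& out)
{
    const int ninputs = 2, max_ndims = 2;
    // row 0: output shape/steps, rows 1..2: the two inputs (FP32, contiguous 4x8)
    std::vector<std::vector<int>>    shapes = {{4, 8}, {4, 8}, {4, 8}};
    std::vector<std::vector<size_t>> steps  = {{32, 4}, {32, 4}, {32, 4}};

    OpNary op(OpNary::OPERATION::ADD, ninputs, max_ndims, shapes, steps);
    std::vector<Tensor> ins = {a, b}, outs = {out};
    op.forward(ins, outs);   // dispatches nary_eltwise_binary_forward_spv
}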
@ -51,5 +51,6 @@ bool isAvailable();
#include "op_base.hpp"
#include "op_conv.hpp"
#include "op_matmul.hpp"
#include "op_naryeltwise.hpp"

#endif // OPENCV_DNN_VKCOM_HPP
116
modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward.comp
Normal file
@ -0,0 +1,116 @@
#version 450
// #extension GL_EXT_debug_printf : enable
#define ALL_THREAD 1024
// #define ALL_THREAD 128 // Experimental batched operation
#define STEP_SIZE 65536

layout(binding = 0) readonly buffer Input1{
    float matA[];
};

layout(binding = 1) readonly buffer Input2{
    float matB[];
};

layout(binding = 2) writeonly buffer Output{
    float matOut[];
};

layout(binding = 3) uniform Params {
    int opType;
    int ndims;
} params;

layout(binding = 4) readonly buffer Shape {
    int shape[];
};

layout(binding = 5) readonly buffer Step {
    int matStep[];
};

/* local_size_x, local_size_y, local_size_z here define the number of invocations
   of this compute shader in the current work group. */
// TODO: Check if it is required to fetch PhysicalDeviceLimit from Context
// TODO: here we shall assume that maxGroupInvocation is 1024.
layout(local_size_x = ALL_THREAD, local_size_y = 1, local_size_z = 1) in; // TODO: Check if this makes any sense

const int AND = 0;
const int EQUAL = 1;
const int GREATER = 2;
const int GREATER_EQUAL = 3;
const int LESS = 4;
const int LESS_EQUAL = 5;
const int OR = 6;
const int POW = 7;
const int XOR = 8;
const int BITSHIFT = 9;
const int MAX = 10;
const int MEAN = 11;
const int MIN = 12;
const int MOD = 13;
const int FMOD = 14;
const int PROD = 15;
const int SUB = 16;
const int SUM = 17;
const int ADD = 18;
const int DIV = 19;
const int WHERE = 20;

void binary_forward()
{
    int ndims = params.ndims;
    int dp1 = matStep[2 * ndims - 1];
    int dp2 = matStep[3 * ndims - 1];
    int dp = matStep[ndims - 1];
    int n1 = shape[ndims - 1], n2 = shape[ndims - 2];

    int plane_idx = int(gl_WorkGroupID.x);

    int ptr1 = 0;
    int ptr2 = 0;
    int ptr = 0;
    int idx = plane_idx;

    for (int k = ndims - 3; k >= 0; --k) {
        int next_idx = idx / shape[k];
        int i_k = idx - next_idx * shape[k]; // i_k = idx % shape[k]
        ptr1 += i_k * matStep[ndims + k];
        ptr2 += i_k * matStep[2 * ndims + k];
        ptr += i_k * matStep[k];
        idx = next_idx;
    }

    int i2_offset = int(gl_WorkGroupID.y);
    int i1_offset = int(gl_LocalInvocationID.x);

    ptr1 += i2_offset * matStep[2 * ndims - 2];
    ptr2 += i2_offset * matStep[3 * ndims - 2];
    ptr += i2_offset * matStep[ndims - 2];

    for (int i1 = i1_offset; i1 < n1; i1 += ALL_THREAD) {
        switch (params.opType) {
            case int(ADD):
                matOut[ptr + i1 * dp] = matA[ptr1 + i1 * dp1] + matB[ptr2 + i1 * dp2];
                break;
            case int(SUB):
                matOut[ptr + i1 * dp] = matA[ptr1 + i1 * dp1] - matB[ptr2 + i1 * dp2];
                break;
            case int(PROD):
                matOut[ptr + i1 * dp] = matA[ptr1 + i1 * dp1] * matB[ptr2 + i1 * dp2];
                break;
            case int(DIV):
                matOut[ptr + i1 * dp] = matA[ptr1 + i1 * dp1] / matB[ptr2 + i1 * dp2];
                break;
        }
    }
}


void main()
{
    // debugPrintfEXT("nary_eltwise_binary_forward.comp loaded\n");
    binary_forward();
    return;
}
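For clarity, here is a CPU re-statement (a sketch, not part of the patch) of the stride walk that binary_forward() performs for the ADD case. 'shape' has ndims entries and 'step' holds three ndims-wide rows laid out as [out | in1 | in2], in elements:

#include <vector>

void binaryAddRef(const float* A, const float* B, float* Out,
                  const std::vector<int>& shape, const std::vector<int>& step)
{
    const int ndims = (int)shape.size();
    const int n1 = shape[ndims - 1], n2 = shape[ndims - 2];
    int nplanes = 1;
    for (int k = 0; k < ndims - 2; ++k) nplanes *= shape[k];

    for (int plane = 0; plane < nplanes; ++plane) {       // gl_WorkGroupID.x
        int ptr = 0, ptr1 = 0, ptr2 = 0, idx = plane;
        for (int k = ndims - 3; k >= 0; --k) {            // unravel the plane index
            int next = idx / shape[k], i_k = idx - next * shape[k];
            ptr  += i_k * step[k];
            ptr1 += i_k * step[ndims + k];
            ptr2 += i_k * step[2 * ndims + k];
            idx = next;
        }
        for (int i2 = 0; i2 < n2; ++i2)                   // gl_WorkGroupID.y
            for (int i1 = 0; i1 < n1; ++i1)               // local invocations
                Out[ptr  + i2 * step[ndims - 2]   + i1 * step[ndims - 1]] =
                  A[ptr1 + i2 * step[2*ndims - 2] + i1 * step[2*ndims - 1]] +
                  B[ptr2 + i2 * step[3*ndims - 2] + i1 * step[3*ndims - 1]];
    }
}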
232
modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward_spv.cpp
Normal file
@ -0,0 +1,232 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "../../precomp.hpp"

namespace cv { namespace dnn { namespace vkcom {

extern const unsigned int nary_eltwise_binary_forward_spv[1757] = {
    0x07230203,0x00010000,0x0008000b,0x00000131,0x00000000,0x00020011,0x00000001,0x0006000b,
    /* ... 1757 machine-generated 32-bit SPIR-V words (the compiled form of
       nary_eltwise_binary_forward.comp above) ... */
    0x000000a3,0x000200f8,0x000000a5,0x000100fd,0x00010038
};

}}} // namespace cv::dnn::vkcom
@ -12,10 +12,11 @@ std::map<std::string, std::pair<const unsigned int *, size_t> > SPVMaps;
void initSPVMaps()
{
    SPVMaps.insert(std::make_pair("conv_1x1_fast_spv", std::make_pair(conv_1x1_fast_spv, 3134)));
    SPVMaps.insert(std::make_pair("gemm_spv", std::make_pair(gemm_spv, 2902)));
    SPVMaps.insert(std::make_pair("conv_depthwise_spv", std::make_pair(conv_depthwise_spv, 2092)));
    SPVMaps.insert(std::make_pair("conv_depthwise_3x3_spv", std::make_pair(conv_depthwise_3x3_spv, 1977)));
    SPVMaps.insert(std::make_pair("conv_implicit_gemm_spv", std::make_pair(conv_implicit_gemm_spv, 3565)));
    SPVMaps.insert(std::make_pair("conv_depthwise_spv", std::make_pair(conv_depthwise_spv, 2092)));
    SPVMaps.insert(std::make_pair("gemm_spv", std::make_pair(gemm_spv, 2902)));
    SPVMaps.insert(std::make_pair("nary_eltwise_binary_forward_spv", std::make_pair(nary_eltwise_binary_forward_spv, 1757)));
}

}}} // namespace cv::dnn::vkcom
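With the blob registered, a shader can be resolved by name at pipeline-creation time. A small lookup sketch; the map type is taken from the hunk above, and the error handling here is illustrative:

auto it = SPVMaps.find("nary_eltwise_binary_forward_spv");
CV_Assert(it != SPVMaps.end());
const unsigned int* code  = it->second.first;   // SPIR-V words
size_t              nword = it->second.second;  // 1757 for this shader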
@ -9,10 +9,11 @@
namespace cv { namespace dnn { namespace vkcom {

extern const unsigned int conv_1x1_fast_spv[3134];
extern const unsigned int gemm_spv[2902];
extern const unsigned int conv_depthwise_spv[2092];
extern const unsigned int conv_depthwise_3x3_spv[1977];
extern const unsigned int conv_implicit_gemm_spv[3565];
extern const unsigned int conv_depthwise_spv[2092];
extern const unsigned int gemm_spv[2902];
extern const unsigned int nary_eltwise_binary_forward_spv[1757];

extern std::map<std::string, std::pair<const unsigned int *, size_t> > SPVMaps;
197
modules/dnn/src/vkcom/src/op_naryEltwise.cpp
Normal file
@ -0,0 +1,197 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "../../precomp.hpp"
#include "internal.hpp"
#include "../include/op_naryeltwise.hpp"

namespace cv { namespace dnn { namespace vkcom {

#ifdef HAVE_VULKAN

#define STEP_SIZE 65536

#define MAX_GROUP_COUNT_X 65535
#define MAX_GROUP_COUNT_Y 65535
#define MAX_GROUP_COUNT_Z 65535

OpNary::OpNary(const OpNary::OPERATION _naryOpType, int _ninputs, int _max_ndims,
               const std::vector<std::vector<int>> shapes, const std::vector<std::vector<size_t>> steps)
    : naryOpType(_naryOpType), ninputs(_ninputs), max_ndims(_max_ndims)
{
    CV_Assert(ninputs > 1);

    shapesBuf.resize((ninputs + 1) * max_ndims);
    stepsBuf.resize((ninputs + 1) * max_ndims);
    for (int i = 0; i <= ninputs; i++)
    {
        std::copy(shapes[i].begin(), shapes[i].end(), shapesBuf.data() + i * max_ndims);
        std::copy(steps[i].begin(), steps[i].end(), stepsBuf.data() + i * max_ndims);
    }

    // TODO(VK): support more types of operation
    switch(naryOpType) {
        // case OPERATION::EQUAL:
        // case OPERATION::GREATER:
        // case OPERATION::GREATER_EQUAL:
        // case OPERATION::LESS:
        // case OPERATION::LESS_EQUAL:
        // case OPERATION::POW:
        // case OPERATION::BITSHIFT:
        // case OPERATION::MOD:
        case OPERATION::PROD:
        case OPERATION::SUB:
        case OPERATION::ADD:
        case OPERATION::DIV:
        // case OPERATION::AND:
        // case OPERATION::OR:
        // case OPERATION::XOR:
        {
            CV_Assert(ninputs == 2);
            CV_Assert(max_ndims >= 2);
            shaderType = kNaryShaderTypeBinary;
            shader_name = "nary_eltwise_binary_forward_spv";

            // TODO(VK): confirm if this makes any sense
            nplanes = std::accumulate(shapesBuf.data(), shapesBuf.data() + max_ndims - 2, 1, [](int32_t a, int32_t b) { return a * b; } );
            N2 = shapesBuf.data()[max_ndims - 2];
            N1 = shapesBuf.data()[max_ndims - 1];
            CV_LOG_DEBUG(NULL, "max_ndims="<<max_ndims<<", nplanes="<<nplanes<<", N2="<<N2<<", N1="<<N1);
            break;
        }
        case OPERATION::WHERE:
        {
            CV_Assert(ninputs == 3);
            CV_Assert(max_ndims >= 2);
            shaderType = kNaryShaderTypeTrinary;
            shader_name = "nary_eltwise_trinary_forward_spv";
            break;
        }
        // case OPERATION::MAX:
        // case OPERATION::MEAN:
        // case OPERATION::MIN:
        case OPERATION::SUM:
        {
            CV_Assert(max_ndims >= 2);
            shaderType = kNaryShaderTypeNary;
            shader_name = "nary_eltwise_nary_forward_spv";
            break;
        }
        //TODO(VK) add other cases
        default:
            CV_Error(Error::StsNotImplemented, "Unsupported nary operation type");
    }
    // TODO(VK): initialize OpNary class
}

void OpNary::firstForward()
{
    if (!firstForwardFinsh)
    {
        config.local_size_x = 1; // TODO(vk) determine local_size_x if necessary
        config.local_size_y = 1; // TODO(vk) determine local_size_y if necessary
        config.local_size_z = 1; // TODO(vk) determine local_size_z if necessary
        computeGroupCount();
        firstForwardFinsh = true;
    }
    else
        return;
}

bool OpNary::binaryForward(std::vector<Tensor>& ins, std::vector<Tensor>& outs)
{
    std::vector<int32_t> param = {(int32_t)naryOpType, max_ndims};
    std::vector<int32_t> paramSize = {(int32_t)param.size()};
    std::vector<int32_t> dimSizes = {(ninputs + 1) * max_ndims};
    std::vector<int32_t> actualSteps;

    // TODO(VK): compute step for different dtype. Currently this is for kFormatFp32.
    actualSteps.resize(stepsBuf.size());
    std::transform(stepsBuf.data(), stepsBuf.data() + dimSizes[0], actualSteps.begin(), [](int32_t sz){ return sz / 4; });

    Tensor paramTensor = Tensor(reinterpret_cast<const char *>(param.data()), paramSize, kFormatInt32, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);
    Tensor shapeTensor = Tensor(reinterpret_cast<const char *>(shapesBuf.data()), dimSizes, kFormatInt32, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
    Tensor stepTensor = Tensor(reinterpret_cast<const char *>(actualSteps.data()), dimSizes, kFormatInt32, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);

    destTypes = {
        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // input1
        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // input2
        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // out
        VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, // param
        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // shape
        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // step
    };

    Ptr<Pipeline> pipeline = pipelineFactoryPtr->getPipeline(shader_name, destTypes);
    Ptr<CommandBuffer> cmdBuffer = cmdPoolPtr->allocBuffer();
    Ptr<Descriptor> desSet = pipeline->createSet();
    VkCommandBuffer cmdBufferReal = cmdBuffer->get();

    desSet->writeTensor(ins[0], 0);
    desSet->writeTensor(ins[1], 1);
    desSet->writeTensor(outs[0], 2);
    desSet->writeTensor(paramTensor, 3);
    desSet->writeTensor(shapeTensor, 4);
    desSet->writeTensor(stepTensor, 5);

    cmdBuffer->beginRecord();
    pipeline->bind(cmdBufferReal, desSet->get());
    vkCmdDispatch(cmdBufferReal, group_x_, group_y_, group_z_);
    cmdBuffer->endRecord();
    cmdPoolPtr->submitAndWait(cmdBufferReal);

    return true;
}

bool OpNary::forward(std::vector<Tensor>& ins, std::vector<Tensor>& outs)
{

    firstForward();

    // TODO(VK): Support more dtypes. Currently only kFormatFp32 is supported.
    for (auto &tensor: ins)
    {
        CV_Assert(tensor.getFormat() == kFormatFp32);
    }
    for (auto &tensor: outs)
    {
        CV_Assert(tensor.getFormat() == kFormatFp32);
    }

    switch(shaderType) {
        case kNaryShaderTypeBinary: {
            return binaryForward(ins, outs);
            break;
        }
        default:
            CV_Error(Error::StsNotImplemented, "Unsupported shader type invoked.");
    }

    return true;
}

bool OpNary::computeGroupCount()
{
    if (shaderType == kNaryShaderTypeBinary)
    {
        group_x_ = nplanes; // parallelism at plane level
        group_y_ = N2;
        group_z_ = 1;
    }
    else
    {
        CV_Error(Error::StsNotImplemented, "shader type is not supported at compute GroupCount.");
    }

    CV_Assert(group_x_ <= MAX_GROUP_COUNT_X);
    CV_Assert(group_y_ <= MAX_GROUP_COUNT_Y);
    CV_Assert(group_z_ <= MAX_GROUP_COUNT_Z);

    return true;
}

#endif // HAVE_VULKAN

}}} // namespace cv::dnn::vkcom
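To make the dispatch geometry concrete: computeGroupCount() pairs with the shader so that each plane gets one work group on X and each row of the second-to-last dimension one group on Y, while the 1024 local invocations stride over the innermost dimension. A small illustrative calculation (shape values are made up):

// For an NCHW-like shape {2, 3, 4, 8}:
int shape[] = {2, 3, 4, 8};
int nplanes = shape[0] * shape[1];   // product of all dims but the last two = 6
int group_x = nplanes;               // 6 work groups on X
int group_y = shape[2];              // N2 == 4 groups on Y
int group_z = 1;                     // the shader loops over N1 == 8 internally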
@ -1613,7 +1613,7 @@ public:
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        if (inputs_arr.depth() == CV_16S)
        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
@ -79,6 +79,7 @@ set(gapi_srcs
    src/api/gframe.cpp
    src/api/gkernel.cpp
    src/api/gbackend.cpp
    src/api/gcommon.cpp
    src/api/gproto.cpp
    src/api/gnode.cpp
    src/api/gcall.cpp
@ -121,8 +122,10 @@ set(gapi_srcs
    src/executor/gabstractstreamingexecutor.cpp
    src/executor/gexecutor.cpp
    src/executor/gtbbexecutor.cpp
    src/executor/gthreadedexecutor.cpp
    src/executor/gstreamingexecutor.cpp
    src/executor/gasync.cpp
    src/executor/thread_pool.cpp

    # CPU Backend (currently built-in)
    src/backends/cpu/gcpubackend.cpp
@ -263,12 +263,32 @@ struct graph_dump_path
};
/** @} */

/**
 * @brief Ask G-API to use the threaded executor when a cv::GComputation
 * is compiled via the cv::GComputation::compile method.
 *
 * Specifies the number of threads that should be used by the executor.
 */
struct GAPI_EXPORTS use_threaded_executor
{
    use_threaded_executor();
    explicit use_threaded_executor(const uint32_t nthreads);

    uint32_t num_threads;
};
/** @} */

namespace detail
{
    template<> struct CompileArgTag<cv::graph_dump_path>
    {
        static const char* tag() { return "gapi.graph_dump_path"; }
    };

    template<> struct CompileArgTag<cv::use_threaded_executor>
    {
        static const char* tag() { return "gapi.threaded_executor"; }
    };
}

} // namespace cv
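Like any other G-API compile argument, use_threaded_executor is passed through cv::compile_args. A minimal usage sketch; the graph and input description here are illustrative:

#include <opencv2/gapi.hpp>
#include <opencv2/gapi/core.hpp>

int main()
{
    cv::GMat in;
    cv::GComputation cc(in, cv::gapi::add(in, in));
    // Request the threaded executor with 4 worker threads at compile time
    auto compiled = cc.compile(cv::GMatDesc{CV_8U, 1, cv::Size(64, 64)},
                               cv::compile_args(cv::use_threaded_executor{4}));
    return 0;
}

The default constructor falls back to cv::getNumThreads(), as the gcommon.cpp implementation below shows.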
18
modules/gapi/src/api/gcommon.cpp
Normal file
@ -0,0 +1,18 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2024 Intel Corporation

#include "precomp.hpp"

#include <opencv2/gapi/gcommon.hpp>
#include <opencv2/core/utility.hpp>

cv::use_threaded_executor::use_threaded_executor()
    : num_threads(cv::getNumThreads()) {
}

cv::use_threaded_executor::use_threaded_executor(const uint32_t nthreads)
    : num_threads(nthreads) {
}
@ -33,6 +33,7 @@
#include "compiler/passes/pattern_matching.hpp"

#include "executor/gexecutor.hpp"
#include "executor/gthreadedexecutor.hpp"
#include "executor/gstreamingexecutor.hpp"
#include "backends/common/gbackend.hpp"
#include "backends/common/gmetabackend.hpp"
@ -452,8 +453,16 @@ cv::GCompiled cv::gimpl::GCompiler::produceCompiled(GPtr &&pg)
                         .get<OutputMeta>().outMeta;
    // FIXME: select which executor will be actually used,
    // make GExecutor abstract.
    std::unique_ptr<GExecutor> pE(new GExecutor(std::move(pg)));

    auto use_threaded_exec = cv::gapi::getCompileArg<cv::use_threaded_executor>(m_args);
    std::unique_ptr<GAbstractExecutor> pE;
    if (use_threaded_exec) {
        const auto num_threads = use_threaded_exec.value().num_threads;
        GAPI_LOG_INFO(NULL, "Threaded executor with " << num_threads << " thread(s) will be used");
        pE.reset(new GThreadedExecutor(num_threads, std::move(pg)));
    } else {
        pE.reset(new GExecutor(std::move(pg)));
    }
    GCompiled compiled;
    compiled.priv().setup(m_metas, outMetas, std::move(pE));
511
modules/gapi/src/executor/gthreadedexecutor.cpp
Normal file
@ -0,0 +1,511 @@
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
//
|
||||
// Copyright (C) 2024 Intel Corporation
|
||||
|
||||
|
||||
#include "precomp.hpp"
|
||||
|
||||
#include <ade/util/zip_range.hpp>
|
||||
|
||||
#include <opencv2/gapi/opencv_includes.hpp>
|
||||
|
||||
#include "api/gproto_priv.hpp" // ptr(GRunArgP)
|
||||
#include "executor/gthreadedexecutor.hpp"
|
||||
#include "compiler/passes/passes.hpp"
|
||||
|
||||
namespace cv {
|
||||
namespace gimpl {
|
||||
namespace magazine {
|
||||
namespace {
|
||||
|
||||
void bindInArgExec(Mag& mag, const RcDesc &rc, const GRunArg &arg) {
|
||||
if (rc.shape != GShape::GMAT) {
|
||||
bindInArg(mag, rc, arg);
|
||||
return;
|
||||
}
|
||||
auto& mag_rmat = mag.template slot<cv::RMat>()[rc.id];
|
||||
switch (arg.index()) {
|
||||
case GRunArg::index_of<Mat>() :
|
||||
mag_rmat = make_rmat<RMatOnMat>(util::get<Mat>(arg));
|
||||
break;
|
||||
case GRunArg::index_of<cv::RMat>() :
|
||||
mag_rmat = util::get<cv::RMat>(arg);
|
||||
break;
|
||||
default: util::throw_error(std::logic_error("content type of the runtime argument does not match to resource description ?"));
|
||||
}
|
||||
// FIXME: has to take extra care about meta here for this particuluar
|
||||
// case, just because this function exists at all
|
||||
mag.meta<cv::RMat>()[rc.id] = arg.meta;
|
||||
}
|
||||
|
||||
void bindOutArgExec(Mag& mag, const RcDesc &rc, const GRunArgP &arg) {
|
||||
if (rc.shape != GShape::GMAT) {
|
||||
bindOutArg(mag, rc, arg);
|
||||
return;
|
||||
}
|
||||
auto& mag_rmat = mag.template slot<cv::RMat>()[rc.id];
|
||||
switch (arg.index()) {
|
||||
case GRunArgP::index_of<Mat*>() :
|
||||
mag_rmat = make_rmat<RMatOnMat>(*util::get<Mat*>(arg)); break;
|
||||
case GRunArgP::index_of<cv::RMat*>() :
|
||||
mag_rmat = *util::get<cv::RMat*>(arg); break;
|
||||
default: util::throw_error(std::logic_error("content type of the runtime argument does not match to resource description ?"));
|
||||
}
|
||||
}
|
||||
|
||||
cv::GRunArgP getObjPtrExec(Mag& mag, const RcDesc &rc) {
|
||||
if (rc.shape != GShape::GMAT) {
|
||||
return getObjPtr(mag, rc);
|
||||
}
|
||||
return GRunArgP(&mag.slot<cv::RMat>()[rc.id]);
|
||||
}
|
||||
|
||||
void writeBackExec(const Mag& mag, const RcDesc &rc, GRunArgP &g_arg) {
|
||||
if (rc.shape != GShape::GMAT) {
|
||||
writeBack(mag, rc, g_arg);
|
||||
return;
|
||||
}
|
||||
|
||||
switch (g_arg.index()) {
|
||||
case GRunArgP::index_of<cv::Mat*>() : {
|
||||
// If there is a copy intrinsic at the end of the graph
|
||||
// we need to actually copy the data to the user buffer
|
||||
// since output runarg was optimized to simply point
|
||||
// to the input of the copy kernel
|
||||
// FIXME:
|
||||
// Rework, find a better way to check if there should be
|
||||
// a real copy (add a pass to StreamingBackend?)
|
||||
// NB: In case RMat adapter not equal to "RMatOnMat" need to
|
||||
// copy data back to the host as well.
|
||||
auto& out_mat = *util::get<cv::Mat*>(g_arg);
|
||||
const auto& rmat = mag.template slot<cv::RMat>().at(rc.id);
|
||||
auto* adapter = rmat.get<RMatOnMat>();
|
||||
if ((adapter != nullptr && out_mat.data != adapter->data()) ||
|
||||
(adapter == nullptr)) {
|
||||
auto view = rmat.access(RMat::Access::R);
|
||||
asMat(view).copyTo(out_mat);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case GRunArgP::index_of<cv::RMat*>() : /* do nothing */ break;
|
||||
default: util::throw_error(std::logic_error("content type of the runtime argument does not match to resource description ?"));
|
||||
}
|
||||
}
|
||||
|
||||
void assignMetaStubExec(Mag& mag, const RcDesc &rc, const cv::GRunArg::Meta &meta) {
|
||||
switch (rc.shape) {
|
||||
case GShape::GARRAY: mag.meta<cv::detail::VectorRef>()[rc.id] = meta; break;
|
||||
case GShape::GOPAQUE: mag.meta<cv::detail::OpaqueRef>()[rc.id] = meta; break;
|
||||
case GShape::GSCALAR: mag.meta<cv::Scalar>()[rc.id] = meta; break;
|
||||
case GShape::GFRAME: mag.meta<cv::MediaFrame>()[rc.id] = meta; break;
|
||||
case GShape::GMAT:
|
||||
mag.meta<cv::Mat>() [rc.id] = meta;
|
||||
mag.meta<cv::RMat>()[rc.id] = meta;
|
||||
#if !defined(GAPI_STANDALONE)
|
||||
mag.meta<cv::UMat>()[rc.id] = meta;
|
||||
#endif
|
||||
break;
|
||||
default: util::throw_error(std::logic_error("Unsupported GShape type")); break;
|
||||
}
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
}}} // namespace cv::gimpl::magazine
|
||||
|
||||
cv::gimpl::StreamMsg cv::gimpl::GThreadedExecutor::Input::get() {
|
||||
std::lock_guard<std::mutex> lock{m_state.m};
|
||||
cv::GRunArgs res;
|
||||
for (const auto &rc : desc()) { res.emplace_back(magazine::getArg(m_state.mag, rc)); }
|
||||
return cv::gimpl::StreamMsg{std::move(res)};
|
||||
}
|
||||
|
||||
cv::gimpl::GThreadedExecutor::Input::Input(cv::gimpl::GraphState &state,
|
||||
const std::vector<RcDesc> &rcs)
|
||||
: m_state(state) {
|
||||
set(rcs);
|
||||
};
|
||||
|
||||
cv::GRunArgP cv::gimpl::GThreadedExecutor::Output::get(int idx) {
|
||||
std::lock_guard<std::mutex> lock{m_state.m};
|
||||
auto r = magazine::getObjPtrExec(m_state.mag, desc()[idx]);
|
||||
// Remember the output port for this output object
|
||||
m_out_idx[cv::gimpl::proto::ptr(r)] = idx;
|
||||
return r;
|
||||
}
|
||||
|
||||
void cv::gimpl::GThreadedExecutor::Output::post(cv::GRunArgP&&, const std::exception_ptr& e) {
|
||||
if (e) {
|
||||
m_eptr = e;
|
||||
}
|
||||
}
|
||||
|
||||
void cv::gimpl::GThreadedExecutor::Output::post(Exception&& ex) {
|
||||
m_eptr = std::move(ex.eptr);
|
||||
}
|
||||
|
||||
void cv::gimpl::GThreadedExecutor::Output::meta(const GRunArgP &out, const GRunArg::Meta &m) {
|
||||
const auto idx = m_out_idx.at(cv::gimpl::proto::ptr(out));
|
||||
std::lock_guard<std::mutex> lock{m_state.m};
|
||||
magazine::assignMetaStubExec(m_state.mag, desc()[idx], m);
|
||||
}
|
||||
|
||||
cv::gimpl::GThreadedExecutor::Output::Output(cv::gimpl::GraphState &state,
|
||||
const std::vector<RcDesc> &rcs)
|
||||
: m_state(state) {
|
||||
set(rcs);
|
||||
}
|
||||
|
||||
void cv::gimpl::GThreadedExecutor::Output::verify() {
|
||||
if (m_eptr) {
|
||||
std::rethrow_exception(m_eptr);
|
||||
}
|
||||
}
|
||||
|
||||
void cv::gimpl::GThreadedExecutor::initResource(const ade::NodeHandle &nh, const ade::NodeHandle &orig_nh) {
|
||||
const Data &d = m_gm.metadata(orig_nh).get<Data>();
|
||||
|
||||
if ( d.storage != Data::Storage::INTERNAL
|
||||
&& d.storage != Data::Storage::CONST_VAL) {
|
||||
return;
|
||||
}
|
||||
|
||||
// INTERNALS+CONST only! no need to allocate/reset output objects
|
||||
// to as it is bound externally (e.g. already in the m_state.mag)
|
||||
|
||||
switch (d.shape) {
|
||||
case GShape::GMAT: {
|
||||
// Let island allocate it's outputs if it can,
|
||||
// allocate cv::Mat and wrap it with RMat otherwise
|
||||
GAPI_Assert(!nh->inNodes().empty());
|
||||
const auto desc = util::get<cv::GMatDesc>(d.meta);
|
||||
auto& exec = m_gim.metadata(nh->inNodes().front()).get<IslandExec>().object;
|
||||
auto& rmat = m_state.mag.slot<cv::RMat>()[d.rc];
|
||||
if (exec->allocatesOutputs()) {
|
||||
rmat = exec->allocate(desc);
|
||||
} else {
|
||||
Mat mat;
|
||||
createMat(desc, mat);
|
||||
rmat = make_rmat<RMatOnMat>(mat);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case GShape::GSCALAR:
|
||||
if (d.storage == Data::Storage::CONST_VAL) {
|
||||
auto rc = RcDesc{d.rc, d.shape, d.ctor};
|
||||
magazine::bindInArg(m_state.mag, rc, m_gm.metadata(orig_nh).get<ConstValue>().arg);
|
||||
}
|
||||
break;
|
||||
|
||||
case GShape::GARRAY:
|
||||
if (d.storage == Data::Storage::CONST_VAL) {
|
||||
auto rc = RcDesc{d.rc, d.shape, d.ctor};
|
||||
magazine::bindInArg(m_state.mag, rc, m_gm.metadata(orig_nh).get<ConstValue>().arg);
|
||||
}
|
||||
break;
|
||||
case GShape::GOPAQUE:
|
||||
// Constructed on Reset, do nothing here
|
||||
break;
|
||||
case GShape::GFRAME: {
|
||||
// Should be defined by backend, do nothing here
|
||||
break;
|
||||
}
|
||||
default:
|
||||
GAPI_Error("InternalError");
|
||||
}
|
||||
}
|
||||
|
||||
cv::gimpl::IslandActor::IslandActor(const std::vector<RcDesc> &in_objects,
|
||||
const std::vector<RcDesc> &out_objects,
|
||||
std::shared_ptr<GIslandExecutable> isl_exec,
|
||||
cv::gimpl::GraphState &state)
|
||||
: m_isl_exec(isl_exec),
|
||||
m_inputs(state, in_objects),
|
||||
m_outputs(state, out_objects) {
|
||||
}
|
||||
|
||||
void cv::gimpl::IslandActor::run() {
|
||||
m_isl_exec->run(m_inputs, m_outputs);
|
||||
}
|
||||
|
||||
void cv::gimpl::IslandActor::verify() {
|
||||
m_outputs.verify();
|
||||
};

class cv::gimpl::Task {
    friend class TaskManager;
public:
    using Ptr = std::shared_ptr<Task>;
    Task(TaskManager::F&& f, std::vector<Task::Ptr> &&producers);

    struct ExecutionState {
        cv::gapi::own::ThreadPool& tp;
        cv::gapi::own::Latch& latch;
    };

    void run(ExecutionState& state);
    bool isLast() const { return m_consumers.empty(); }
    void reset() { m_ready_producers.store(0u); }

private:
    TaskManager::F m_f;
    const uint32_t m_num_producers;
    std::atomic<uint32_t> m_ready_producers;
    std::vector<Task*> m_consumers;
};

cv::gimpl::Task::Task(TaskManager::F &&f,
                      std::vector<Task::Ptr> &&producers)
    : m_f(std::move(f)),
      m_num_producers(static_cast<uint32_t>(producers.size())) {
    for (auto producer : producers) {
        producer->m_consumers.push_back(this);
    }
}

void cv::gimpl::Task::run(ExecutionState& state) {
    // Execute the task
    m_f();
    // Notify every consumer that one of its dependencies has completed
    for (auto* consumer : m_consumers) {
        const auto num_ready =
            consumer->m_ready_producers.fetch_add(1, std::memory_order_relaxed) + 1;
        // The last completed producer schedules the consumer for execution
        if (num_ready == consumer->m_num_producers) {
            state.tp.schedule([&state, consumer](){
                consumer->run(state);
            });
        }
    }
    // A task with no consumers is a "last" task. Execution lasts until
    // all last tasks are completed, so decrement the latch to report
    // this task's completion
    if (isLast()) {
        state.latch.count_down();
    }
}

std::shared_ptr<cv::gimpl::Task>
cv::gimpl::TaskManager::createTask(cv::gimpl::TaskManager::F &&f,
                                   std::vector<std::shared_ptr<cv::gimpl::Task>> &&producers) {
    const bool is_initial = producers.empty();
    auto task = std::make_shared<cv::gimpl::Task>(std::move(f),
                                                  std::move(producers));
    m_all_tasks.emplace_back(task);
    if (is_initial) {
        m_initial_tasks.emplace_back(task);
    }
    return task;
}

void cv::gimpl::TaskManager::scheduleAndWait(cv::gapi::own::ThreadPool& tp) {
    // Reset the number of ready dependencies for all tasks
    for (auto& task : m_all_tasks) { task->reset(); }

    // Count the number of last tasks
    auto isLast = [](const std::shared_ptr<Task>& task) { return task->isLast(); };
    const auto kNumLastTasks =
        std::count_if(m_all_tasks.begin(), m_all_tasks.end(), isLast);

    // Initialize the latch, schedule the initial tasks,
    // and wait until all last tasks are done
    cv::gapi::own::Latch latch(kNumLastTasks);
    Task::ExecutionState state{tp, latch};
    for (auto task : m_initial_tasks) {
        state.tp.schedule([&state, task](){ task->run(state); });
    }
    latch.wait();
}

cv::gimpl::GThreadedExecutor::GThreadedExecutor(const uint32_t num_threads,
                                                std::unique_ptr<ade::Graph> &&g_model)
    : GAbstractExecutor(std::move(g_model)),
      m_thread_pool(num_threads) {
    auto sorted = m_gim.metadata().get<ade::passes::TopologicalSortData>();

    std::unordered_map< ade::NodeHandle
                      , std::shared_ptr<Task>
                      , ade::HandleHasher<ade::Node>> m_tasks_map;
    for (auto nh : sorted.nodes())
    {
        switch (m_gim.metadata(nh).get<NodeKind>().k)
        {
            case NodeKind::ISLAND:
            {
                std::vector<RcDesc> input_rcs;
                std::vector<RcDesc> output_rcs;
                input_rcs.reserve(nh->inNodes().size());
                output_rcs.reserve(nh->outNodes().size());

                auto xtract = [&](ade::NodeHandle slot_nh, std::vector<RcDesc> &vec) {
                    const auto orig_data_nh
                        = m_gim.metadata(slot_nh).get<DataSlot>().original_data_node;
                    const auto &orig_data_info
                        = m_gm.metadata(orig_data_nh).get<Data>();
                    vec.emplace_back(RcDesc{ orig_data_info.rc
                                           , orig_data_info.shape
                                           , orig_data_info.ctor});
                };
                for (auto in_slot_nh  : nh->inNodes())  xtract(in_slot_nh, input_rcs);
                for (auto out_slot_nh : nh->outNodes()) xtract(out_slot_nh, output_rcs);

                auto actor = std::make_shared<IslandActor>(std::move(input_rcs),
                                                           std::move(output_rcs),
                                                           m_gim.metadata(nh).get<IslandExec>().object,
                                                           m_state);
                m_actors.push_back(actor);

                std::unordered_set<ade::NodeHandle, ade::HandleHasher<ade::Node>> producer_nhs;
                for (auto slot_nh : nh->inNodes()) {
                    for (auto island_nh : slot_nh->inNodes()) {
                        GAPI_Assert(m_gim.metadata(island_nh).get<NodeKind>().k == NodeKind::ISLAND);
                        producer_nhs.emplace(island_nh);
                    }
                }
                std::vector<std::shared_ptr<Task>> producers;
                producers.reserve(producer_nhs.size());
                for (auto producer_nh : producer_nhs) {
                    producers.push_back(m_tasks_map.at(producer_nh));
                }
                auto task = m_task_manager.createTask(
                    [actor](){actor->run();}, std::move(producers));
                m_tasks_map.emplace(nh, task);
            }
            break;

            case NodeKind::SLOT:
            {
                const auto orig_data_nh
                    = m_gim.metadata(nh).get<DataSlot>().original_data_node;
                initResource(nh, orig_data_nh);
                m_slots.emplace_back(DataDesc{nh, orig_data_nh});
            }
            break;

            default:
                GAPI_Error("InternalError");
                break;
        } // switch(kind)
    } // for(gim nodes)

    prepareForNewStream();
}

void cv::gimpl::GThreadedExecutor::run(cv::gimpl::GRuntimeArgs &&args) {
    const auto proto = m_gm.metadata().get<Protocol>();

    // Basic check if input/output arguments are correct
    // FIXME: Move to GCompiled (do once for all GExecutors)
    if (proto.inputs.size() != args.inObjs.size()) { // TODO: Also check types
        util::throw_error(std::logic_error
                          ("Computation's input protocol doesn't "
                           "match actual arguments!"));
    }
    if (proto.outputs.size() != args.outObjs.size()) { // TODO: Also check types
        util::throw_error(std::logic_error
                          ("Computation's output protocol doesn't "
                           "match actual arguments!"));
    }

    namespace util = ade::util;

    // Ensure that the output Mat parameters are correctly allocated
    // FIXME: avoid copy of NodeHandle and GRunRsltComp ?
    for (auto index : util::iota(proto.out_nhs.size())) {
        auto& nh = proto.out_nhs.at(index);
        const Data &d = m_gm.metadata(nh).get<Data>();
        if (d.shape == GShape::GMAT) {
            using cv::util::get;
            const auto desc = get<cv::GMatDesc>(d.meta);

            auto check_rmat = [&desc, &args, &index]() {
                auto& out_mat = *get<cv::RMat*>(args.outObjs.at(index));
                GAPI_Assert(desc.canDescribe(out_mat));
            };

#if !defined(GAPI_STANDALONE)
            // Building as part of OpenCV - follow OpenCV behavior: in
            // the case of cv::Mat, if the output buffer is not enough to
            // hold the result, reallocate it
            if (cv::util::holds_alternative<cv::Mat*>(args.outObjs.at(index))) {
                auto& out_mat = *get<cv::Mat*>(args.outObjs.at(index));
                createMat(desc, out_mat);
            }
            // In the case of RMat, check that it fits the required meta
            else {
                check_rmat();
            }
#else
            // Building standalone - the output buffer should always exist,
            // and _exactly_ match our inferred metadata
            if (cv::util::holds_alternative<cv::Mat*>(args.outObjs.at(index))) {
                auto& out_mat = *get<cv::Mat*>(args.outObjs.at(index));
                GAPI_Assert(out_mat.data != nullptr &&
                            desc.canDescribe(out_mat));
            }
            // In the case of RMat, check that it fits the required meta
            else {
                check_rmat();
            }
#endif // !defined(GAPI_STANDALONE)
        }
    }
    // Update storage with user-passed objects
    for (auto it : ade::util::zip(ade::util::toRange(proto.inputs),
                                  ade::util::toRange(args.inObjs))) {
        magazine::bindInArgExec(m_state.mag, std::get<0>(it), std::get<1>(it));
    }
    for (auto it : ade::util::zip(ade::util::toRange(proto.outputs),
                                  ade::util::toRange(args.outObjs))) {
        magazine::bindOutArgExec(m_state.mag, std::get<0>(it), std::get<1>(it));
    }

    // Reset internal data
    for (auto &sd : m_slots) {
        const auto& data = m_gm.metadata(sd.data_nh).get<Data>();
        magazine::resetInternalData(m_state.mag, data);
    }

    m_task_manager.scheduleAndWait(m_thread_pool);
    for (auto actor : m_actors) {
        actor->verify();
    }
    for (auto it : ade::util::zip(ade::util::toRange(proto.outputs),
                                  ade::util::toRange(args.outObjs))) {
        magazine::writeBackExec(m_state.mag, std::get<0>(it), std::get<1>(it));
    }
}

bool cv::gimpl::GThreadedExecutor::canReshape() const {
    for (auto actor : m_actors) {
        // NB: reshape is possible only if every island supports it
        if (!actor->exec()->canReshape()) {
            return false;
        }
    }
    return true;
}

void cv::gimpl::GThreadedExecutor::reshape(const GMetaArgs& inMetas, const GCompileArgs& args) {
    GAPI_Assert(canReshape());
    auto& g = *m_orig_graph.get();
    ade::passes::PassContext ctx{g};
    passes::initMeta(ctx, inMetas);
    passes::inferMeta(ctx, true);

    // NB: Before reshaping the islands, re-init the resources for every slot.
    for (auto slot : m_slots) {
        initResource(slot.slot_nh, slot.data_nh);
    }

    for (auto actor : m_actors) {
        actor->exec()->reshape(g, args);
    }
}

void cv::gimpl::GThreadedExecutor::prepareForNewStream() {
    for (auto actor : m_actors) {
        actor->exec()->handleNewStream();
    }
}
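
The scheduling core above is a classic dependency-counting pattern: every task
stores how many producers it has, and the producer that finishes last schedules
the consumer. Below is a minimal self-contained sketch of that pattern; the
Node type, its fields, and the recursive hand-off are illustrative only (the
real executor hands consumers to the thread pool instead of recursing).

#include <atomic>
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

struct Node {
    std::function<void()> body;
    uint32_t              num_producers = 0;
    std::atomic<uint32_t> ready{0};
    std::vector<Node*>    consumers;
};

// Run a node, then hand over every consumer whose producers are all done
static void run_node(Node& n) {
    n.body();
    for (auto* c : n.consumers) {
        if (c->ready.fetch_add(1, std::memory_order_relaxed) + 1 == c->num_producers) {
            run_node(*c);
        }
    }
}

int main() {
    Node a, b, c;
    a.body = []{ std::cout << "A\n"; };
    b.body = []{ std::cout << "B\n"; };
    c.body = []{ std::cout << "C (after A and B)\n"; };
    a.consumers = {&c};
    b.consumers = {&c};
    c.num_producers = 2;
    run_node(a); // initial tasks have no producers
    run_node(b); // the second finisher triggers C
}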

modules/gapi/src/executor/gthreadedexecutor.hpp (new file, 123 lines)
@ -0,0 +1,123 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2024 Intel Corporation


#ifndef OPENCV_GAPI_GTHREADEDEXECUTOR_HPP
#define OPENCV_GAPI_GTHREADEDEXECUTOR_HPP

#include <utility>       // tuple, required by magazine
#include <unordered_map> // required by magazine

#include "executor/gabstractexecutor.hpp"
#include "executor/thread_pool.hpp"

namespace cv {
namespace gimpl {

class Task;
class TaskManager {
public:
    using F = std::function<void()>;

    std::shared_ptr<Task> createTask(F &&f, std::vector<std::shared_ptr<Task>> &&producers);
    void scheduleAndWait(cv::gapi::own::ThreadPool& tp);

private:
    std::vector<std::shared_ptr<Task>> m_all_tasks;
    std::vector<std::shared_ptr<Task>> m_initial_tasks;
};

struct GraphState {
    Mag mag;
    std::mutex m;
};

class IslandActor;
class GThreadedExecutor final: public GAbstractExecutor {
public:
    class Input;
    class Output;

    explicit GThreadedExecutor(const uint32_t num_threads,
                               std::unique_ptr<ade::Graph> &&g_model);
    void run(cv::gimpl::GRuntimeArgs &&args) override;

    bool canReshape() const override;
    void reshape(const GMetaArgs& inMetas, const GCompileArgs& args) override;

    void prepareForNewStream() override;

private:
    struct DataDesc
    {
        ade::NodeHandle slot_nh;
        ade::NodeHandle data_nh;
    };

    void initResource(const ade::NodeHandle &nh, const ade::NodeHandle &orig_nh);

    GraphState m_state;
    std::vector<DataDesc> m_slots;
    cv::gapi::own::ThreadPool m_thread_pool;
    TaskManager m_task_manager;
    std::vector<std::shared_ptr<IslandActor>> m_actors;
};

class GThreadedExecutor::Input final: public GIslandExecutable::IInput
{
public:
    Input(GraphState& state, const std::vector<RcDesc> &rcs);

private:
    virtual StreamMsg get() override;
    virtual StreamMsg try_get() override { return get(); }

private:
    GraphState& m_state;
};

class GThreadedExecutor::Output final: public GIslandExecutable::IOutput
{
public:
    Output(GraphState &state, const std::vector<RcDesc> &rcs);
    void verify();

private:
    GRunArgP get(int idx) override;
    void post(cv::GRunArgP&&, const std::exception_ptr& e) override;
    void post(Exception&& ex) override;
    void post(EndOfStream&&) override {}
    void meta(const GRunArgP &out, const GRunArg::Meta &m) override;

private:
    GraphState& m_state;
    std::unordered_map<const void*, int> m_out_idx;
    std::exception_ptr m_eptr;
};

class IslandActor {
public:
    using Ptr = std::shared_ptr<IslandActor>;
    IslandActor(const std::vector<RcDesc> &in_objects,
                const std::vector<RcDesc> &out_objects,
                std::shared_ptr<GIslandExecutable> isl_exec,
                GraphState &state);

    void run();
    void verify();
    std::shared_ptr<GIslandExecutable> exec() { return m_isl_exec; }

private:
    std::shared_ptr<GIslandExecutable> m_isl_exec;
    GThreadedExecutor::Input m_inputs;
    GThreadedExecutor::Output m_outputs;
};


} // namespace gimpl
} // namespace cv

#endif // OPENCV_GAPI_GTHREADEDEXECUTOR_HPP

modules/gapi/src/executor/thread_pool.cpp (new file, 67 lines)
@ -0,0 +1,67 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2024 Intel Corporation


#include "thread_pool.hpp"

#include <opencv2/gapi/util/throw.hpp>

cv::gapi::own::Latch::Latch(const uint64_t expected)
    : m_expected(expected) {
}

void cv::gapi::own::Latch::count_down() {
    std::lock_guard<std::mutex> lk{m_mutex};
    --m_expected;
    if (m_expected == 0) {
        m_all_done.notify_all();
    }
}

void cv::gapi::own::Latch::wait() {
    std::unique_lock<std::mutex> lk{m_mutex};
    while (m_expected != 0u) {
        m_all_done.wait(lk);
    }
}

cv::gapi::own::ThreadPool::ThreadPool(const uint32_t num_workers) {
    m_workers.reserve(num_workers);
    for (uint32_t i = 0; i < num_workers; ++i) {
        m_workers.emplace_back(
            cv::gapi::own::ThreadPool::worker, std::ref(m_queue));
    }
}

void cv::gapi::own::ThreadPool::worker(QueueClass<Task>& queue) {
    while (true) {
        cv::gapi::own::ThreadPool::Task task;
        queue.pop(task);
        if (!task) {
            break;
        }
        task();
    }
}

void cv::gapi::own::ThreadPool::schedule(cv::gapi::own::ThreadPool::Task&& task) {
    m_queue.push(std::move(task));
}

void cv::gapi::own::ThreadPool::shutdown() {
    for (size_t i = 0; i < m_workers.size(); ++i) {
        // NB: An empty task is an indicator for workers to stop their loops
        m_queue.push({});
    }
    for (auto& worker : m_workers) {
        worker.join();
    }
    m_workers.clear();
}

cv::gapi::own::ThreadPool::~ThreadPool() {
    shutdown();
}
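
The pool above is fire-and-forget: schedule() only enqueues work, so completion
has to be observed externally, e.g. through the Latch. A minimal usage sketch
against these internal, test-only classes (this is not public OpenCV API, and
the include path assumes you are inside the G-API build tree):

#include <atomic>
#include <cstdint>
#include "executor/thread_pool.hpp"

int main() {
    std::atomic<uint32_t> done{0u};
    cv::gapi::own::Latch latch(4u);
    cv::gapi::own::ThreadPool tp(4u);
    for (uint32_t i = 0; i < 4u; ++i) {
        tp.schedule([&]{ ++done; latch.count_down(); });
    }
    latch.wait();              // blocks until all four tasks have run
    return done == 4u ? 0 : 1; // ~ThreadPool() joins the workers on scope exit
}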

modules/gapi/src/executor/thread_pool.hpp (new file, 71 lines)
@ -0,0 +1,71 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2024 Intel Corporation

#ifndef OPENCV_GAPI_THREAD_POOL_HPP
#define OPENCV_GAPI_THREAD_POOL_HPP

#include <functional>
#include <vector>
#include <thread>
#include <mutex>
#include <atomic>
#include <condition_variable>

#include <opencv2/gapi/own/exports.hpp> // GAPI_EXPORTS

#if defined(HAVE_TBB)
#  include <tbb/concurrent_queue.h> // FIXME: drop it from here!
template<typename T> using QueueClass = tbb::concurrent_bounded_queue<T>;
#else
#  include "executor/conc_queue.hpp"
template<typename T> using QueueClass = cv::gapi::own::concurrent_bounded_queue<T>;
#endif // HAVE_TBB

namespace cv {
namespace gapi {
namespace own {

// NB: Exported only for tests
class GAPI_EXPORTS Latch {
public:
    explicit Latch(const uint64_t expected);

    Latch(const Latch&) = delete;
    Latch& operator=(const Latch&) = delete;

    void count_down();
    void wait();

private:
    uint64_t m_expected;
    std::mutex m_mutex;
    std::condition_variable m_all_done;
};

// NB: Exported only for tests
class GAPI_EXPORTS ThreadPool {
public:
    using Task = std::function<void()>;
    explicit ThreadPool(const uint32_t num_workers);

    ThreadPool(const ThreadPool&) = delete;
    ThreadPool& operator=(const ThreadPool&) = delete;

    void schedule(Task&& task);
    ~ThreadPool();

private:
    static void worker(QueueClass<Task>& queue);
    void shutdown();

private:
    std::vector<std::thread> m_workers;
    QueueClass<Task> m_queue;
};

}}} // namespace cv::gapi::own

#endif // OPENCV_GAPI_THREAD_POOL_HPP
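
One detail worth noting: shutdown() pushes one default-constructed Task per
worker, and a default-constructed std::function holds no target and converts
to false, so the empty task doubles as a per-worker stop sentinel ("poison
pill"). A standalone illustration of that property:

#include <functional>
#include <iostream>

int main() {
    std::function<void()> task; // default-constructed: holds no target
    std::cout << std::boolalpha
              << static_cast<bool>(task) << "\n"; // false -> worker breaks its loop
    task = []{ std::cout << "work\n"; };
    if (task) task();                             // true -> worker executes it
}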

@ -13,6 +13,8 @@

#include <opencv2/gapi/core.hpp>

#include "executor/thread_pool.hpp"

namespace opencv_test
{

@ -67,6 +69,38 @@ namespace
    }
};

G_TYPED_KERNEL(GBusyWait, <GMat(GMat, uint32_t)>, "org.busy_wait") {
    static GMatDesc outMeta(GMatDesc in, uint32_t)
    {
        return in;
    }
};

GAPI_OCV_KERNEL(GOCVBusyWait, GBusyWait)
{
    static void run(const cv::Mat& in,
                    const uint32_t time_in_ms,
                    cv::Mat& out)
    {
        using namespace std::chrono;
        auto s = high_resolution_clock::now();
        in.copyTo(out);
        auto e = high_resolution_clock::now();

        const auto elapsed_in_ms =
            static_cast<int32_t>(duration_cast<milliseconds>(e-s).count());

        // Busy-wait for the remaining time (if any) so that the kernel
        // takes at least time_in_ms in total
        int32_t diff = time_in_ms - elapsed_in_ms;
        const auto need_to_wait_in_ms = static_cast<uint32_t>(std::max(0, diff));

        s = high_resolution_clock::now();
        e = s;
        while (duration_cast<milliseconds>(e-s).count() < need_to_wait_in_ms) {
            e = high_resolution_clock::now();
        }
    }
};

// These definitions test that the macros work correctly when the kernel has multiple output values
G_TYPED_KERNEL(GRetGArrayTupleOfGMat2Kernel, <GArray<std::tuple<GMat, GMat>>(GMat, Scalar)>, "org.opencv.test.retarrayoftupleofgmat2kernel") {};
G_TYPED_KERNEL(GRetGArraTupleyOfGMat3Kernel, <GArray<std::tuple<GMat, GMat, GMat>>(GMat)>, "org.opencv.test.retarrayoftupleofgmat3kernel") {};
@ -513,4 +547,29 @@ TEST(DISABLED_GAPI_Pipeline, 1DMatWithinSingleIsland)
    EXPECT_EQ(0, cv::norm(out_mat, ref_mat));
}

TEST(GAPI_Pipeline, BranchesExecutedInParallel)
{
    cv::GMat in;
    // NB: cv::gapi::copy is used to prevent fusing the OCV backend operations
    // into a single island where they would be executed in turn
    auto out0 = GBusyWait::on(cv::gapi::copy(in), 1000u /*1sec*/);
    auto out1 = GBusyWait::on(cv::gapi::copy(in), 1000u /*1sec*/);
    auto out2 = GBusyWait::on(cv::gapi::copy(in), 1000u /*1sec*/);
    auto out3 = GBusyWait::on(cv::gapi::copy(in), 1000u /*1sec*/);

    cv::GComputation comp(cv::GIn(in), cv::GOut(out0,out1,out2,out3));
    cv::Mat in_mat = cv::Mat::eye(32, 32, CV_8UC1);
    cv::Mat out_mat0, out_mat1, out_mat2, out_mat3;

    using namespace std::chrono;
    auto s = high_resolution_clock::now();
    comp.apply(cv::gin(in_mat), cv::gout(out_mat0, out_mat1, out_mat2, out_mat3),
               cv::compile_args(cv::use_threaded_executor(4u),
                                cv::gapi::kernels<GOCVBusyWait>()));
    auto e = high_resolution_clock::now();
    const auto elapsed_in_ms = duration_cast<milliseconds>(e-s).count();

    EXPECT_GE(1200u, elapsed_in_ms);
}

} // namespace opencv_test
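
The test above also shows the user-facing switch introduced by this change:
passing cv::use_threaded_executor(N) in the compile arguments runs the graph on
the new GThreadedExecutor with N worker threads. A minimal hedged sketch (the
graph itself is illustrative, and the opencv2/gapi.hpp umbrella header is
assumed to pull in the compile-argument definition):

#include <opencv2/gapi.hpp>
#include <opencv2/gapi/core.hpp>

int main() {
    cv::GMat in;
    cv::GMat out = cv::gapi::copy(in);
    cv::GComputation comp(cv::GIn(in), cv::GOut(out));

    cv::Mat in_mat = cv::Mat::eye(32, 32, CV_8UC1), out_mat;
    // Request the threaded executor with 4 worker threads
    comp.apply(cv::gin(in_mat), cv::gout(out_mat),
               cv::compile_args(cv::use_threaded_executor(4u)));
}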

modules/gapi/test/own/thread_pool_tests.cpp (new file, 124 lines)
@ -0,0 +1,124 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2024 Intel Corporation

#include "../test_precomp.hpp"

#include <chrono>
#include <thread>

#include "executor/thread_pool.hpp"

namespace opencv_test
{

using namespace cv::gapi;

TEST(ThreadPool, ScheduleNotBlock)
{
    own::Latch latch(1u);
    std::atomic<uint32_t> counter{0u};

    own::ThreadPool tp(4u);
    tp.schedule([&](){
        std::this_thread::sleep_for(std::chrono::milliseconds{500u});
        counter++;
        latch.count_down();
    });

    EXPECT_EQ(0u, counter);
    latch.wait();
    EXPECT_EQ(1u, counter);
}

TEST(ThreadPool, MultipleTasks)
{
    const uint32_t kNumTasks = 100u;
    own::Latch latch(kNumTasks);
    std::atomic<uint32_t> completed{0u};

    own::ThreadPool tp(4u);
    for (uint32_t i = 0; i < kNumTasks; ++i) {
        tp.schedule([&]() {
            ++completed;
            latch.count_down();
        });
    }
    latch.wait();

    EXPECT_EQ(kNumTasks, completed.load());
}

struct ExecutionState {
    ExecutionState(const uint32_t num_threads,
                   const uint32_t num_tasks)
        : guard(0u),
          critical(0u),
          limit(num_tasks),
          latch(num_threads),
          tp(num_threads) {
    }

    std::atomic<uint32_t> guard;
    std::atomic<uint32_t> critical;
    const uint32_t limit;
    own::Latch latch;
    own::ThreadPool tp;
};

static void doRecursive(ExecutionState& state) {
    // NB: Protects the function from being executed more than `limit` times
    if (state.guard.fetch_add(1u) >= state.limit) {
        state.latch.count_down();
        return;
    }
    // NB: This simulates a critical section
    std::this_thread::sleep_for(std::chrono::milliseconds{50});
    ++state.critical;
    // NB: Schedule the next iteration recursively
    state.tp.schedule([&](){ doRecursive(state); });
}

TEST(ThreadPool, ScheduleRecursively)
{
    const uint32_t kNumThreads = 5u;
    const uint32_t kNumTasks = 100u;

    ExecutionState state(kNumThreads, kNumTasks);
    for (uint32_t i = 0; i < kNumThreads; ++i) {
        state.tp.schedule([&](){
            doRecursive(state);
        });
    }
    state.latch.wait();

    EXPECT_EQ(kNumTasks, state.critical.load());
}

TEST(ThreadPool, ExecutionIsParallel)
{
    const uint32_t kNumThreads = 4u;
    std::atomic<uint32_t> counter{0};
    own::Latch latch{kNumThreads};

    own::ThreadPool tp(kNumThreads);
    auto start = std::chrono::high_resolution_clock::now();
    for (uint32_t i = 0; i < kNumThreads; ++i) {
        tp.schedule([&]() {
            std::this_thread::sleep_for(std::chrono::milliseconds{800u});
            ++counter;
            latch.count_down();
        });
    }
    latch.wait();

    auto end = std::chrono::high_resolution_clock::now();
    auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

    EXPECT_GE(1000u, elapsed);
    EXPECT_EQ(kNumThreads, counter.load());
}

} // namespace opencv_test
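
For context, cv::gapi::own::Latch mirrors the count_down()/wait() contract of
C++20's std::latch; the hand-rolled class exists because OpenCV targets older
language standards. Assuming a C++20 toolchain, the last test above could be
expressed against the standard type like this:

#include <latch>
#include <thread>
#include <vector>
#include <iostream>

int main() {
    std::latch all_done(4);
    std::vector<std::thread> workers;
    for (int i = 0; i < 4; ++i) {
        workers.emplace_back([&]{ all_done.count_down(); });
    }
    all_done.wait(); // returns once all four threads have counted down
    for (auto& w : workers) w.join();
    std::cout << "all workers finished\n";
}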

@ -322,7 +322,8 @@ CV_EXPORTS_W Mat imdecode( InputArray buf, int flags );
@param buf Input array or vector of bytes.
@param flags The same flags as in cv::imread, see cv::ImreadModes.
@param dst The optional output placeholder for the decoded matrix. It can save the image
reallocations when the function is called repeatedly for images of the same size.
reallocations when the function is called repeatedly for images of the same size. In case of decoder
failure the function returns an empty cv::Mat object, but does not release the user-provided dst buffer.
*/
CV_EXPORTS Mat imdecode( InputArray buf, int flags, Mat* dst);

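The documented contract is easiest to see in use: on success the decoded image
lands in the caller's buffer, while on decoder failure the returned Mat is
empty but the caller's buffer keeps its previous storage. A short hedged sketch
(the helper name is illustrative, and the byte vector is assumed to come from
elsewhere, e.g. a network stream):

#include <opencv2/imgcodecs.hpp>
#include <vector>

cv::Mat decode_into(const std::vector<uchar>& bytes, cv::Mat& reusable)
{
    // Reusing `reusable` across calls avoids per-frame reallocations
    cv::Mat img = cv::imdecode(bytes, cv::IMREAD_COLOR, &reusable);
    // On failure `img` is empty, but `reusable` is left untouched
    return img;
}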
@ -782,7 +782,7 @@ imdecode_( const Mat& buf, int flags, Mat& mat )

    ImageDecoder decoder = findDecoder(buf_row);
    if( !decoder )
        return 0;
        return false;

    int scale_denom = 1;
    if( flags > IMREAD_LOAD_GDAL )
@ -803,7 +803,7 @@ imdecode_( const Mat& buf, int flags, Mat& mat )
        filename = tempfile();
        FILE* f = fopen( filename.c_str(), "wb" );
        if( !f )
            return 0;
            return false;
        size_t bufSize = buf_row.total()*buf.elemSize();
        if (fwrite(buf_row.ptr(), 1, bufSize, f) != bufSize)
        {
@ -841,7 +841,7 @@ imdecode_( const Mat& buf, int flags, Mat& mat )
                CV_LOG_WARNING(NULL, "unable to remove temporary file:" << filename);
            }
        }
        return 0;
        return false;
    }

    // establish the required input image size
@ -887,7 +887,6 @@ imdecode_( const Mat& buf, int flags, Mat& mat )

    if (!success)
    {
        mat.release();
        return false;
    }

@ -911,7 +910,8 @@ Mat imdecode( InputArray _buf, int flags )
    CV_TRACE_FUNCTION();

    Mat buf = _buf.getMat(), img;
    imdecode_( buf, flags, img );
    if (!imdecode_(buf, flags, img))
        img.release();

    return img;
}
@ -922,9 +922,10 @@ Mat imdecode( InputArray _buf, int flags, Mat* dst )

    Mat buf = _buf.getMat(), img;
    dst = dst ? dst : &img;
    imdecode_( buf, flags, *dst );

    return *dst;
    if (imdecode_(buf, flags, *dst))
        return *dst;
    else
        return cv::Mat();
}

static bool

@ -482,6 +482,19 @@ TEST(Imgcodecs, write_parameter_type)
    EXPECT_EQ(0, remove(tmp_file.c_str()));
}

TEST(Imgcodecs, imdecode_user_buffer)
{
    cv::Mat encoded = cv::Mat::zeros(1, 1024, CV_8UC1);
    cv::Mat user_buffer(1, 1024, CV_8UC1);
    cv::Mat result = cv::imdecode(encoded, IMREAD_ANYCOLOR, &user_buffer);
    EXPECT_TRUE(result.empty());
    // the function does not release user-provided buffer
    EXPECT_FALSE(user_buffer.empty());

    result = cv::imdecode(encoded, IMREAD_ANYCOLOR);
    EXPECT_TRUE(result.empty());
}

}} // namespace

#if defined(HAVE_OPENEXR) && defined(OPENCV_IMGCODECS_ENABLE_OPENEXR_TESTS)

@ -153,7 +153,6 @@ set(depends ${the_module}_android_source_copy "${OPENCV_DEPHELPER}/${the_module}

# build jar
set(AAR_FILE "${OPENCV_JAVA_DIR}/build/outputs/aar/opencv-release.aar")
ocv_update(OPENCV_GRADLE_VERBOSE_OPTIONS "-i")
add_custom_command(
    OUTPUT "${AAR_FILE}" "${OPENCV_DEPHELPER}/${the_module}_android"
    COMMAND ./gradlew ${OPENCV_GRADLE_VERBOSE_OPTIONS} "opencv:assemble"

@ -1,9 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
    <classpathentry kind="src" path="src"/>
    <classpathentry kind="src" path="gen"/>
    <classpathentry kind="con" path="com.android.ide.eclipse.adt.ANDROID_FRAMEWORK"/>
    <classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
    <classpathentry kind="con" path="com.android.ide.eclipse.adt.LIBRARIES"/>
    <classpathentry kind="output" path="bin/classes"/>
</classpath>

@ -1,33 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
    <name>OpenCV_JavaAPI_Tests</name>
    <comment></comment>
    <projects>
    </projects>
    <buildSpec>
        <buildCommand>
            <name>com.android.ide.eclipse.adt.ResourceManagerBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>com.android.ide.eclipse.adt.PreCompilerBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>org.eclipse.jdt.core.javabuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>com.android.ide.eclipse.adt.ApkBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
    </buildSpec>
    <natures>
        <nature>com.android.ide.eclipse.adt.AndroidNature</nature>
        <nature>org.eclipse.jdt.core.javanature</nature>
    </natures>
</projectDescription>

Some files were not shown because too many files have changed in this diff.