Merge remote-tracking branch 'upstream/3.4' into merge-3.4
3rdparty/ippicv/ippicv.cmake
@@ -2,37 +2,37 @@ function(download_ippicv root_var)
   set(${root_var} "" PARENT_SCOPE)

   # Commit SHA in the opencv_3rdparty repo
-  set(IPPICV_COMMIT "dfe3162c237af211e98b8960018b564bc209261d")
+  set(IPPICV_COMMIT "bdb7bb85f34a8cb0d35e40a81f58da431aa1557a")
   # Define actual ICV versions
   if(APPLE)
     set(OPENCV_ICV_PLATFORM "macosx")
     set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_mac")
     if(X86_64)
-      set(OPENCV_ICV_NAME "ippicv_2017u3_mac_intel64_general_20170822.tgz")
-      set(OPENCV_ICV_HASH "c1ebb5dfa5b7f54b0c44e1917805a463")
+      set(OPENCV_ICV_NAME "ippicv_2017u3_mac_intel64_general_20180518.tgz")
+      set(OPENCV_ICV_HASH "3ae52b9be0fe73dd45bc5e9429cd3732")
     else()
-      set(OPENCV_ICV_NAME "ippicv_2017u3_mac_ia32_general_20170822.tgz")
-      set(OPENCV_ICV_HASH "49b05a669042753ae75895a445ebd612")
+      set(OPENCV_ICV_NAME "ippicv_2017u3_mac_ia32_general_20180518.tgz")
+      set(OPENCV_ICV_HASH "698660b975b62bee3ef6c5af51e97544")
     endif()
   elseif((UNIX AND NOT ANDROID) OR (UNIX AND ANDROID_ABI MATCHES "x86"))
     set(OPENCV_ICV_PLATFORM "linux")
     set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_lnx")
     if(X86_64)
-      set(OPENCV_ICV_NAME "ippicv_2017u3_lnx_intel64_general_20170822.tgz")
-      set(OPENCV_ICV_HASH "4e0352ce96473837b1d671ce87f17359")
+      set(OPENCV_ICV_NAME "ippicv_2017u3_lnx_intel64_general_20180518.tgz")
+      set(OPENCV_ICV_HASH "b7cc351267db2d34b9efa1cd22ff0572")
     else()
-      set(OPENCV_ICV_NAME "ippicv_2017u3_lnx_ia32_general_20170822.tgz")
-      set(OPENCV_ICV_HASH "dcdb0ba4b123f240596db1840cd59a76")
+      set(OPENCV_ICV_NAME "ippicv_2017u3_lnx_ia32_general_20180518.tgz")
+      set(OPENCV_ICV_HASH "ea72de74dae3c604eb6348395366e78e")
     endif()
   elseif(WIN32 AND NOT ARM)
     set(OPENCV_ICV_PLATFORM "windows")
     set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_win")
     if(X86_64)
-      set(OPENCV_ICV_NAME "ippicv_2017u3_win_intel64_general_20170822.zip")
-      set(OPENCV_ICV_HASH "0421e642bc7ad741a2236d3ec4190bdd")
+      set(OPENCV_ICV_NAME "ippicv_2017u3_win_intel64_general_20180518.zip")
+      set(OPENCV_ICV_HASH "915ff92958089ede8ea532d3c4fe7187")
     else()
-      set(OPENCV_ICV_NAME "ippicv_2017u3_win_ia32_general_20170822.zip")
-      set(OPENCV_ICV_HASH "8a7680ae352c192de2e2e34936164bd0")
+      set(OPENCV_ICV_NAME "ippicv_2017u3_win_ia32_general_20180518.zip")
+      set(OPENCV_ICV_HASH "928168c2d99ab284047dfcfb7a821d91")
     endif()
   else()
     return()
3rdparty/libtiff/CMakeLists.txt
@@ -417,7 +417,7 @@ set(lib_srcs
     tif_write.c
     tif_zip.c
     tif_stream.cxx
-    snprintf.c
+    snprintf.c
     t4.h
     tif_dir.h
     tif_fax3.h
@@ -32,11 +32,11 @@ Unspecified error: Can't create layer "layer_name" of type "MyType" in function
 To import the model correctly you have to derive a class from cv::dnn::Layer with
 the following methods:
 
-@snippet dnn/custom_layers.cpp A custom layer interface
+@snippet dnn/custom_layers.hpp A custom layer interface
 
 And register it before the import:
 
-@snippet dnn/custom_layers.cpp Register a custom layer
+@snippet dnn/custom_layers.hpp Register a custom layer
 
 @note `MyType` is a type of unimplemented layer from the thrown exception.
 
@@ -44,27 +44,27 @@ Let's see what all the methods do:
 
 - Constructor
 
-  @snippet dnn/custom_layers.cpp MyLayer::MyLayer
+  @snippet dnn/custom_layers.hpp MyLayer::MyLayer
 
   Retrieves hyper-parameters from cv::dnn::LayerParams. If your layer has trainable
   weights they will be already stored in the Layer's member cv::dnn::Layer::blobs.
 
- - A static method `create`
 
-  @snippet dnn/custom_layers.cpp MyLayer::create
+  @snippet dnn/custom_layers.hpp MyLayer::create
 
   This method should create an instance of you layer and return cv::Ptr with it.
 
- - Output blobs' shape computation
 
-  @snippet dnn/custom_layers.cpp MyLayer::getMemoryShapes
+  @snippet dnn/custom_layers.hpp MyLayer::getMemoryShapes
 
   Returns layer's output shapes depends on input shapes. You may request an extra
   memory using `internals`.
 
- - Run a layer
 
-  @snippet dnn/custom_layers.cpp MyLayer::forward
+  @snippet dnn/custom_layers.hpp MyLayer::forward
 
   Implement a layer's logic here. Compute outputs for given inputs.
 
@@ -74,7 +74,7 @@ the second invocation of `forward` will has the same data at `outputs` and `inte
 
 - Optional `finalize` method
 
-  @snippet dnn/custom_layers.cpp MyLayer::finalize
+  @snippet dnn/custom_layers.hpp MyLayer::finalize
 
   The chain of methods are the following: OpenCV deep learning engine calls `create`
   method once then it calls `getMemoryShapes` for an every created layer then you
@@ -108,11 +108,11 @@ layer {
 
 This way our implementation can look like:
 
-@snippet dnn/custom_layers.cpp InterpLayer
+@snippet dnn/custom_layers.hpp InterpLayer
 
 Next we need to register a new layer type and try to import the model.
 
-@snippet dnn/custom_layers.cpp Register InterpLayer
+@snippet dnn/custom_layers.hpp Register InterpLayer
 
 ## Example: custom layer from TensorFlow
 This is an example of how to import a network with [tf.image.resize_bilinear](https://www.tensorflow.org/versions/master/api_docs/python/tf/image/resize_bilinear)
@@ -185,11 +185,11 @@ Custom layers import from TensorFlow is designed to put all layer's `attr` into
 cv::dnn::LayerParams but input `Const` blobs into cv::dnn::Layer::blobs.
 In our case resize's output shape will be stored in layer's `blobs[0]`.
 
-@snippet dnn/custom_layers.cpp ResizeBilinearLayer
+@snippet dnn/custom_layers.hpp ResizeBilinearLayer
 
 Next we register a layer and try to import the model.
 
-@snippet dnn/custom_layers.cpp Register ResizeBilinearLayer
+@snippet dnn/custom_layers.hpp Register ResizeBilinearLayer
 
 ## Define a custom layer in Python
 The following example shows how to customize OpenCV's layers in Python.
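The methods the tutorial lists map one-to-one onto the cv::dnn::Layer interface. For quick reference, a minimal self-contained C++ sketch of such a layer — an illustration only, not the tutorial's custom_layers file; it assumes the OpenCV 3.4 Layer API and uses an identity layer for brevity:

#include <opencv2/dnn.hpp>
#include <opencv2/dnn/layer.details.hpp>  // CV_DNN_REGISTER_LAYER_CLASS

// Illustrative identity layer implementing the interface described above.
class MyLayer : public cv::dnn::Layer
{
public:
    MyLayer(const cv::dnn::LayerParams& params) : Layer(params)
    {
        // Hyper-parameters arrive in `params`; trainable weights, if any,
        // are already stored in this->blobs by the importer.
    }

    static cv::Ptr<cv::dnn::Layer> create(cv::dnn::LayerParams& params)
    {
        return cv::Ptr<cv::dnn::Layer>(new MyLayer(params));
    }

    // Output shapes as a function of input shapes (MatShape = std::vector<int>).
    virtual bool getMemoryShapes(const std::vector<cv::dnn::MatShape>& inputs,
                                 const int /*requiredOutputs*/,
                                 std::vector<cv::dnn::MatShape>& outputs,
                                 std::vector<cv::dnn::MatShape>& /*internals*/) const
    {
        outputs.assign(1, inputs[0]);  // identity: one output, same shape as input
        return false;
    }

    // The layer's actual computation.
    virtual void forward(std::vector<cv::Mat*>& inputs, std::vector<cv::Mat>& outputs,
                         std::vector<cv::Mat>& /*internals*/)
    {
        inputs[0]->copyTo(outputs[0]);
    }
};

// Register it before the import so "MyType" resolves to this class:
// CV_DNN_REGISTER_LAYER_CLASS(MyType, MyLayer);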
@@ -5,6 +5,8 @@ This section contains tutorials about how to use the built-in graphical user int
 
 -   @subpage tutorial_trackbar
 
+    *Languages:* C++, Java, Python
+
     *Compatibility:* \> OpenCV 2.0
 
     *Author:* Ana Huamán
@@ -1,11 +1,11 @@
 Adding a Trackbar to our applications! {#tutorial_trackbar}
 ======================================
 
--   In the previous tutorials (about *linear blending* and the *brightness and contrast
-    adjustments*) you might have noted that we needed to give some **input** to our programs, such
-    as \f$\alpha\f$ and \f$beta\f$. We accomplished that by entering this data using the Terminal
--   Well, it is time to use some fancy GUI tools. OpenCV provides some GUI utilities (*highgui.hpp*)
-    for you. An example of this is a **Trackbar**
+-   In the previous tutorials (about @ref tutorial_adding_images and the @ref tutorial_basic_linear_transform)
+    you might have noted that we needed to give some **input** to our programs, such
+    as \f$\alpha\f$ and \f$beta\f$. We accomplished that by entering this data using the Terminal.
+-   Well, it is time to use some fancy GUI tools. OpenCV provides some GUI utilities (**highgui** module)
+    for you. An example of this is a **Trackbar**.
 
 ![](images/Adding_Trackbars_Tutorial_Trackbar.png)
 
@@ -24,26 +24,73 @@ Code
 
 Let's modify the program made in the tutorial @ref tutorial_adding_images. We will let the user enter the
 \f$\alpha\f$ value by using the Trackbar.
 
+@add_toggle_cpp
 This tutorial code's is shown lines below. You can also download it from
 [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp)
 @include cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp
+@end_toggle
+
+@add_toggle_java
+This tutorial code's is shown lines below. You can also download it from
+[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java)
+@include java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java
+@end_toggle
+
+@add_toggle_python
+This tutorial code's is shown lines below. You can also download it from
+[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py)
+@include python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py
+@end_toggle
 
 Explanation
 -----------
 
 We only analyze the code that is related to Trackbar:
 
--#  First, we load two images, which are going to be blended.
-    @snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp load
+-   First, we load two images, which are going to be blended.
 
--#  To create a trackbar, first we have to create the window in which it is going to be located. So:
-    @snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp window
+@add_toggle_cpp
+@snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp load
+@end_toggle
 
--#  Now we can create the Trackbar:
-    @snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp create_trackbar
+@add_toggle_java
+@snippet java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java load
+@end_toggle
 
-Note the following:
+@add_toggle_python
+@snippet python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py load
+@end_toggle
+
+-   To create a trackbar, first we have to create the window in which it is going to be located. So:
+
+@add_toggle_cpp
+@snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp window
+@end_toggle
+
+@add_toggle_java
+@snippet java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java window
+@end_toggle
+
+@add_toggle_python
+@snippet python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py window
+@end_toggle
+
+-   Now we can create the Trackbar:
+
+@add_toggle_cpp
+@snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp create_trackbar
+@end_toggle
+
+@add_toggle_java
+@snippet java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java create_trackbar
+@end_toggle
+
+@add_toggle_python
+@snippet python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py create_trackbar
+@end_toggle
+
+Note the following (C++ code):
 -   Our Trackbar has a label **TrackbarName**
 -   The Trackbar is located in the window named **Linear Blend**
 -   The Trackbar values will be in the range from \f$0\f$ to **alpha_slider_max** (the minimum
@@ -51,10 +98,21 @@ We only analyze the code that is related to Trackbar:
 -   The numerical value of Trackbar is stored in **alpha_slider**
 -   Whenever the user moves the Trackbar, the callback function **on_trackbar** is called
 
--#  Finally, we have to define the callback function **on_trackbar**
-    @snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp on_trackbar
+-   Finally, we have to define the callback function **on_trackbar** for C++ and Python code, using an anonymous inner class listener in Java
 
-Note that:
+@add_toggle_cpp
+@snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp on_trackbar
+@end_toggle
+
+@add_toggle_java
+@snippet java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java on_trackbar
+@end_toggle
+
+@add_toggle_python
+@snippet python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py on_trackbar
+@end_toggle
+
+Note that (C++ code):
 -   We use the value of **alpha_slider** (integer) to get a double value for **alpha**.
 -   **alpha_slider** is updated each time the trackbar is displaced by the user.
 -   We define *src1*, *src2*, *dist*, *alpha*, *alpha_slider* and *beta* as global variables,
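For readers skimming the diff without the sample files, a compact C++ sketch of the pattern the snippets above implement — a hedged illustration using the standard OpenCV 3.4 highgui API; the image paths are placeholders:

#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui.hpp>

const int alpha_slider_max = 100;
int alpha_slider = 0;
cv::Mat src1, src2, dst;

static void on_trackbar(int, void*)
{
    double alpha = (double)alpha_slider / alpha_slider_max;  // integer slider -> [0, 1]
    cv::addWeighted(src1, alpha, src2, 1.0 - alpha, 0.0, dst);
    cv::imshow("Linear Blend", dst);
}

int main()
{
    src1 = cv::imread("LinuxLogo.jpg");    // placeholder input images
    src2 = cv::imread("WindowsLogo.jpg");
    if (src1.empty() || src2.empty())
        return 1;

    cv::namedWindow("Linear Blend", cv::WINDOW_AUTOSIZE);  // window must exist first
    cv::createTrackbar("Alpha x 100", "Linear Blend", &alpha_slider,
                       alpha_slider_max, on_trackbar);
    on_trackbar(alpha_slider, 0);  // render the initial blend once
    cv::waitKey(0);
    return 0;
}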
@@ -11,9 +11,6 @@ In this tutorial you will learn how to:
 -   @ref cv::erode
 -   @ref cv::dilate
 
-Interesting fact
------------
-
 @note The explanation below belongs to the book **Learning OpenCV** by Bradski and Kaehler.
 
 Morphological Operations
@@ -38,19 +35,14 @@ Morphological Operations
 -   As the kernel \f$B\f$ is scanned over the image, we compute the maximal pixel value overlapped by
     \f$B\f$ and replace the image pixel in the anchor point position with that maximal value. As you can
     deduce, this maximizing operation causes bright regions within an image to "grow" (therefore the
-    name *dilation*). Take the above image as an example. Applying dilation we can get:
+    name *dilation*).
+-   The dilatation operation is: \f$\texttt{dst} (x,y) = \max _{(x',y'): \, \texttt{element} (x',y') \ne0 } \texttt{src} (x+x',y+y')\f$
+
+-   Take the above image as an example. Applying dilation we can get:
 
     ![](images/Morphology_1_Tutorial_Theory_Dilation.png)
 
-The background (bright) dilates around the black regions of the letter.
-
-To better grasp the idea and avoid possible confusion, in this other example we have inverted the original
-image such as the object in white is now the letter. We have performed two dilatations with a rectangular
-structuring element of size `3x3`.
-
-![](images/Morphology_1_Tutorial_Theory_Dilation.png)
-
-The dilatation makes the object in white bigger.
+-   The bright area of the letter dilates around the black regions of the background.
 
 ### Erosion
 
@@ -58,31 +50,39 @@ The dilatation makes the object in white bigger.
     area of given kernel.
 -   As the kernel \f$B\f$ is scanned over the image, we compute the minimal pixel value overlapped by
     \f$B\f$ and replace the image pixel under the anchor point with that minimal value.
+-   The erosion operation is: \f$\texttt{dst} (x,y) = \min _{(x',y'): \, \texttt{element} (x',y') \ne0 } \texttt{src} (x+x',y+y')\f$
 -   Analagously to the example for dilation, we can apply the erosion operator to the original image
-    (shown above). You can see in the result below that the bright areas of the image (the
-    background, apparently), get thinner, whereas the dark zones (the "writing") gets bigger.
+    (shown above). You can see in the result below that the bright areas of the image get thinner,
+    whereas the dark zones gets bigger.
 
     ![](images/Morphology_1_Tutorial_Theory_Erosion.png)
 
+In similar manner, the corresponding image results by applying erosion operation on the inverted original image (two erosions
+with a rectangular structuring element of size `3x3`):
+
+![](images/Morphology_1_Tutorial_Theory_Erosion.png)
+
+The erosion makes the object in white smaller.
+
 Code
 ----
 
+@add_toggle_cpp
 This tutorial's code is shown below. You can also download it
 [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ImgProc/Morphology_1.cpp)
 @include samples/cpp/tutorial_code/ImgProc/Morphology_1.cpp
+@end_toggle
+
+@add_toggle_java
+This tutorial's code is shown below. You can also download it
+[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ImgProc/erosion_dilatation/MorphologyDemo1.java)
+@include samples/java/tutorial_code/ImgProc/erosion_dilatation/MorphologyDemo1.java
+@end_toggle
+
+@add_toggle_python
+This tutorial's code is shown below. You can also download it
+[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/imgProc/erosion_dilatation/morphology_1.py)
+@include samples/python/tutorial_code/imgProc/erosion_dilatation/morphology_1.py
+@end_toggle
 
 Explanation
 -----------
 
 -#  Most of the material shown here is trivial (if you have any doubt, please refer to the tutorials in
-    previous sections). Let's check the general structure of the program:
+    previous sections). Let's check the general structure of the C++ program:
 
     -   Load an image (can be BGR or grayscale)
     -   Create two windows (one for dilation output, the other for erosion)
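A short C++ sketch of the two operations defined by the min/max formulas above — a hedged illustration using the standard imgproc API; the input file name is a placeholder:

#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

int main()
{
    cv::Mat src = cv::imread("letter.png");  // placeholder input
    if (src.empty()) return 1;

    // 3x3 rectangular structuring element, as in the tutorial's examples
    cv::Mat element = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));

    cv::Mat dilated, eroded;
    cv::dilate(src, dilated, element);  // max over the neighborhood: bright regions grow
    cv::erode(src, eroded, element);    // min over the neighborhood: bright regions shrink

    cv::imshow("Dilation", dilated);
    cv::imshow("Erosion", eroded);
    cv::waitKey(0);
    return 0;
}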
@@ -36,15 +36,10 @@ discuss briefly 5 operations offered by OpenCV:
     foreground)
 -   For instance, check out the example below. The image at the left is the original and the image
     at the right is the result after applying the opening transformation. We can observe that the
-    small spaces in the corners of the letter tend to disappear.
+    small dots have disappeared.
 
     ![](images/Morphology_2_Tutorial_Theory_Opening.png)
 
-For the sake of clarity, we have performed the opening operation (`7x7` rectangular structuring element)
-on the same original image but inverted such as the object in white is now the letter.
-
-![](images/Morphology_2_Tutorial_Theory_Opening.png)
-
 ### Closing
 
 -   It is obtained by the dilation of an image followed by an erosion.
@@ -55,10 +50,6 @@ on the same original image but inverted such as the object in white is now the l
 
     ![](images/Morphology_2_Tutorial_Theory_Closing.png)
 
-On the inverted image, we have performed the closing operation (`7x7` rectangular structuring element):
-
-![](images/Morphology_2_Tutorial_Theory_Closing.png)
-
 ### Morphological Gradient
 
 -   It is the difference between the dilation and the erosion of an image.
@@ -88,14 +79,28 @@ On the inverted image, we have performed the closing operation (`7x7` rectangula
 Code
 ----
 
-This tutorial code's is shown lines below. You can also download it from
+@add_toggle_cpp
+This tutorial's code is shown below. You can also download it
 [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ImgProc/Morphology_2.cpp)
 @include cpp/tutorial_code/ImgProc/Morphology_2.cpp
+@end_toggle
+
+@add_toggle_java
+This tutorial's code is shown below. You can also download it
+[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ImgProc/opening_closing_hats/MorphologyDemo2.java)
+@include java/tutorial_code/ImgProc/opening_closing_hats/MorphologyDemo2.java
+@end_toggle
+
+@add_toggle_python
+This tutorial's code is shown below. You can also download it
+[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/imgProc/opening_closing_hats/morphology_2.py)
+@include python/tutorial_code/imgProc/opening_closing_hats/morphology_2.py
+@end_toggle
 
 Explanation
 -----------
 
--#  Let's check the general structure of the program:
+-#  Let's check the general structure of the C++ program:
     -   Load an image
     -   Create a window to display results of the Morphological operations
     -   Create three Trackbars for the user to enter parameters:
@@ -139,8 +144,8 @@ Explanation
 Results
 -------
 
--   After compiling the code above we can execute it giving an image path as an argument. For this
-    tutorial we use as input the image: **baboon.png**:
+-   After compiling the code above we can execute it giving an image path as an argument. Results using
+    the image: **baboon.png**:
 
     ![](images/Morphology_2_Tutorial_Cover.jpg)
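The five operations discussed in this tutorial are all reachable through a single imgproc entry point. A hedged C++ sketch (placeholder input file, standard OpenCV 3.4 API):

#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>

int main()
{
    cv::Mat src = cv::imread("baboon.png");  // placeholder input
    if (src.empty()) return 1;

    cv::Mat element = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(7, 7));
    cv::Mat opened, closed, gradient, tophat, blackhat;

    cv::morphologyEx(src, opened,   cv::MORPH_OPEN,     element);  // erode then dilate
    cv::morphologyEx(src, closed,   cv::MORPH_CLOSE,    element);  // dilate then erode
    cv::morphologyEx(src, gradient, cv::MORPH_GRADIENT, element);  // dilation - erosion
    cv::morphologyEx(src, tophat,   cv::MORPH_TOPHAT,   element);  // src - opening
    cv::morphologyEx(src, blackhat, cv::MORPH_BLACKHAT, element);  // closing - src
    return 0;
}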
@@ -305,6 +305,9 @@ public:
     //! returns true if GpuMat data is NULL
     bool empty() const;
 
+    //! internal use method: updates the continuity flag
+    void updateContinuityFlag();
+
     /*! includes several bit-fields:
          - the magic signature
          - continuity flag
@@ -2084,6 +2084,9 @@ public:
     static MatAllocator* getDefaultAllocator();
     static void setDefaultAllocator(MatAllocator* allocator);
 
+    //! internal use method: updates the continuity flag
+    void updateContinuityFlag();
+
     //! interaction with UMat
     UMatData* u;
 
@@ -2551,6 +2554,9 @@ public:
     //! and the standard allocator
     static MatAllocator* getStdAllocator();
 
+    //! internal use method: updates the continuity flag
+    void updateContinuityFlag();
+
     // black-box container of UMat data
     UMatData* u;
@@ -495,24 +495,20 @@ Mat::Mat(int _rows, int _cols, int _type, void* _data, size_t _step)
     if( _step == AUTO_STEP )
     {
         _step = minstep;
-        flags |= CONTINUOUS_FLAG;
     }
     else
     {
         CV_DbgAssert( _step >= minstep );
 
         if (_step % esz1 != 0)
         {
             CV_Error(Error::BadStep, "Step must be a multiple of esz1");
         }
-
-        if (_step == minstep || rows == 1)
-            flags |= CONTINUOUS_FLAG;
     }
     step[0] = _step;
     step[1] = esz;
     datalimit = datastart + _step * rows;
     dataend = datalimit - _step + minstep;
+    updateContinuityFlag();
 }
 
 inline
@@ -528,7 +524,6 @@ Mat::Mat(Size _sz, int _type, void* _data, size_t _step)
     if( _step == AUTO_STEP )
     {
         _step = minstep;
-        flags |= CONTINUOUS_FLAG;
     }
     else
     {
@@ -538,14 +533,12 @@ Mat::Mat(Size _sz, int _type, void* _data, size_t _step)
         {
             CV_Error(Error::BadStep, "Step must be a multiple of esz1");
         }
-
-        if (_step == minstep || rows == 1)
-            flags |= CONTINUOUS_FLAG;
     }
     step[0] = _step;
     step[1] = esz;
     datalimit = datastart + _step*rows;
     dataend = datalimit - _step + minstep;
+    updateContinuityFlag();
 }
 
 template<typename _Tp> inline
@@ -152,7 +152,7 @@ namespace cv { namespace cuda
 
     inline ~NppStreamHandler()
     {
-        nppSetStream(oldStream);
+        cudaStreamSynchronize(oldStream);
     }
 
 private:
@@ -489,7 +489,7 @@ public class MatTest extends OpenCVTestCase {
     public void testIsContinuous() {
         assertTrue(gray0.isContinuous());
 
-        Mat subMat = gray0.submat(0, 0, gray0.rows() / 2, gray0.cols() / 2);
+        Mat subMat = gray0.submat(0, gray0.rows() / 2, 0, gray0.cols() / 2);
         assertFalse(subMat.isContinuous());
     }
 
@@ -937,7 +937,7 @@ public class MatTest extends OpenCVTestCase {
     }
 
     public void testSubmatRect() {
-        Mat submat = gray255.submat(new Rect(5, gray255.rows() / 2, 5, gray255.cols() / 2));
+        Mat submat = gray255.submat(new Rect(5, 5, gray255.cols() / 2, gray255.rows() / 2));
        assertTrue(submat.isSubmatrix());
        assertFalse(submat.isContinuous());
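These test fixes matter because Java's submat(rowStart, rowEnd, colStart, colEnd) takes range pairs while Rect takes (x, y, width, height) — the old arguments built degenerate or swapped ROIs. The corrected tests now exercise exactly the continuity invariant this merge refactors; a C++ sketch of the same invariant, assuming only the standard cv::Mat API:

#include <opencv2/core.hpp>
#include <cassert>

int main()
{
    cv::Mat gray0 = cv::Mat::zeros(10, 10, CV_8UC1);
    assert(gray0.isContinuous());           // freshly allocated matrix: continuous

    // Range pairs are [start, end), mirroring Java's submat(rowStart, rowEnd, colStart, colEnd)
    cv::Mat sub = gray0(cv::Range(0, 5), cv::Range(0, 5));
    assert(!sub.isContinuous());            // narrower ROI: its rows are not adjacent in memory
    return 0;
}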
@@ -46,6 +46,13 @@
 using namespace cv;
 using namespace cv::cuda;
 
+void cv::cuda::GpuMat::updateContinuityFlag()
+{
+    int sz[] = { rows, cols };
+    size_t steps[] = { step, elemSize() };
+    flags = cv::updateContinuityFlag(flags, 2, sz, steps);
+}
+
 cv::cuda::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t step_) :
     flags(Mat::MAGIC_VAL + (type_ & Mat::TYPE_MASK)), rows(rows_), cols(cols_),
     step(step_), data((uchar*)data_), refcount(0),
@@ -57,7 +64,6 @@ cv::cuda::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t st
     if (step == Mat::AUTO_STEP)
     {
         step = minstep;
-        flags |= Mat::CONTINUOUS_FLAG;
     }
     else
     {
@@ -65,11 +71,10 @@ cv::cuda::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t st
             step = minstep;
 
         CV_DbgAssert( step >= minstep );
-
-        flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
     }
 
     dataend += step * (rows - 1) + minstep;
+    updateContinuityFlag();
 }
 
 cv::cuda::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) :
@@ -83,7 +88,6 @@ cv::cuda::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) :
     if (step == Mat::AUTO_STEP)
     {
         step = minstep;
-        flags |= Mat::CONTINUOUS_FLAG;
     }
     else
     {
@@ -91,11 +95,10 @@ cv::cuda::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) :
             step = minstep;
 
         CV_DbgAssert( step >= minstep );
-
-        flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
     }
 
     dataend += step * (rows - 1) + minstep;
+    updateContinuityFlag();
 }
 
 cv::cuda::GpuMat::GpuMat(const GpuMat& m, Range rowRange_, Range colRange_)
@@ -127,17 +130,15 @@ cv::cuda::GpuMat::GpuMat(const GpuMat& m, Range rowRange_, Range colRange_)
 
         cols = colRange_.size();
         data += colRange_.start*elemSize();
-        flags &= cols < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;
     }
 
-    if (rows == 1)
-        flags |= Mat::CONTINUOUS_FLAG;
-
     if (refcount)
         CV_XADD(refcount, 1);
 
     if (rows <= 0 || cols <= 0)
         rows = cols = 0;
+
+    updateContinuityFlag();
 }
 
 cv::cuda::GpuMat::GpuMat(const GpuMat& m, Rect roi) :
@@ -146,16 +147,19 @@ cv::cuda::GpuMat::GpuMat(const GpuMat& m, Rect roi) :
     datastart(m.datastart), dataend(m.dataend),
     allocator(m.allocator)
 {
-    flags &= roi.width < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;
     data += roi.x * elemSize();
 
-    CV_Assert( 0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= m.cols && 0 <= roi.y && 0 <= roi.height && roi.y + roi.height <= m.rows );
+    CV_Assert( 0 <= roi.x && 0 <= roi.width &&
+               roi.x + roi.width <= m.cols &&
+               0 <= roi.y && 0 <= roi.height &&
+               roi.y + roi.height <= m.rows );
 
     if (refcount)
         CV_XADD(refcount, 1);
 
     if (rows <= 0 || cols <= 0)
         rows = cols = 0;
+
+    updateContinuityFlag();
 }
 
 GpuMat cv::cuda::GpuMat::reshape(int new_cn, int new_rows) const
@@ -245,11 +249,7 @@ GpuMat& cv::cuda::GpuMat::adjustROI(int dtop, int dbottom, int dleft, int dright
     rows = row2 - row1;
     cols = col2 - col1;
 
-    if (esz * cols == step || rows == 1)
-        flags |= Mat::CONTINUOUS_FLAG;
-    else
-        flags &= ~Mat::CONTINUOUS_FLAG;
-
+    updateContinuityFlag();
     return *this;
 }
|
@ -201,10 +201,13 @@ void cv::cuda::HostMem::create(int rows_, int cols_, int type_)
|
||||
|
||||
if (rows_ > 0 && cols_ > 0)
|
||||
{
|
||||
flags = Mat::MAGIC_VAL + Mat::CONTINUOUS_FLAG + type_;
|
||||
flags = Mat::MAGIC_VAL + type_;
|
||||
rows = rows_;
|
||||
cols = cols_;
|
||||
step = elemSize() * cols;
|
||||
int sz[] = { rows, cols };
|
||||
size_t steps[] = { step, CV_ELEM_SIZE(type_) };
|
||||
flags = updateContinuityFlag(flags, 2, sz, steps);
|
||||
|
||||
if (alloc_type == SHARED)
|
||||
{
|
||||
|
@@ -594,10 +594,11 @@ namespace
 
     StackAllocator::~StackAllocator()
     {
+        cudaStreamSynchronize(stream_);
+
         if (memStack_ != 0)
         {
-            cudaStreamSynchronize(stream_);
             memStack_->pool->returnMemStack(memStack_);
         }
     }
 
     size_t alignUp(size_t what, size_t alignment)
@@ -262,31 +262,36 @@ void setSize( Mat& m, int _dims, const int* _sz, const size_t* _steps, bool auto
     }
 }
 
-static void updateContinuityFlag(Mat& m)
+int updateContinuityFlag(int flags, int dims, const int* size, const size_t* step)
 {
     int i, j;
-    for( i = 0; i < m.dims; i++ )
+    for( i = 0; i < dims; i++ )
     {
-        if( m.size[i] > 1 )
+        if( size[i] > 1 )
             break;
     }
 
-    for( j = m.dims-1; j > i; j-- )
+    uint64 t = (uint64)size[std::min(i, dims-1)]*CV_MAT_CN(flags);
+    for( j = dims-1; j > i; j-- )
     {
-        if( m.step[j]*m.size[j] < m.step[j-1] )
+        t *= size[j];
+        if( step[j]*size[j] < step[j-1] )
             break;
     }
 
-    uint64 t = (uint64)m.step[0]*m.size[0];
-    if( j <= i && t == (size_t)t )
-        m.flags |= Mat::CONTINUOUS_FLAG;
-    else
-        m.flags &= ~Mat::CONTINUOUS_FLAG;
+    if( j <= i && t == (uint64)(int)t )
+        return flags | Mat::CONTINUOUS_FLAG;
+    return flags & ~Mat::CONTINUOUS_FLAG;
+}
+
+void Mat::updateContinuityFlag()
+{
+    flags = cv::updateContinuityFlag(flags, dims, size.p, step.p);
 }
 
 void finalizeHdr(Mat& m)
 {
-    updateContinuityFlag(m);
+    m.updateContinuityFlag();
     int d = m.dims;
     if( d > 2 )
         m.rows = m.cols = -1;
@@ -427,7 +432,6 @@ Mat::Mat(const Mat& m, const Range& _rowRange, const Range& _colRange)
                        && _colRange.end <= m.cols );
             cols = _colRange.size();
             data += _colRange.start*elemSize();
-            flags &= cols < m.cols ? ~CONTINUOUS_FLAG : -1;
             flags |= SUBMATRIX_FLAG;
         }
     }
@@ -437,8 +441,7 @@ Mat::Mat(const Mat& m, const Range& _rowRange, const Range& _colRange)
         CV_RETHROW();
     }
 
-    if( rows == 1 )
-        flags |= CONTINUOUS_FLAG;
+    updateContinuityFlag();
 
     if( rows <= 0 || cols <= 0 )
     {
@@ -455,8 +458,6 @@ Mat::Mat(const Mat& m, const Rect& roi)
       allocator(m.allocator), u(m.u), size(&rows)
 {
     CV_Assert( m.dims <= 2 );
-    flags &= roi.width < m.cols ? ~CONTINUOUS_FLAG : -1;
-    flags |= roi.height == 1 ? CONTINUOUS_FLAG : 0;
 
     size_t esz = CV_ELEM_SIZE(flags);
     data += roi.x*esz;
@@ -468,6 +469,7 @@ Mat::Mat(const Mat& m, const Rect& roi)
         flags |= SUBMATRIX_FLAG;
 
     step[0] = m.step[0]; step[1] = esz;
+    updateContinuityFlag();
 
     if( rows <= 0 || cols <= 0 )
     {
@@ -522,7 +524,7 @@ Mat::Mat(const Mat& m, const Range* ranges)
             flags |= SUBMATRIX_FLAG;
         }
     }
-    updateContinuityFlag(*this);
+    updateContinuityFlag();
 }
 
 Mat::Mat(const Mat& m, const std::vector<Range>& ranges)
@@ -548,7 +550,7 @@ Mat::Mat(const Mat& m, const std::vector<Range>& ranges)
             flags |= SUBMATRIX_FLAG;
         }
     }
-    updateContinuityFlag(*this);
+    updateContinuityFlag();
 }
 
 
@@ -575,10 +577,7 @@ Mat Mat::diag(int d) const
     m.size[1] = m.cols = 1;
     m.step[0] += (len > 1 ? esz : 0);
 
-    if( m.rows > 1 )
-        m.flags &= ~CONTINUOUS_FLAG;
-    else
-        m.flags |= CONTINUOUS_FLAG;
+    m.updateContinuityFlag();
 
     if( size() != Size(1,1) )
         m.flags |= SUBMATRIX_FLAG;
@@ -597,13 +596,6 @@ void Mat::pop_back(size_t nelems)
     {
         size.p[0] -= (int)nelems;
         dataend -= nelems*step.p[0];
-        /*if( size.p[0] <= 1 )
-        {
-            if( dims <= 2 )
-                flags |= CONTINUOUS_FLAG;
-            else
-                updateContinuityFlag(*this);
-        }*/
     }
 }
 
@@ -618,7 +610,10 @@ void Mat::push_back_(const void* elem)
     memcpy(data + r*step.p[0], elem, esz);
     size.p[0] = r + 1;
     dataend += step.p[0];
-    if( esz < step.p[0] )
+    uint64 tsz = size.p[0];
+    for( int i = 1; i < dims; i++ )
+        tsz *= size.p[i];
+    if( esz < step.p[0] || tsz != (uint64)(int)tsz )
         flags &= ~CONTINUOUS_FLAG;
 }
 
@@ -792,10 +787,7 @@ Mat& Mat::adjustROI( int dtop, int dbottom, int dleft, int dright )
     data += (row1 - ofs.y)*step + (col1 - ofs.x)*esz;
     rows = row2 - row1; cols = col2 - col1;
     size.p[0] = rows; size.p[1] = cols;
-    if( esz*cols == step[0] || rows == 1 )
-        flags |= CONTINUOUS_FLAG;
-    else
-        flags &= ~CONTINUOUS_FLAG;
+    updateContinuityFlag();
     return *this;
 }
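The refactor above turns the continuity check into a pure function of (flags, dims, sizes, steps), shared by Mat, UMat, and GpuMat, and the overflow guard t == (uint64)(int)t is what the push_back regression test later in this diff exercises. A standalone sketch of that logic — illustrative only, with a stand-in flag constant and an explicit channels parameter instead of CV_MAT_CN, both assumptions:

#include <algorithm>
#include <cstdint>
#include <cstdio>

const int CONTINUOUS_FLAG = 1 << 14;  // assumption: stand-in for Mat::CONTINUOUS_FLAG

// A matrix is continuous iff trailing dimensions pack tightly in memory
// AND the total element count still fits the int-based bookkeeping.
int updateContinuityFlag(int flags, int channels, int dims, const int* size, const size_t* step)
{
    int i, j;
    for (i = 0; i < dims; i++)
        if (size[i] > 1)
            break;

    uint64_t t = (uint64_t)size[std::min(i, dims - 1)] * channels;
    for (j = dims - 1; j > i; j--)
    {
        t *= size[j];
        if (step[j] * size[j] < step[j - 1])  // gap between rows/planes: not continuous
            break;
    }

    if (j <= i && t == (uint64_t)(int)t)
        return flags | CONTINUOUS_FLAG;
    return flags & ~CONTINUOUS_FLAG;
}

int main()
{
    int sz[] = { 4, 4 };
    size_t tight[] = { 4, 1 }, padded[] = { 8, 1 };  // row/element strides in bytes (1-channel, 1-byte)
    std::printf("tight:  %d\n", updateContinuityFlag(0, 1, 2, sz, tight)  != 0);  // prints 1
    std::printf("padded: %d\n", updateContinuityFlag(0, 1, 2, sz, padded) != 0);  // prints 0
    return 0;
}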
@@ -120,8 +120,8 @@ static Mat iplImageToMat(const IplImage* img, bool copyData)
     }
     m.datalimit = m.datastart + m.step.p[0]*m.rows;
     m.dataend = m.datastart + m.step.p[0]*(m.rows-1) + esz*m.cols;
-    m.flags |= (m.cols*esz == m.step.p[0] || m.rows == 1 ? Mat::CONTINUOUS_FLAG : 0);
     m.step[1] = esz;
+    m.updateContinuityFlag();
 
     if( copyData )
     {
@@ -5681,8 +5681,6 @@ namespace cv {
 // three funcs below are implemented in umatrix.cpp
 void setSize( UMat& m, int _dims, const int* _sz, const size_t* _steps,
               bool autoSteps = false );
-
-void updateContinuityFlag(UMat& m);
 void finalizeHdr(UMat& m);
 
 } // namespace cv
@@ -193,6 +193,7 @@ inline Size getContinuousSize( const Mat& m1, const Mat& m2,
 
 void setSize( Mat& m, int _dims, const int* _sz, const size_t* _steps, bool autoSteps=false );
 void finalizeHdr(Mat& m);
+int updateContinuityFlag(int flags, int dims, const int* size, const size_t* step);
 
 struct NoVec
 {
@@ -318,32 +318,15 @@ void setSize( UMat& m, int _dims, const int* _sz,
 }
 
-void updateContinuityFlag(UMat& m)
+void UMat::updateContinuityFlag()
 {
-    int i, j;
-    for( i = 0; i < m.dims; i++ )
-    {
-        if( m.size[i] > 1 )
-            break;
-    }
-
-    for( j = m.dims-1; j > i; j-- )
-    {
-        if( m.step[j]*m.size[j] < m.step[j-1] )
-            break;
-    }
-
-    uint64 total = (uint64)m.step[0]*m.size[0];
-    if( j <= i && total == (size_t)total )
-        m.flags |= UMat::CONTINUOUS_FLAG;
-    else
-        m.flags &= ~UMat::CONTINUOUS_FLAG;
+    flags = cv::updateContinuityFlag(flags, dims, size.p, step.p);
 }
 
 
 void finalizeHdr(UMat& m)
 {
-    updateContinuityFlag(m);
+    m.updateContinuityFlag();
     int d = m.dims;
     if( d > 2 )
         m.rows = m.cols = -1;
@@ -537,12 +520,10 @@ UMat::UMat(const UMat& m, const Range& _rowRange, const Range& _colRange)
         CV_Assert( 0 <= _colRange.start && _colRange.start <= _colRange.end && _colRange.end <= m.cols );
         cols = _colRange.size();
         offset += _colRange.start*elemSize();
-        flags &= cols < m.cols ? ~CONTINUOUS_FLAG : -1;
         flags |= SUBMATRIX_FLAG;
     }
 
-    if( rows == 1 )
-        flags |= CONTINUOUS_FLAG;
+    updateContinuityFlag();
 
     if( rows <= 0 || cols <= 0 )
     {
@@ -557,8 +538,6 @@ UMat::UMat(const UMat& m, const Rect& roi)
     allocator(m.allocator), usageFlags(m.usageFlags), u(m.u), offset(m.offset + roi.y*m.step[0]), size(&rows)
 {
     CV_Assert( m.dims <= 2 );
-    flags &= roi.width < m.cols ? ~CONTINUOUS_FLAG : -1;
-    flags |= roi.height == 1 ? CONTINUOUS_FLAG : 0;
 
     size_t esz = CV_ELEM_SIZE(flags);
     offset += roi.x*esz;
@@ -570,6 +549,7 @@ UMat::UMat(const UMat& m, const Rect& roi)
         flags |= SUBMATRIX_FLAG;
 
     step[0] = m.step[0]; step[1] = esz;
+    updateContinuityFlag();
 
     if( rows <= 0 || cols <= 0 )
     {
@@ -601,7 +581,7 @@ UMat::UMat(const UMat& m, const Range* ranges)
             flags |= SUBMATRIX_FLAG;
         }
     }
-    updateContinuityFlag(*this);
+    updateContinuityFlag();
 }
 
 UMat::UMat(const UMat& m, const std::vector<Range>& ranges)
@@ -626,7 +606,7 @@ UMat::UMat(const UMat& m, const std::vector<Range>& ranges)
             flags |= SUBMATRIX_FLAG;
         }
     }
-    updateContinuityFlag(*this);
+    updateContinuityFlag();
 }
 
 UMat UMat::diag(int d) const
@@ -652,10 +632,7 @@ UMat UMat::diag(int d) const
     m.size[1] = m.cols = 1;
     m.step[0] += (len > 1 ? esz : 0);
 
-    if( m.rows > 1 )
-        m.flags &= ~CONTINUOUS_FLAG;
-    else
-        m.flags |= CONTINUOUS_FLAG;
+    m.updateContinuityFlag();
 
     if( size() != Size(1,1) )
         m.flags |= SUBMATRIX_FLAG;
@@ -701,10 +678,7 @@ UMat& UMat::adjustROI( int dtop, int dbottom, int dleft, int dright )
     offset += (row1 - ofs.y)*step + (col1 - ofs.x)*esz;
     rows = row2 - row1; cols = col2 - col1;
     size.p[0] = rows; size.p[1] = cols;
-    if( esz*cols == step[0] || rows == 1 )
-        flags |= CONTINUOUS_FLAG;
-    else
-        flags &= ~CONTINUOUS_FLAG;
+    updateContinuityFlag();
     return *this;
 }
@@ -522,33 +522,23 @@ protected:
 
 TEST(Core_InputOutput, misc) { CV_MiscIOTest test; test.safe_run(); }
 
-/*class CV_BigMatrixIOTest : public cvtest::BaseTest
-{
-public:
-    CV_BigMatrixIOTest() {}
-    ~CV_BigMatrixIOTest() {}
-protected:
-    void run(int)
-    {
-        try
-        {
-            RNG& rng = theRNG();
-            int N = 1000, M = 1200000;
-            Mat mat(M, N, CV_32F);
-            rng.fill(mat, RNG::UNIFORM, 0, 1);
-            FileStorage fs(cv::tempfile(".xml"), FileStorage::WRITE);
-            fs << "mat" << mat;
-            fs.release();
-        }
-        catch(...)
-        {
-            ts->set_failed_test_info(cvtest::TS::FAIL_MISMATCH);
-        }
-    }
-};
-
-TEST(Core_InputOutput, huge) { CV_BigMatrixIOTest test; test.safe_run(); }
-*/
+#if 0 // 4+ GB of data, 40+ GB of estimated result size, it is very slow
+BIGDATA_TEST(Core_InputOutput, huge)
+{
+    RNG& rng = theRNG();
+    int N = 1000, M = 1200000;
+
+    std::cout << "Allocating..." << std::endl;
+    Mat mat(M, N, CV_32F);
+
+    std::cout << "Initializing..." << std::endl;
+    rng.fill(mat, RNG::UNIFORM, 0, 1);
+
+    std::cout << "Writing..." << std::endl;
+    {
+        FileStorage fs(cv::tempfile(".xml"), FileStorage::WRITE);
+        fs << "mat" << mat;
+        fs.release();
+    }
+}
+#endif
 
 TEST(Core_globbing, accuracy)
 {
@@ -1766,4 +1766,26 @@ TEST(Mat_, template_based_ptr)
     ASSERT_FLOAT_EQ(66.0f, *(mat.ptr<float>(idx)));
 }
 
+
+BIGDATA_TEST(Mat, push_back_regression_4158)  // memory usage: ~10.6 Gb
+{
+    Mat result;
+
+    Mat tail(100, 500000, CV_32FC2, Scalar(1, 2));
+
+    tail.copyTo(result);
+    for (int i = 1; i < 15; i++)
+    {
+        result.push_back(tail);
+        std::cout << "i = " << i << "  result = " << result.size() << "  used = " << (uint64)result.total()*result.elemSize()*(1.0 / (1 << 20)) << " Mb"
+                  << "  allocated=" << (uint64)(result.datalimit - result.datastart)*(1.0 / (1 << 20)) << " Mb" << std::endl;
+    }
+    for (int i = 0; i < 15; i++)
+    {
+        Rect roi(0, tail.rows * i, tail.cols, tail.rows);
+        int nz = countNonZero(result(roi).reshape(1) == 2);
+        EXPECT_EQ(tail.total(), (size_t)nz) << "i=" << i;
+    }
+}
+
 }} // namespace
@@ -137,12 +137,11 @@ void cv::cuda::meanStdDev(InputArray _src, OutputArray _dst, Stream& stream)
     if (!deviceSupports(FEATURE_SET_COMPUTE_13))
         CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capebility");
 
-    GpuMat src = getInputMat(_src, stream);
+    const GpuMat src = getInputMat(_src, stream);
 
     CV_Assert( src.type() == CV_8UC1 );
 
-    _dst.create(1, 2, CV_64FC1);
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, 1, 2, CV_64FC1, stream);
 
     NppiSize sz;
     sz.width = src.cols;
|
||||
CV_OUT std::vector<int>& indices,
|
||||
const float eta = 1.f, const int top_k = 0);
|
||||
|
||||
CV_EXPORTS void NMSBoxes(const std::vector<RotatedRect>& bboxes, const std::vector<float>& scores,
|
||||
const float score_threshold, const float nms_threshold,
|
||||
CV_OUT std::vector<int>& indices,
|
||||
const float eta = 1.f, const int top_k = 0);
|
||||
|
||||
//! @}
|
||||
CV__DNN_EXPERIMENTAL_NS_END
|
||||
|
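A hedged usage sketch for the RotatedRect overload declared above (the boxes, scores, and thresholds are made-up values; the Rect overload a few lines earlier works the same way):

#include <opencv2/dnn.hpp>
#include <vector>

int main()
{
    std::vector<cv::RotatedRect> boxes;
    std::vector<float> scores;
    boxes.push_back(cv::RotatedRect(cv::Point2f(50, 50), cv::Size2f(40, 20), 30.f));
    boxes.push_back(cv::RotatedRect(cv::Point2f(52, 51), cv::Size2f(40, 20), 32.f));  // near-duplicate
    scores.push_back(0.9f);
    scores.push_back(0.6f);

    std::vector<int> keep;
    // score_threshold filters weak detections; nms_threshold is the overlap cutoff
    cv::dnn::NMSBoxes(boxes, scores, 0.5f, 0.4f, keep);
    // keep now holds the indices of the surviving boxes (here, just box 0)
    return 0;
}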
@@ -121,7 +121,9 @@ PERF_TEST_P_(DNNTestNetwork, Inception_5h)
 
 PERF_TEST_P_(DNNTestNetwork, ENet)
 {
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE) throw SkipTestException("");
+    if ((backend == DNN_BACKEND_INFERENCE_ENGINE) ||
+        (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16))
+        throw SkipTestException("");
     processNet("dnn/Enet-model-best.net", "", "enet.yml",
             Mat(cv::Size(512, 256), CV_32FC3));
 }
@@ -232,7 +234,8 @@ const tuple<DNNBackend, DNNTarget> testCases[] = {
     tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
 #endif
     tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_CPU),
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL)
+    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL),
+    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL_FP16)
 };
 
 INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, testing::ValuesIn(testCases));
|
||||
// this option is useful to run valgrind memory errors detection
|
||||
static bool DNN_DISABLE_MEMORY_OPTIMIZATIONS = utils::getConfigurationParameterBool("OPENCV_DNN_DISABLE_MEMORY_OPTIMIZATIONS", false);
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
static bool DNN_OPENCL_ALLOW_ALL_DEVICES = utils::getConfigurationParameterBool("OPENCV_DNN_OPENCL_ALLOW_ALL_DEVICES", false);
|
||||
#endif
|
||||
|
||||
using std::vector;
|
||||
using std::map;
|
||||
using std::make_pair;
|
||||
@ -497,7 +501,7 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool forceCreate)
|
||||
void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool forceCreate, bool use_half)
|
||||
{
|
||||
if (!DNN_DISABLE_MEMORY_OPTIMIZATIONS && !forceCreate)
|
||||
{
|
||||
@ -538,14 +542,14 @@ public:
|
||||
{
|
||||
// if dst already has been allocated with total(shape) elements,
|
||||
// it won't be recrreated and pointer of dst.data remains the same.
|
||||
dst.create(shape, CV_32F);
|
||||
dst.create(shape, use_half ? CV_16S : CV_32F);
|
||||
addHost(lp, dst);
|
||||
}
|
||||
}
|
||||
|
||||
void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes,
|
||||
std::vector<LayerPin>& pinsForInternalBlobs,
|
||||
bool forceCreate = false)
|
||||
bool forceCreate = false, bool use_half = false)
|
||||
{
|
||||
CV_TRACE_FUNCTION();
|
||||
|
||||
@ -616,7 +620,7 @@ public:
|
||||
reuse(ld.inputBlobsId[0], blobPin);
|
||||
}
|
||||
else
|
||||
reuseOrCreate(shapes[index], blobPin, *blobs[index], forceCreate);
|
||||
reuseOrCreate(shapes[index], blobPin, *blobs[index], forceCreate, use_half);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -654,7 +658,7 @@ static Ptr<BackendWrapper> wrapMat(int backendId, int targetId, cv::Mat& m)
|
||||
{
|
||||
if (targetId == DNN_TARGET_CPU)
|
||||
return Ptr<BackendWrapper>();
|
||||
else if (targetId == DNN_TARGET_OPENCL)
|
||||
else if (IS_DNN_OPENCL_TARGET(targetId))
|
||||
return OpenCLBackendWrapper::create(m);
|
||||
else
|
||||
CV_Error(Error::StsNotImplemented, "Unknown target identifier");
|
||||
@ -719,6 +723,7 @@ struct Net::Impl
|
||||
bool netWasAllocated;
|
||||
bool fusion;
|
||||
std::vector<int64> layersTimings;
|
||||
Mat output_blob;
|
||||
|
||||
Ptr<BackendWrapper> wrap(Mat& host)
|
||||
{
|
||||
@ -735,7 +740,7 @@ struct Net::Impl
|
||||
Ptr<BackendWrapper> baseBuffer = backendWrappers[data];
|
||||
if (preferableBackend == DNN_BACKEND_DEFAULT)
|
||||
{
|
||||
CV_Assert(preferableTarget == DNN_TARGET_OPENCL);
|
||||
CV_Assert(IS_DNN_OPENCL_TARGET(preferableTarget));
|
||||
return OpenCLBackendWrapper::create(baseBuffer, host);
|
||||
}
|
||||
else if (preferableBackend == DNN_BACKEND_HALIDE)
|
||||
@ -847,12 +852,22 @@ struct Net::Impl
|
||||
|
||||
if (!netWasAllocated || this->blobsToKeep != blobsToKeep_)
|
||||
{
|
||||
if (preferableBackend == DNN_BACKEND_DEFAULT && IS_DNN_OPENCL_TARGET(preferableTarget))
|
||||
#ifndef HAVE_OPENCL
|
||||
if (preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_OPENCL)
|
||||
{
|
||||
CV_LOG_WARNING(NULL, "DNN: OpenCL target is not available in this OpenCV build, switching to CPU.")
|
||||
CV_LOG_WARNING(NULL, "DNN: OpenCL target is not available in this OpenCV build, switching to CPU.");
|
||||
preferableTarget = DNN_TARGET_CPU;
|
||||
}
|
||||
#else
|
||||
{
|
||||
if (!DNN_OPENCL_ALLOW_ALL_DEVICES
|
||||
&& !(ocl::Device::getDefault().isIntel() && ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU) // Current implementation is only valid for Intel GPU (#11494)
|
||||
)
|
||||
{
|
||||
CV_LOG_WARNING(NULL, "DNN: OpenCL target is not supported with current OpenCL device (tested with Intel GPUs only), switching to CPU.");
|
||||
preferableTarget = DNN_TARGET_CPU;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
clear();
|
||||
|
||||
@ -1022,7 +1037,7 @@ struct Net::Impl
|
||||
{
|
||||
CV_TRACE_FUNCTION();
|
||||
if (preferableBackend == DNN_BACKEND_DEFAULT)
|
||||
CV_Assert(preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_OPENCL);
|
||||
CV_Assert(preferableTarget == DNN_TARGET_CPU || IS_DNN_OPENCL_TARGET(preferableTarget));
|
||||
else if (preferableBackend == DNN_BACKEND_HALIDE)
|
||||
initHalideBackend();
|
||||
else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
|
||||
@ -1357,7 +1372,9 @@ struct Net::Impl
|
||||
|
||||
std::vector<LayerPin> pinsForInternalBlobs;
|
||||
blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs,
|
||||
preferableBackend == DNN_BACKEND_INFERENCE_ENGINE);
|
||||
preferableBackend == DNN_BACKEND_INFERENCE_ENGINE,
|
||||
preferableBackend == DNN_BACKEND_DEFAULT &&
|
||||
preferableTarget == DNN_TARGET_OPENCL_FP16);
|
||||
ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
|
||||
for (int i = 0; i < ld.outputBlobs.size(); ++i)
|
||||
{
|
||||
@ -1427,7 +1444,7 @@ struct Net::Impl
|
||||
// some other layers.
|
||||
|
||||
// TODO: OpenCL target support more fusion styles.
|
||||
if ( preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_OPENCL &&
|
||||
if ( preferableBackend == DNN_BACKEND_DEFAULT && IS_DNN_OPENCL_TARGET(preferableTarget) &&
|
||||
(!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
|
||||
ld.layerInstance->type != "MVN")) )
|
||||
continue;
|
||||
@ -1466,8 +1483,8 @@ struct Net::Impl
|
||||
continue; // Go to the next layer.
|
||||
|
||||
// For now, OpenCL target support fusion with activation of ReLU/ChannelsPReLU/Power/Tanh
|
||||
if ( preferableTarget != DNN_TARGET_OPENCL ||
|
||||
(preferableTarget == DNN_TARGET_OPENCL &&
|
||||
if ( !IS_DNN_OPENCL_TARGET(preferableTarget) ||
|
||||
(IS_DNN_OPENCL_TARGET(preferableTarget) &&
|
||||
nextData &&
|
||||
((nextData->type == "ReLU") ||
|
||||
(nextData->type == "ChannelsPReLU") ||
|
||||
@ -1490,7 +1507,7 @@ struct Net::Impl
|
||||
ld.outputBlobs = layers[lpNext.lid].outputBlobs;
|
||||
ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;
|
||||
|
||||
if ( preferableTarget == DNN_TARGET_OPENCL )
|
||||
if ( IS_DNN_OPENCL_TARGET(preferableTarget) )
|
||||
{
|
||||
if ( !activData->consumers.empty() )
|
||||
{
|
||||
@ -1502,7 +1519,7 @@ struct Net::Impl
|
||||
}
|
||||
|
||||
// fuse convlution layer followed by eltwise + relu
|
||||
if ( preferableTarget == DNN_TARGET_OPENCL )
|
||||
if ( IS_DNN_OPENCL_TARGET(preferableTarget) )
|
||||
{
|
||||
Ptr<EltwiseLayer> nextEltwiseLayer;
|
||||
if( nextData )
|
||||
@ -1715,6 +1732,13 @@ struct Net::Impl
|
||||
for(int i = 0; i < layers[0].outputBlobs.size(); i++)
|
||||
{
|
||||
CV_Assert(layers[0].outputBlobs[i].total());
|
||||
if (layers[0].outputBlobs[i].depth() == CV_32F &&
|
||||
preferableBackend == DNN_BACKEND_DEFAULT &&
|
||||
preferableTarget == DNN_TARGET_OPENCL_FP16)
|
||||
{
|
||||
Mat mat = layers[0].outputBlobs[i].clone();
|
||||
convertFp16(mat, layers[0].outputBlobs[i]);
|
||||
}
|
||||
inputShapes.push_back(shape(layers[0].outputBlobs[i]));
|
||||
}
|
||||
LayersShapesMap layersShapes;
|
||||
@ -1760,7 +1784,7 @@ struct Net::Impl
|
||||
{
|
||||
if( !ld.skip )
|
||||
{
|
||||
if (preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_OPENCL)
|
||||
if (preferableBackend == DNN_BACKEND_DEFAULT && IS_DNN_OPENCL_TARGET(preferableTarget))
|
||||
{
|
||||
std::vector<UMat> umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
|
||||
layer->forward(OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers),
|
||||
@ -1925,7 +1949,14 @@ struct Net::Impl
|
||||
// Transfer data to CPU if it's require.
|
||||
ld.outputBlobsWrappers[pin.oid]->copyToHost();
|
||||
}
|
||||
return ld.outputBlobs[pin.oid];
|
||||
|
||||
if (ld.outputBlobs[pin.oid].depth() == CV_16S)
|
||||
{
|
||||
convertFp16(ld.outputBlobs[pin.oid], output_blob);
|
||||
return output_blob;
|
||||
}
|
||||
else
|
||||
return ld.outputBlobs[pin.oid];
|
||||
}
|
||||
|
||||
Mat getBlob(String outputName)
|
||||
@ -2068,7 +2099,7 @@ void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
|
||||
|
||||
if (outputBlobs.isUMat())
|
||||
{
|
||||
outputBlobs.assign(ld.outputBlobs[pin.oid].getUMat(ACCESS_RW));
|
||||
outputBlobs.assign(impl->getBlob(layerName).getUMat(ACCESS_RW));
|
||||
}
|
||||
else if (outputBlobs.isMat())
|
||||
{
|
||||
@ -2084,17 +2115,33 @@ void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
|
||||
ld.outputBlobsWrappers[i]->copyToHost();
|
||||
}
|
||||
}
|
||||
std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
|
||||
outputvec = ld.outputBlobs;
|
||||
if (ld.outputBlobs[0].depth() == CV_32F)
|
||||
{
|
||||
std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
|
||||
outputvec = ld.outputBlobs;
|
||||
} else {
|
||||
std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
|
||||
outputvec.resize(ld.outputBlobs.size());
|
||||
for (int i = 0; i < outputvec.size(); i++)
|
||||
convertFp16(ld.outputBlobs[i], outputvec[i]);
|
||||
}
|
||||
}
|
||||
else if (outputBlobs.isUMatVector())
|
||||
{
|
||||
std::vector<UMat> & outputvec = *(std::vector<UMat> *)outputBlobs.getObj();
|
||||
|
||||
if (impl->preferableBackend == DNN_BACKEND_DEFAULT &&
|
||||
impl->preferableTarget == DNN_TARGET_OPENCL)
|
||||
IS_DNN_OPENCL_TARGET(impl->preferableTarget))
|
||||
{
|
||||
outputvec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
|
||||
if (impl->preferableTarget == DNN_TARGET_OPENCL)
|
||||
outputvec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
|
||||
else if (impl->preferableTarget == DNN_TARGET_OPENCL_FP16)
|
||||
{
|
||||
std::vector<UMat> out_vec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
|
||||
outputvec.resize(out_vec.size());
|
||||
for (int i = 0; i < out_vec.size(); i++)
|
||||
convertFp16(out_vec[i], outputvec[i]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -2182,6 +2229,16 @@ void Net::setPreferableTarget(int targetId)
|
||||
if( impl->preferableTarget != targetId )
|
||||
{
|
||||
impl->preferableTarget = targetId;
|
||||
if (IS_DNN_OPENCL_TARGET(targetId))
|
||||
{
|
||||
#ifndef HAVE_OPENCL
|
||||
impl->preferableTarget = DNN_TARGET_CPU;
|
||||
#else
|
||||
bool fp16 = ocl::Device::getDefault().isExtensionSupported("cl_khr_fp16");
|
||||
if (!fp16 && targetId == DNN_TARGET_OPENCL_FP16)
|
||||
impl->preferableTarget = DNN_TARGET_OPENCL;
|
||||
#endif
|
||||
}
|
||||
impl->netWasAllocated = false;
|
||||
impl->clear();
|
||||
}
|
||||
@ -2210,7 +2267,17 @@ void Net::setInput(InputArray blob, const String& name)
|
||||
ld.outputBlobs.resize( std::max(pin.oid+1, (int)ld.requiredOutputs.size()) );
|
||||
ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
|
||||
MatShape prevShape = shape(ld.outputBlobs[pin.oid]);
|
||||
Mat blob_ = blob.getMat();
|
||||
Mat blob_;
|
||||
if (impl->preferableBackend == DNN_BACKEND_DEFAULT &&
|
||||
impl->preferableTarget == DNN_TARGET_OPENCL_FP16)
|
||||
{
|
||||
Mat blob_mat = blob.getMat();
|
||||
convertFp16(blob_mat, blob_);
|
||||
}
|
||||
else
|
||||
{
|
||||
blob_ = blob.getMat();
|
||||
}
|
||||
bool oldShape = prevShape == shape(blob_);
|
||||
if (oldShape)
|
||||
{
|
||||
@ -2735,6 +2802,43 @@ void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays
|
||||
CV_TRACE_FUNCTION();
|
||||
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
|
||||
|
||||
if (preferableTarget == DNN_TARGET_OPENCL_FP16 && inputs_arr.depth() == CV_16S)
|
||||
{
|
||||
std::vector<UMat> inputs;
|
||||
std::vector<UMat> outputs;
|
||||
std::vector<UMat> internals;
|
||||
|
||||
std::vector<UMat> orig_inputs;
|
||||
std::vector<UMat> orig_outputs;
|
||||
std::vector<UMat> orig_internals;
|
||||
|
||||
inputs_arr.getUMatVector(orig_inputs);
|
||||
outputs_arr.getUMatVector(orig_outputs);
|
||||
internals_arr.getUMatVector(orig_internals);
|
||||
|
||||
inputs.resize(orig_inputs.size());
|
||||
for (size_t i = 0; i < orig_inputs.size(); i++)
|
||||
convertFp16(orig_inputs[i], inputs[i]);
|
||||
|
||||
outputs.resize(orig_outputs.size());
|
||||
for (size_t i = 0; i < orig_outputs.size(); i++)
|
||||
outputs[i].create(shape(orig_outputs[i]), CV_32F);
|
||||
|
||||
internals.resize(orig_internals.size());
|
||||
for (size_t i = 0; i < orig_internals.size(); i++)
|
||||
internals[i].create(shape(orig_internals[i]), CV_32F);
|
||||
|
||||
forward(inputs, outputs, internals);
|
||||
|
||||
for (size_t i = 0; i < outputs.size(); i++)
|
||||
convertFp16(outputs[i], orig_outputs[i]);
|
||||
|
||||
// sync results back
|
||||
outputs_arr.assign(orig_outputs);
|
||||
internals_arr.assign(orig_internals);
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<Mat> inpvec;
|
||||
std::vector<Mat> outputs;
|
||||
std::vector<Mat> internals;
|
||||
|
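From the user's side, all of the FP16 plumbing above is reached through one call. A hedged C++ sketch (model file names and input are placeholders; per the setPreferableTarget change above, the target silently falls back to DNN_TARGET_OPENCL on devices without cl_khr_fp16):

#include <opencv2/dnn.hpp>

int main()
{
    cv::dnn::Net net = cv::dnn::readNetFromCaffe("deploy.prototxt", "weights.caffemodel");
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL_FP16);  // new target from this merge

    cv::Mat image(cv::Size(224, 224), CV_32FC3, cv::Scalar::all(0));  // placeholder input
    net.setInput(cv::dnn::blobFromImage(image));  // converted to CV_16S internally
    cv::Mat out = net.forward();                  // converted back to CV_32F for the caller
    return 0;
}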
@@ -120,12 +120,16 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inputs_.depth() == CV_16S);
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);

CV_Assert(blobs.size() >= 2);
CV_Assert(inputs.size() == 1);

if (use_half && inputs[0].dims == 2)
return false;

if (umat_weight.empty())
{
umat_weight = weights_.getUMat(ACCESS_READ);

@@ -139,6 +143,7 @@ public:
int rows = inpBlob.dims > 2 ? inpBlob.size[2] : 1;
int cols = inpBlob.dims > 2 ? inpBlob.size[3] : 1;

String opts = (use_half) ? " -DDtype=half" : " -DDtype=float";
for (size_t ii = 0; ii < outputs.size(); ii++)
{
if (inpBlob.dims == 2)

@@ -154,8 +159,12 @@ public:
UMat src = inputs[ii].reshape(1, s.size(), &s[0]);
UMat dst = outputs[ii].reshape(1, s.size(), &s[0]);
int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
String buildopt = format("-DNUM=%d", number);
String buildopt = format("-DNUM=%d", number) + opts;
String kname = format("batch_norm%d", number);
if (number == 1)
buildopt += format(" -Dconvert_T=convert_%s", use_half ? "half" : "float");
else
buildopt += format(" -Dconvert_T=convert_%s%d", use_half ? "half" : "float", number);
ocl::Kernel kernel(kname.c_str(), ocl::dnn::batchnorm_oclsrc, buildopt);
if (kernel.empty())
return false;

@@ -181,7 +190,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

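The batch-norm hunk selects a vectorization width from the channel count and bakes the element type into the OpenCL build options. A worked example of the string it assembles, with an invented channel count of 64 and a half-precision input:

int number = (64 % 8 == 0) ? 8 : ((64 % 4 == 0) ? 4 : 1);              // -> 8
cv::String buildopt = cv::format("-DNUM=%d", number) + " -DDtype=half";
buildopt += cv::format(" -Dconvert_T=convert_%s%d", "half", number);
// buildopt == "-DNUM=8 -DDtype=half -Dconvert_T=convert_half8"
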
@@ -95,7 +95,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -128,14 +128,14 @@ public:
for( i = 0; i < ninputs; i++ )
{
Mat& inp = *inputs[i];
CV_Assert( inp.isContinuous() && inp.type() == CV_32F &&
CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S) &&
inp.dims == 4 && inp.size[0] == output.size[0] &&
inp.size[2] == output.size[2] &&
inp.size[3] == output.size[3] );
nchannels += inp.size[1];
}
CV_Assert( nchannels == output.size[1] );
CV_Assert( output.isContinuous() && output.type() == CV_32F );
CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S) );

cc.chptrs.resize(nchannels*batchsz);

@@ -186,6 +186,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);

@@ -199,11 +200,12 @@ public:
int num_concats = total(shape(inputs[0]), 0, cAxis);
int offset_concat_axis = 0;
UMat& outMat = outputs[0];
String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0].type()) + String(" ");
String buildopt = format(" -DDtype=%s", (use_half) ? "half" : "float");
String kname = format("concat_%s", use_half ? "half" : "float");

for (size_t i = 0; i < inputs.size(); i++)
{
ocl::Kernel kernel("concat", ocl::dnn::concat_oclsrc, buildopt);
ocl::Kernel kernel(kname.c_str(), ocl::dnn::concat_oclsrc, buildopt);
if (kernel.empty())
return false;

@@ -235,7 +237,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -94,7 +94,7 @@ public:
CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height);

const Mat &input = *inputs[0];
CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F));
CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F || input.type() == CV_16S));
for (size_t i = 0; i < inputs.size(); i++)
{
CV_Assert(inputs[i]->type() == input.type());

@@ -288,7 +288,7 @@ public:
newActiv = true;
activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;

if (preferableTarget == DNN_TARGET_OPENCL)
if (IS_DNN_OPENCL_TARGET(preferableTarget))
{
Ptr<PowerLayer> activ_power = activ.dynamicCast<PowerLayer>();
if (!activ_power.empty())

@@ -842,6 +842,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);

@@ -860,6 +861,7 @@ public:
config.dilation = dilation;
config.group = inputs[0].size[1] / umat_blobs[0].size[1];
config.bias_term = (hasBias()) ? true : false;
config.use_half = use_half;

convolutionOp = Ptr<OCL4DNNConvSpatial<float> >(new OCL4DNNConvSpatial<float>(config));
}

@@ -964,7 +966,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -1360,6 +1362,9 @@ public:
std::vector<UMat> outputs;
std::vector<UMat> internals;

if (inputs_.depth() == CV_16S)
return false;

inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals);

@@ -1450,7 +1455,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -307,8 +307,24 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
bool use_half = (inps.depth() == CV_16S);
if (use_half)
{
std::vector<UMat> orig_inputs;
std::vector<UMat> orig_outputs;

inps.getUMatVector(orig_inputs);
outs.getUMatVector(orig_outputs);

inputs.resize(orig_inputs.size());
for (size_t i = 0; i < orig_inputs.size(); i++)
convertFp16(orig_inputs[i], inputs[i]);
}
else
{
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
}

std::vector<LabelBBox> allDecodedBBoxes;
std::vector<Mat> allConfidenceScores;

@@ -342,7 +358,13 @@ public:
{
// Set confidences to zeros.
Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)};
outputs[0](ranges).setTo(0);
if (use_half)
{
std::vector<UMat> orig_outputs;
outs.getUMatVector(orig_outputs);
orig_outputs[0](ranges).setTo(0);
} else
outputs[0](ranges).setTo(0);
return true;
}
int outputShape[] = {1, 1, (int)numKept, 7};

@@ -360,9 +382,23 @@ public:
}
CV_Assert(count == numKept);
}
outputs.clear();
outputs.push_back(umat);
outs.assign(outputs);

if (use_half)
{
UMat half_umat;
convertFp16(umat, half_umat);

std::vector<UMat> orig_outputs;
outs.getUMatVector(orig_outputs);
orig_outputs.clear();
orig_outputs.push_back(half_umat);
outs.assign(orig_outputs);
} else {
outputs.clear();
outputs.push_back(umat);
outs.assign(outputs);
}

return true;
}
#endif

@@ -372,7 +408,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -176,7 +176,7 @@ public:
{
CV_TRACE_FUNCTION();

CV_OCL_RUN((this->preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(this->preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
func.applyOCL(inputs_arr, outputs_arr, internals_arr))

@@ -223,7 +223,12 @@ public:
#ifdef HAVE_OPENCL
static String oclGetTMacro(const UMat &m)
{
return String("-DT=") + ocl::typeToStr(m.type()) + String(" ");
String str_name = ocl::typeToStr(m.type());

if (str_name == "short")
str_name = "half";

return format("-DT=%s -Dconvert_T=convert_%s ", str_name.c_str(), str_name.c_str());
}
#endif

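FP16 blobs travel through the DNN module in CV_16S storage, so ocl::typeToStr reports them as "short"; the remap above is what lets the activation kernels compile against OpenCL's native half type. A sketch of the resulting macro strings, under that assumption:

// For a CV_16S UMat carrying FP16 data:
//   oclGetTMacro(m) -> "-DT=half -Dconvert_T=convert_half "
// For a CV_32F UMat:
//   oclGetTMacro(m) -> "-DT=float -Dconvert_T=convert_float "
cv::UMat m(1, 16, CV_16S);
CV_Assert(cv::String(cv::ocl::typeToStr(m.type())) == "short");
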
@@ -516,8 +521,28 @@ struct SigmoidFunctor
#ifdef HAVE_OPENCL
bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{
// TODO: implement OCL version
return false;
std::vector<UMat> inputs;
std::vector<UMat> outputs;

inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
String buildopt = oclGetTMacro(inputs[0]);

for (size_t i = 0; i < inputs.size(); i++)
{
UMat& src = inputs[i];
UMat& dst = outputs[i];

ocl::Kernel kernel("SigmoidForward", ocl::dnn::activations_oclsrc, buildopt);
kernel.set(0, (int)src.total());
kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));

size_t gSize = src.total();
CV_Assert(kernel.run(1, &gSize, NULL, false));
}

return true;
}
#endif

@@ -561,8 +586,28 @@ struct ELUFunctor
#ifdef HAVE_OPENCL
bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{
// TODO: implement OCL version
return false;
std::vector<UMat> inputs;
std::vector<UMat> outputs;

inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
String buildopt = oclGetTMacro(inputs[0]);

for (size_t i = 0; i < inputs.size(); i++)
{
UMat& src = inputs[i];
UMat& dst = outputs[i];

ocl::Kernel kernel("ELUForward", ocl::dnn::activations_oclsrc, buildopt);
kernel.set(0, (int)src.total());
kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));

size_t gSize = src.total();
CV_Assert(kernel.run(1, &gSize, NULL, false));
}

return true;
}
#endif

@@ -604,8 +649,28 @@ struct AbsValFunctor
#ifdef HAVE_OPENCL
bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{
// TODO: implement OCL version
return false;
std::vector<UMat> inputs;
std::vector<UMat> outputs;

inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
String buildopt = oclGetTMacro(inputs[0]);

for (size_t i = 0; i < inputs.size(); i++)
{
UMat& src = inputs[i];
UMat& dst = outputs[i];

ocl::Kernel kernel("AbsValForward", ocl::dnn::activations_oclsrc, buildopt);
kernel.set(0, (int)src.total());
kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));

size_t gSize = src.total();
CV_Assert(kernel.run(1, &gSize, NULL, false));
}

return true;
}
#endif

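The three activation bodies above are identical except for the kernel entry point; each launches one work-item per element with the type macro from oclGetTMacro. A shared helper expressing the common pattern might look like this (a hypothetical refactoring, not part of the patch):

static bool runUnaryOCL(const char* kname, cv::InputArrayOfArrays inps,
                        cv::OutputArrayOfArrays outs)
{
    std::vector<cv::UMat> inputs, outputs;
    inps.getUMatVector(inputs);
    outs.getUMatVector(outputs);
    cv::String buildopt = oclGetTMacro(inputs[0]);   // "-DT=half ..." or "-DT=float ..."
    for (size_t i = 0; i < inputs.size(); i++)
    {
        cv::ocl::Kernel kernel(kname, cv::ocl::dnn::activations_oclsrc, buildopt);
        if (kernel.empty())
            return false;
        kernel.set(0, (int)inputs[i].total());
        kernel.set(1, cv::ocl::KernelArg::PtrReadOnly(inputs[i]));
        kernel.set(2, cv::ocl::KernelArg::PtrWriteOnly(outputs[i]));
        size_t gSize = inputs[i].total();            // one work-item per element
        if (!kernel.run(1, &gSize, NULL, false))
            return false;
    }
    return true;
}

With such a helper the three bodies would reduce to runUnaryOCL("SigmoidForward", ...), runUnaryOCL("ELUForward", ...) and runUnaryOCL("AbsValForward", ...).
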
@@ -271,6 +271,9 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

if (inputs_.depth() == CV_16S && op != SUM)
return false;

inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);

@@ -284,10 +287,15 @@ public:
{
size_t localsize[] = { 128 };
size_t globalsize[] = { (size_t)channels / 4 * localsize[0] };
String opts;
if (inputs_.depth() == CV_16S)
opts = " -DDtype=half -DDtype4=half4 -DDtype8=half8";
else
opts = " -DDtype=float -DDtype4=float4 -DDtype8=float8";

for (int i = 0; i < (inputs.size() - 1); ++i)
{
String buildopt = format("-DLOOP=%d", i);
String buildopt = format("-DLOOP=%d", i) + opts;
ocl::Kernel kernel("op_sum4", ocl::dnn::eltwise_oclsrc, buildopt);
int idx = 0;
UMat inpMat = (i == 0) ? inputs[0] : UMat();

@@ -306,6 +314,9 @@ public:
}
else
{
if (inputs_.depth() == CV_16S)
return false;

float coeff1 = coeffs.empty() ? 1.f : coeffs[0];
float coeff2 = coeffs.empty() ? 1.f : coeffs[1];
UMat mul0, mul1;

@@ -343,7 +354,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -140,7 +140,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
outputs_arr.isUMatVector() &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -64,6 +64,7 @@ public:
#ifdef HAVE_OPENCL
Ptr<OCL4DNNInnerProduct<float> > innerProductOp;
std::vector<UMat> umat_blobs;
std::vector<UMat> half_blobs;
#endif

FullyConnectedLayerImpl(const LayerParams& params)

@@ -277,6 +278,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);

@@ -293,6 +295,17 @@ public:
config.bias_term = bias;
config.M = outerSize;
config.K = innerSize;
config.use_half = use_half;

if (use_half)
{
half_blobs.resize(umat_blobs.size());
for (int i = 0; i < umat_blobs.size(); i++)
{
if (!umat_blobs[i].empty())
convertFp16(umat_blobs[i], half_blobs[i]);
}
}

innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config));
}

@@ -309,13 +322,15 @@ public:
dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);
dstMat.setTo(0.0f);

if (!innerProductOp->Forward(srcMat, umat_blobs[0], (bias) ? umat_blobs[1] : UMat(), dstMat))
if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0],
(bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(),
dstMat))
{
ret = false;
break;
}

if (bias && (outerSize > 1))
if (!use_half && bias && (outerSize > 1))
{
UMat& biases = umat_blobs[1];
cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);

@@ -353,7 +368,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

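The inner-product hunks convert the FP32 weight and bias blobs to FP16 once, when the OCL4DNNInnerProduct object is configured, rather than on every forward pass; each call then just selects the matching-precision blob. Condensed for illustration (names taken from the surrounding class):

// one-time setup
if (use_half)
{
    half_blobs.resize(umat_blobs.size());
    for (size_t i = 0; i < umat_blobs.size(); i++)
        if (!umat_blobs[i].empty())
            convertFp16(umat_blobs[i], half_blobs[i]);   // cache the FP16 copy
}
// per forward pass
const UMat& w = use_half ? half_blobs[0] : umat_blobs[0];
UMat b = bias ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat();
if (!innerProductOp->Forward(srcMat, w, b, dstMat))
    return false;

Note the separate gemm-based bias broadcast stays FP32-only (the new `!use_half && bias` guard), since cv::gemm has no half-precision path.
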
@@ -106,6 +106,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);

@@ -128,6 +129,7 @@ public:
config.height = inputs[0].size[2];
config.width = inputs[0].size[3];
config.norm_by_size = normBySize;
config.use_half = use_half;

lrnOp = Ptr<OCL4DNNLRN<float> >(new OCL4DNNLRN<float>(config));
}

@@ -146,7 +148,7 @@ public:

CV_Assert(inputs_arr.total() == outputs_arr.total());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -102,6 +102,9 @@ public:
{
UMat bnorm_weight = scale.empty() ? UMat() : scale.getUMat(ACCESS_READ);
UMat bnorm_bias = shift.empty() ? UMat() : shift.getUMat(ACCESS_READ);
bool use_half = (inputs[0].depth() == CV_16S);
String opts = format(" -DT=%s -DT4=%s -Dconvert_T=%s", use_half ? "half" : "float",
use_half ? "half4" : "float4", use_half ? "convert_half4" : "convert_float4");

int splitDim = (acrossChannels) ? 1 : 2;
for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)

@@ -111,12 +114,11 @@ public:
int newRows = total(shape(inpMat), 0, splitDim);

MatShape s = shape(newRows, inpMat.total() / newRows);
UMat oneMat = UMat::ones(s[1], 1, CV_32F);
UMat meanMat = UMat(s[0], 1, CV_32F);
UMat meanMat = UMat(s[0], 1, (use_half) ? CV_16S : CV_32F);
UMat tmpMat = UMat(s[0], s[1], CV_32F);
float alpha = 1.0f / s[1];

String buildopt = "-DNUM=4";
String buildopt = "-DNUM=4" + opts;
ocl::Kernel k("mean_fuse4", ocl::dnn::mvn_oclsrc, buildopt);
size_t localsize[] = { 128 };
size_t globalsize[] = { (size_t)s[0] / 4 * localsize[0] };

@@ -167,13 +169,14 @@ public:
int row_size = total(shape(inputs[0]), 0, splitDim);
int plane_size = total(shape(inputs[0]), splitDim);
if (normVariance && (row_size % 4 == 0) && (plane_size % 4 == 0))
{
bool ret = fast_forward_ocl(inputs, outputs);
return ret;
}
return fast_forward_ocl(inputs, outputs);

if (inputs[0].depth() == CV_16S)
return false;

UMat bnorm_weight = scale.empty() ? UMat() : scale.getUMat(ACCESS_READ);
UMat bnorm_bias = shift.empty() ? UMat() : shift.getUMat(ACCESS_READ);
String opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4");

for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
{

@@ -195,7 +198,7 @@ public:

int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
size_t global[] = { (size_t)s[0], (size_t)(s[1] / number) };
String buildopt = format("-DNUM=%d", number);
String buildopt = format("-DNUM=%d", number) + opts;
if (normVariance)
{
String kname = format("calc_mean%d", number);

@@ -249,7 +252,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -87,6 +87,9 @@ public:
std::vector<UMat> outputs;
std::vector<UMat> internals;

if (inputs_.depth() == CV_16S)
return false;

inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals);

@@ -162,7 +165,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -288,9 +288,11 @@ public:
if (!_needsPermute)
return false;

bool use_half = (inps.depth() == CV_16S);
String opts = format("-DDtype=%s", use_half ? "half" : "float");
for (size_t i = 0; i < inputs.size(); i++)
{
ocl::Kernel kernel("permute", ocl::dnn::permute_oclsrc);
ocl::Kernel kernel("permute", ocl::dnn::permute_oclsrc, opts);

kernel.set(0, (int)_count);
kernel.set(1, ocl::KernelArg::PtrReadOnly(inputs[i]));

@@ -313,7 +315,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -147,6 +147,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);

@@ -164,6 +165,7 @@ public:
(type == AVE ? LIBDNN_POOLING_METHOD_AVE :
LIBDNN_POOLING_METHOD_STO);
config.avePoolPaddedArea = avePoolPaddedArea;
config.use_half = use_half;
poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
}

@@ -189,7 +191,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -316,6 +316,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);

@@ -340,9 +341,15 @@ public:
heights.copyTo(umat_heights);
}

size_t nthreads = _layerHeight * _layerWidth;
String opts;
if (use_half)
opts = "-DDtype=half -DDtype4=half4 -Dconvert_T=convert_half4";
else
opts = "-DDtype=float -DDtype4=float4 -Dconvert_T=convert_float4";

size_t nthreads = _layerHeight * _layerWidth;
ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc, opts);

ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc);
kernel.set(0, (int)nthreads);
kernel.set(1, (float)_stepX);
kernel.set(2, (float)_stepY);

@@ -375,7 +382,7 @@ public:

// set the variance.
{
ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc);
ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc, opts);
int offset = total(shape(outputs[0]), 2);
size_t nthreads = _layerHeight * _layerWidth * _numPriors;
kernel.set(0, (int)nthreads);

@@ -395,7 +402,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -158,6 +158,9 @@ public:
std::vector<UMat> outputs;
std::vector<UMat> internals;

if (inputs_.depth() == CV_16S)
return false;

inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals);

@@ -237,7 +240,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -127,7 +127,7 @@ public:
std::vector<UMat> outputs;

// TODO: implement a logistic activation to classification scores.
if (useLogistic)
if (useLogistic || inps.depth() == CV_16S)
return false;

inps.getUMatVector(inputs);

@@ -191,7 +191,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -96,9 +96,10 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0].type()) + String(" ");
String buildopt= format("-DDtype=%s ", use_half ? "half" : "float");

for (size_t i = 0; i < inputs.size(); i++)
{

@@ -134,7 +135,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -219,7 +219,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -181,6 +181,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inputs_.depth() == CV_16S);
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);

@@ -188,6 +189,11 @@ public:
(total(shape(outputs[0]), 2) % 4 != 0))
return false;

String opts;
if (use_half)
opts = "-DDtype=half -DDtype4=half4 -DDtype8=half8";
else
opts = "-DDtype=float -DDtype4=float4 -DDtype8=float8";
const UMat& inpMat = inputs[0];
for (size_t i = 0; i < outputs.size(); i++)
{

@@ -196,7 +202,7 @@ public:
int rows = outputs[i].size[2];
int cols = outputs[i].size[3];

ocl::Kernel kernel("slice", ocl::dnn::slice_oclsrc);
ocl::Kernel kernel("slice", ocl::dnn::slice_oclsrc, opts);
size_t local[] = { 128 };
size_t global[] = { (size_t)groups * channels / 4 * local[0] };
int idx = 0;

@@ -222,7 +228,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -99,15 +99,16 @@ public:
softmaxOp.release();
}

bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays itns)
bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
std::vector<UMat> internals;

inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
itns.getUMatVector(internals);
bool use_half = (inputs_.depth() == CV_16S);
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals);

if (softmaxOp.empty())
{

@@ -117,6 +118,7 @@ public:
config.axis = axisRaw;
config.channels = inputs[0].size[axisRaw];
config.logsoftmax = logSoftMax;
config.use_half = use_half;

softmaxOp = Ptr<OCL4DNNSoftmax<float> >(new OCL4DNNSoftmax<float>(config));
}

@@ -128,15 +130,13 @@ public:
return true;

UMat& bufMat = internals[0];
src.copyTo(dstMat);

int axis = clamp(axisRaw, src.dims);
MatShape s = shape(src);
size_t outerSize = total(s, 0, axis);
size_t channels = src.size[axis];
size_t innerSize = total(s, axis + 1);

String buildOpts = String("-DT=") + ocl::typeToStr(src.type());
String buildOpts = format("-DT=%s", use_half ? "half" : "float");
ocl::Kernel kmax, ksub, ksum, kdiv;

if (!kmax.create("kernel_channel_max", ocl::dnn::softmax_oclsrc, buildOpts))

@@ -152,38 +152,31 @@ public:
if (!kdiv.create("kernel_channel_div", ocl::dnn::softmax_oclsrc, buildOpts))
return false;

size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize();
size_t bufSize = internals[0].total();
size_t totalSize = src.total();

// adjust local/global size
size_t internal_localSize[1] = { (bufSize == 1) ? 1 : wgSize };
size_t internal_globalSize[1] = { divUp(bufSize, (unsigned int)internal_localSize[0]) * internal_localSize[0] };

// adjust local/global size (total)
size_t total_localSize[1] = { (totalSize == 1) ? 1 : wgSize };
size_t total_globalSize[1] = { divUp(totalSize, (unsigned int)total_localSize[0]) * total_localSize[0] };
size_t internal_globalSize[1] = { bufSize };
size_t total_globalSize[1] = { totalSize };

kmax.args((int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat));
if (!kmax.run(1, internal_globalSize, internal_localSize, false))
ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrReadWrite(bufMat));
if (!kmax.run(1, internal_globalSize, NULL, false))
return false;

ksub.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat));
if (!ksub.run(1, total_globalSize, total_localSize, false))
ocl::KernelArg::PtrReadOnly(bufMat),
ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(dstMat));
if (!ksub.run(1, total_globalSize, NULL, false))
return false;

cv::exp(dstMat, dstMat);

ksum.args((int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat));
if (!ksum.run(1, internal_globalSize, internal_localSize, false))
if (!ksum.run(1, internal_globalSize, NULL, false))
return false;

kdiv.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat));
if (!kdiv.run(1, total_globalSize, total_localSize, false))
if (!kdiv.run(1, total_globalSize, NULL, false))
return false;

return true;

@@ -195,7 +188,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

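The four kernels above implement the numerically stable softmax decomposition: kernel_channel_max finds the per-slice maximum, kernel_channel_sub subtracts it, cv::exp exponentiates, and kernel_channel_sum / kernel_channel_div normalize. In formula form:

softmax(x)_i = exp(x_i - m) / \sum_j exp(x_j - m),  where m = \max_j x_j

Subtracting m matters doubly for FP16, whose exp() overflows already near x = 11. Passing NULL as the local size now lets the OpenCL runtime choose the workgroup shape, which is why the old maxWorkGroupSize-based rounding of the global sizes could be dropped.
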
@@ -8,6 +8,8 @@
#include "precomp.hpp"
#include "nms.inl.hpp"

#include <opencv2/imgproc.hpp>

namespace cv
{
namespace dnn

@@ -28,6 +30,27 @@ void NMSBoxes(const std::vector<Rect>& bboxes, const std::vector<float>& scores,
NMSFast_(bboxes, scores, score_threshold, nms_threshold, eta, top_k, indices, rectOverlap);
}

static inline float rotatedRectIOU(const RotatedRect& a, const RotatedRect& b)
{
std::vector<Point2f> inter;
int res = rotatedRectangleIntersection(a, b, inter);
if (inter.empty() || res == INTERSECT_NONE)
return 0.0f;
if (res == INTERSECT_FULL)
return 1.0f;
float interArea = contourArea(inter);
return interArea / (a.size.area() + b.size.area() - interArea);
}

void NMSBoxes(const std::vector<RotatedRect>& bboxes, const std::vector<float>& scores,
const float score_threshold, const float nms_threshold,
std::vector<int>& indices, const float eta, const int top_k)
{
CV_Assert(bboxes.size() == scores.size(), score_threshold >= 0,
nms_threshold >= 0, eta > 0);
NMSFast_(bboxes, scores, score_threshold, nms_threshold, eta, top_k, indices, rotatedRectIOU);
}

CV__DNN_EXPERIMENTAL_NS_END
}// dnn
}// cv

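The new overload plugs rotatedRectIOU into the same NMSFast_ template the axis-aligned path uses, computing intersection-over-union from rotatedRectangleIntersection and contourArea (hence the new imgproc include). A usage sketch with invented values:

std::vector<cv::RotatedRect> boxes;
boxes.push_back(cv::RotatedRect(cv::Point2f(50, 50), cv::Size2f(40, 20), 30.f));
boxes.push_back(cv::RotatedRect(cv::Point2f(52, 51), cv::Size2f(40, 20), 32.f));  // near-duplicate
std::vector<float> scores;
scores.push_back(0.9f);
scores.push_back(0.8f);
std::vector<int> keep;
cv::dnn::NMSBoxes(boxes, scores, /*score_threshold=*/0.5f, /*nms_threshold=*/0.4f, keep);
// expected: keep == {0}; the heavily overlapping, lower-scored box is suppressed
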
@@ -59,7 +59,8 @@ struct OCL4DNNConvConfig
stride(1, 1),
dilation(1, 1),
group(1),
bias_term(false)
bias_term(false),
use_half(false)
{}
MatShape in_shape;
MatShape out_shape;

@@ -69,6 +70,7 @@ struct OCL4DNNConvConfig
Size dilation;
int group; // = 1;
bool bias_term; // = false;
bool use_half; // = false;
};

typedef enum {

@@ -272,6 +274,8 @@ class OCL4DNNConvSpatial
int32_t group_;
bool bias_term_;
UMat swizzled_weights_umat;
UMat weights_half;
UMat bias_half;
UMat bottom_data2_;

int32_t bottom_index_;

@@ -327,6 +331,7 @@ class OCL4DNNConvSpatial
ocl4dnnFusedActiv_t fused_activ_;
float power_;
bool fused_eltwise_;
bool use_half_;
};

typedef enum {

@@ -345,7 +350,8 @@ struct OCL4DNNPoolConfig
channels(0),
pool_method(LIBDNN_POOLING_METHOD_MAX),
global_pooling(false),
avePoolPaddedArea(false)
avePoolPaddedArea(true),
use_half(false)
{}
MatShape in_shape;
MatShape out_shape;

@@ -358,6 +364,7 @@ struct OCL4DNNPoolConfig
ocl4dnnPoolingMethod_t pool_method; // = LIBDNN_POOLING_METHOD_MAX;
bool global_pooling; // = false;
bool avePoolPaddedArea;
bool use_half;
};

template<typename Dtype>

@@ -391,13 +398,14 @@ class OCL4DNNPool
int32_t pooled_height_;
int32_t pooled_width_;
bool avePoolPaddedArea;
bool use_half;
};

struct OCL4DNNInnerProductConfig
{
OCL4DNNInnerProductConfig() :
num_output(0), M(0), K(0),
bias_term(false), transpose(false), phase_test(true)
bias_term(false), transpose(false), phase_test(true), use_half(false)
{}
int num_output;
int M;

@@ -405,6 +413,7 @@ struct OCL4DNNInnerProductConfig
bool bias_term;
bool transpose; // = false;
bool phase_test; // = true;
bool use_half; // = false;
};

template<typename Dtype>

@@ -428,6 +437,7 @@ class OCL4DNNInnerProduct
bool transpose_;
bool image_copied_;
bool phase_test_;
bool use_half_;
};

typedef enum {

@@ -441,7 +451,7 @@ struct OCL4DNNLRNConfig
lrn_type(LRNParameter_NormRegion_ACROSS_CHANNELS),
phase_test(true),
local_size(0), alpha(0.f), beta(0.f), k(0.f), norm_by_size(false),
batch_size(0), channels(0), height(0), width(0)
batch_size(0), channels(0), height(0), width(0), use_half(false)
{}
MatShape in_shape;
LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type;

@@ -455,6 +465,7 @@ struct OCL4DNNLRNConfig
int32_t channels;
int32_t height;
int32_t width;
bool use_half;
};

template<typename Dtype>

@@ -477,16 +488,18 @@ class OCL4DNNLRN
int32_t height_;
int32_t width_;
bool norm_by_size_;
bool use_half_;
};

struct OCL4DNNSoftmaxConfig
{
OCL4DNNSoftmaxConfig() : axis(0), channels(0), logsoftmax(false)
OCL4DNNSoftmaxConfig() : axis(0), channels(0), logsoftmax(false), use_half(false)
{}
MatShape in_shape;
int axis;
int channels;
bool logsoftmax;
bool use_half;
};

template<typename Dtype>

@@ -506,6 +519,7 @@ class OCL4DNNSoftmax
bool use_slm_;
bool log_softmax_;
UMat scale_data_;
bool use_half_;
};

}}} // namespace cv::dnn::ocl4dnn

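Every ocl4dnn config struct gains a use_half flag that defaults to false, so existing float callers compile and behave unchanged; a layer opts in explicitly when it builds its op. An illustrative opt-in, restricted to fields visible in the hunks above (shape values invented):

OCL4DNNConvConfig config;
config.in_shape  = cv::dnn::shape(1, 32, 56, 56);   // NCHW input
config.out_shape = cv::dnn::shape(1, 64, 56, 56);
config.group     = 1;
config.bias_term = true;
config.use_half  = true;   // request FP16 kernels; the default stays false
cv::Ptr<OCL4DNNConvSpatial<float> > op(new OCL4DNNConvSpatial<float>(config));
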
@@ -48,6 +48,12 @@

namespace cv { namespace dnn { namespace ocl4dnn {

enum gemm_data_type_t
{
TYPE_FLOAT = 1,
TYPE_HALF = 2
};

// Create and copy buffer to image for GEMM's matrix A and B.
// Will return image to caller if the input image is NULL. Otherwise,
// will use the image directly. It's caller's responsibility to

@@ -60,6 +66,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
int width, int ld)
{
ocl::Image2D image;
String opts = format("-DTYPE=%d", TYPE_FLOAT);

if (!is_matrix_a && transpose)
{

@@ -73,7 +80,8 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
UMat mat(height, width, CV_32FC1);
image = ocl::Image2D(mat);

ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_transpose_float", ocl::dnn::gemm_image_oclsrc);
ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_transpose_float",
ocl::dnn::gemm_image_oclsrc, opts);

size_t global_copy[2];
global_copy[0] = width;

@@ -96,7 +104,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
image = ocl::Image2D(mat);

ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_no_transpose_float",
ocl::dnn::gemm_image_oclsrc);
ocl::dnn::gemm_image_oclsrc, opts);

size_t global_copy[2];
global_copy[0] = padded_width;

@@ -129,7 +137,7 @@ enum gemm_type_t
GEMM_TYPE_FAST_IMAGE_32_1,
GEMM_TYPE_FAST_IMAGE_32_2,
GEMM_TYPE_FAST_IMAGE_B_IMAGE,
GEMM_TYPE_MAX
GEMM_TYPE_FAST_BUFFER
};

template<typename Dtype>

@@ -145,6 +153,8 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
CHECK_EQ(gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_32_2 ||
gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE, true) << "Invalid fast image gemm type." << std::endl;

bool halfPrecisionMode = (A.depth() == CV_16S);

if (is_image_a)
{
CHECK_EQ(offA, 0) << "Invalid input image offset." << std::endl;

@@ -157,6 +167,7 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
return false;
}

String opts = format("-DTYPE=%d", halfPrecisionMode ? TYPE_HALF : TYPE_FLOAT);
int widthA = (TransA == CblasNoTrans) ? K : M;
int heightA = (TransA == CblasNoTrans) ? M : K;
int widthB = (TransB == CblasNoTrans) ? N : K;

@@ -178,7 +189,7 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
int blockC_width = blocksize;
int blockC_height = blocksize;

int use_buffer_indicator = 8;
int use_buffer_indicator = (halfPrecisionMode) ? 16 : 8;
// To fix the edge problem caused by the sub group block read.
// we have to pad the image if it's not multiple of tile.
// just padding one line is enough as the sub group block read

@@ -221,9 +232,13 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
else
kernel_name += "1";

kernel_name += "_float";
if (halfPrecisionMode) {
kernel_name += "_half";
} else {
kernel_name += "_float";
}

ocl::Kernel oclk_gemm_float(kernel_name.c_str(), ocl::dnn::gemm_image_oclsrc);
ocl::Kernel oclk_gemm_float(kernel_name.c_str(), ocl::dnn::gemm_image_oclsrc, opts);
if (oclk_gemm_float.empty())
return false;

@@ -255,6 +270,10 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
bool padding_A = false;
bool padding_B = false;

if (halfPrecisionMode && is_image_b) {
padding_A = true;
}

if (!is_image_a && !is_image_b)
{
if (M * K < N * K)

@@ -265,17 +284,19 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,

if (!is_image_a)
{
ImA = ocl4dnnGEMMCopyBufferToImage<Dtype>(A, blockA_offset,
true, TransA != CblasNoTrans,
padding_A, imageA_h, imageA_w,
blockA_height, blockA_width, ldA);
if (!halfPrecisionMode)
ImA = ocl4dnnGEMMCopyBufferToImage<Dtype>(A, blockA_offset,
true, TransA != CblasNoTrans,
padding_A, imageA_h, imageA_w,
blockA_height, blockA_width, ldA);
}
if (!is_image_b)
{
ImB = ocl4dnnGEMMCopyBufferToImage<Dtype>(B, blockB_offset,
false, false,
padding_B, imageB_h, imageB_w,
blockB_height, blockB_width, ldB);
if (!halfPrecisionMode)
ImB = ocl4dnnGEMMCopyBufferToImage<Dtype>(B, blockB_offset,
false, false,
padding_B, imageB_h, imageB_w,
blockB_height, blockB_width, ldB);
}
} else {
// We will use normal read_imagef to read image B when B has transpose.

@@ -283,32 +304,48 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
if (!is_image_a)
{
bool padding;
padding = !is_image_b;
ImA = ocl4dnnGEMMCopyBufferToImage<Dtype>(A, blockA_offset,
true, TransA != CblasNoTrans,
padding, imageA_h, imageA_w,
blockA_height, blockA_width, ldA);
padding = !is_image_b || halfPrecisionMode;
if (!halfPrecisionMode)
ImA = ocl4dnnGEMMCopyBufferToImage<Dtype>(A, blockA_offset,
true, TransA != CblasNoTrans,
padding, imageA_h, imageA_w,
blockA_height, blockA_width, ldA);
}

if (!is_image_b && (K % use_buffer_indicator != 0))
{
ImB = ocl4dnnGEMMCopyBufferToImage<Dtype>(B, blockB_offset,
false, true, false, imageB_h, imageB_w,
blockB_height, blockB_width, ldB);
if (!halfPrecisionMode)
ImB = ocl4dnnGEMMCopyBufferToImage<Dtype>(B, blockB_offset,
false, true, false,
imageB_h, imageB_w,
blockB_height, blockB_width, ldB);
}
}

size_t global[2];
if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE)
{
global[0] = (size_t)( blockC_width + 7 ) & ~7;
if (halfPrecisionMode) {
global[0] = (size_t)( blockC_width + 15 ) & ~15;
} else {
global[0] = (size_t)( blockC_width + 7 ) & ~7;
}
} else {
global[0] = (size_t)( (blockC_width / 2 ) + 7 ) ^ ~7;
if (halfPrecisionMode) {
global[0] = (size_t)( (blockC_width / 2 ) + 15 ) ^ ~15;
} else {
global[0] = (size_t)( (blockC_width / 2 ) + 7 ) ^ ~7;
}
}
global[1] = (size_t)(blockC_height + 31) / 32;

size_t local[2];
local[0] = 8;
if (halfPrecisionMode)
{
local[0] = 16;
} else {
local[0] = 8;
}
local[1] = 1;

cl_uint arg_idx = 0;

@@ -385,6 +422,101 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
return true;
}

template<typename Dtype>
static bool ocl4dnnFastBufferGEMM(const CBLAS_TRANSPOSE TransA,
const CBLAS_TRANSPOSE TransB, const int32_t M,
const int32_t N, const int32_t K, const Dtype alpha,
const UMat A, const int32_t offA, const UMat B,
const int32_t offB, const Dtype beta, UMat C,
const int32_t offC, enum gemm_type_t gemm_type)
{
CHECK_EQ(gemm_type == GEMM_TYPE_FAST_BUFFER, true)
<< "Invalid fast buffer gemm type." << std::endl;

bool halfPrecisionMode = (A.depth() == CV_16S);

size_t sub_group_size = 8;
bool is_small_batch = (M == 2 || M == 4 || M == 8);
String kernel_name("gemm_buffer_");
if (TransA == CblasNoTrans && TransB == CblasNoTrans) {
kernel_name += "NN";
if (halfPrecisionMode) {
sub_group_size = 16;
}
} else if (TransA == CblasNoTrans && TransB != CblasNoTrans) {
if (M == 2)
kernel_name +="NT_M_2";
else if (M == 4)
kernel_name +="NT_M_4";
else if (M == 8)
kernel_name +="NT_M_8";
else
kernel_name += "NT";
}

if (halfPrecisionMode) {
kernel_name += "_half";
} else {
kernel_name += "_float";
}

String opts = format("-DTYPE=%d", halfPrecisionMode ? TYPE_HALF : TYPE_FLOAT);
ocl::Kernel oclk_gemm_float(kernel_name.c_str(), ocl::dnn::gemm_buffer_oclsrc, opts);
size_t local[2] = {};
size_t global[2] = {};
if (TransA == CblasNoTrans && TransB != CblasNoTrans && is_small_batch) {
if (M == 8)
local[0] = 16;
else if (M == 4)
local[0] = 32;
else
local[0] = 64;
local[1] = 1;

if (M == 8)
global[0] = N * local[0];
else
global[0] = (N + 3) / 4 * local[0];
global[1] = 1;
} else {
size_t lx = sub_group_size;
size_t ly = (TransB != CblasNoTrans && TransA == CblasNoTrans && halfPrecisionMode) ? 2 : 4;
int dx = (TransB != CblasNoTrans && TransA == CblasNoTrans) ? 1 : 4;
int dy = 8;
size_t gx = (size_t)(N + dx - 1) / dx;
size_t gy = (size_t)(M + dy - 1) / dy;
global[0] = (gx + lx - 1) / lx * lx;
global[1] = (gy + ly - 1) / ly * ly;
local[0] = lx;
local[1] = ly;
}

int arg_idx = 0;
oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrReadOnly(A));
oclk_gemm_float.set(arg_idx++, offA);
oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrReadOnly(B));
oclk_gemm_float.set(arg_idx++, offB);
oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrWriteOnly(C));
oclk_gemm_float.set(arg_idx++, offC);
oclk_gemm_float.set(arg_idx++, M);
oclk_gemm_float.set(arg_idx++, N);
oclk_gemm_float.set(arg_idx++, K);
oclk_gemm_float.set(arg_idx++, (float)alpha);
oclk_gemm_float.set(arg_idx++, (float)beta);

bool ret;
if (TransB == CblasNoTrans || TransA != CblasNoTrans) {
int stride = 256;
for (int start_index = 0; start_index < K; start_index += stride) {
oclk_gemm_float.set(arg_idx, start_index);
ret = oclk_gemm_float.run(2, global, local, false);
}
} else {
ret = oclk_gemm_float.run(2, global, local, false);
}
return ret;
}

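Two details of the buffer GEMM worth noting: global sizes are rounded up to whole subgroups (16-wide for half, 8-wide for float), and the NN path walks K in fixed 256-column chunks, re-launching the kernel with a new start_index so each pass accumulates into C. The rounding idiom, restated standalone:

// Round a thread-grid dimension up to a whole number of workgroups.
static inline size_t roundUpTo(size_t value, size_t multiple)
{
    return (value + multiple - 1) / multiple * multiple;
}
// e.g. N = 1000, dx = 4 -> gx = 250; with lx = 16 (half) global[0] becomes 256
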
template<typename Dtype>
bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB,
const int32_t M, const int32_t N, const int32_t K,

@@ -392,7 +524,8 @@ bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB,
const UMat B_image, UMat C,
const size_t max_image_size)
{
gemm_type_t gemm_type = GEMM_TYPE_FAST_IMAGE_32_1;
bool halfPrecisionMode = (A.depth() == CV_16S);
gemm_type_t gemm_type = halfPrecisionMode ? GEMM_TYPE_FAST_BUFFER : GEMM_TYPE_FAST_IMAGE_32_1;

if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 ||
gemm_type == GEMM_TYPE_FAST_IMAGE_32_2)

@@ -409,6 +542,11 @@ bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB,
GEMM_TYPE_FAST_IMAGE_B_IMAGE,
max_image_size);
}
else if (gemm_type == GEMM_TYPE_FAST_BUFFER)
{
return ocl4dnnFastBufferGEMM<Dtype>(CblasNoTrans, TransB, M, N, K,
1.f, A, 0, B, 0, 0.f, C, 0, gemm_type);
}
return false;
}

@@ -436,10 +574,17 @@ bool ocl4dnnGEMV<float>(const CBLAS_TRANSPOSE TransA,
const int32_t offy)
{
bool ret = false;
bool use_half = (A.depth() == CV_16S);
String opts;
if (use_half)
opts = format("-DDtype=%s -DDtype4=%s -Dconvert_Dtype=convert_%s", "half", "half4", "half");
else
opts = format("-DDtype=%s -DDtype4=%s -Dconvert_Dtype=convert_%s", "float", "float4", "float");

if (TransA == CblasNoTrans)
{
ocl::Kernel k(CL_KERNEL_SELECT("matvec_mul4"), cv::ocl::dnn::matvec_mul_oclsrc);
String kname = format("matvec_mul4_%s", use_half ? "half" : "float");
ocl::Kernel k(kname.c_str(), cv::ocl::dnn::matvec_mul_oclsrc, opts);
if (k.empty())
return false;

@@ -469,7 +614,8 @@ bool ocl4dnnGEMV<float>(const CBLAS_TRANSPOSE TransA,

if ((row_size % 4) != 0 && ret)
{
ocl::Kernel k_1(CL_KERNEL_SELECT("matvec_mul1"), cv::ocl::dnn::matvec_mul_oclsrc);
String kname = format("matvec_mul1_%s", use_half ? "half" : "float");
ocl::Kernel k_1(kname.c_str(), cv::ocl::dnn::matvec_mul_oclsrc, opts);
size_t localsize[] = { 128 };
size_t globalsize[] = { row_size % 4 * localsize[0] };
uint row_offset = row_size - (row_size % 4);

@@ -499,7 +645,15 @@ bool ocl4dnnAXPY(const int32_t N, const Dtype alpha,
const UMat X, const int32_t offX, UMat Y,
const int32_t offY)
{
ocl::Kernel oclk_axpy(CL_KERNEL_SELECT("axpy"), cv::ocl::dnn::math_oclsrc);
bool use_half = (X.depth() == CV_16S);
String opts;
if (use_half)
opts = "-DDtype=half -DDtype4=half4 -Dconvert_Dtype=convert_half";
else
opts = "-DDtype=float -DDtype4=float4 -Dconvert_Dtype=convert_float";

String kname = format("axpy_%s", use_half ? "half" : "float");
ocl::Kernel oclk_axpy(kname.c_str(), cv::ocl::dnn::math_oclsrc, opts);
if (oclk_axpy.empty())
return false;

@@ -54,6 +54,7 @@
#include "opencl_kernels_dnn.hpp"
#include "../include/math_functions.hpp"
#include "../include/default_kernel_config.hpp"
#include "opencv2/dnn/shape_utils.hpp"

#if defined WIN32 || defined _WIN32
#include <windows.h>

@@ -85,6 +86,7 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
max_value_ = 0;
prev_kernel_type_ = -1;
tuned_ = false;
use_half_ = config.use_half;

// assumption: spatial dimension is 2.
kernel_h_ = config.kernel.height;

@@ -204,18 +206,40 @@ void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bo
return;
}

typedef enum {
TYPE_FLOAT = 1,
TYPE_HALF = 2
} ocl4dnnConvSpatialType_t;

template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
{
addDef("Dtype", "float");
addDef("Dtype2", "float2");
addDef("Dtype4", "float4");
addDef("Dtype8", "float8");
addDef("Dtype16", "float16");
addDef("as_Dtype", "as_float");
addDef("as_Dtype2", "as_float2");
addDef("as_Dtype4", "as_float4");
addDef("as_Dtype8", "as_float8");
if (use_half_)
{
addDef("TYPE", TYPE_HALF);
addDef("Dtype", "half");
addDef("Dtype2", "half2");
addDef("Dtype4", "half4");
addDef("Dtype8", "half8");
addDef("Dtype16", "half16");
addDef("as_Dtype", "as_half");
addDef("as_Dtype2", "as_half2");
addDef("as_Dtype4", "as_half4");
addDef("as_Dtype8", "as_half8");
}
else
{
addDef("TYPE", TYPE_FLOAT);
addDef("Dtype", "float");
addDef("Dtype2", "float2");
addDef("Dtype4", "float4");
addDef("Dtype8", "float8");
addDef("Dtype16", "float16");
addDef("as_Dtype", "as_float");
addDef("as_Dtype2", "as_float2");
addDef("as_Dtype4", "as_float4");
addDef("as_Dtype8", "as_float8");
}
}

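collectCommonInformation() now emits the type family as preprocessor defines, so a single .cl source serves both precisions: with use_half_ set, Dtype4 expands to half4 and as_Dtype4 to as_half4, otherwise the float forms are produced. A sketch of the substitution (hypothetical harness around the addDef calls above):

const char* base = use_half ? "half" : "float";
std::vector<std::pair<cv::String, cv::String> > defs;   // what addDef collects
defs.push_back(std::make_pair("Dtype", base));
defs.push_back(std::make_pair("Dtype4", cv::format("%s4", base)));
defs.push_back(std::make_pair("as_Dtype4", cv::format("as_%s4", base)));
// later joined into build options such as "-DDtype=half -DDtype4=half4 ..."
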
typedef enum {
|
||||
@ -477,10 +501,16 @@ bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
|
||||
fused_eltwise_ = false;
|
||||
}
|
||||
|
||||
prepareKernel(bottom, top, weight, bias, numImages);
|
||||
if (use_half_ && bias_half.empty() && !bias.empty())
|
||||
convertFp16((UMat&)bias, bias_half);
|
||||
|
||||
if (use_half_ && weights_half.empty())
|
||||
convertFp16((UMat&)weight, weights_half);
|
||||
|
||||
prepareKernel(bottom, top, weight, (use_half_) ? bias_half : bias, numImages);
|
||||
if (bestKernelConfig.empty())
|
||||
return false;
|
||||
return convolve(bottom, top, weight, bias, numImages, bestKernelConfig);
|
||||
return convolve(bottom, top, weight, (use_half_) ? bias_half : bias, numImages, bestKernelConfig);
|
||||
}
|
||||
|
||||
template<typename Dtype>
|
||||
@ -556,6 +586,12 @@ std::string OCL4DNNConvSpatial<Dtype>::generateSpecificKey(int32_t type, int32_t
|
||||
<< "_" << blockWidth
|
||||
<< "_" << blockHeight
|
||||
<< "_" << blockDepth;
|
||||
|
||||
if (!use_half_)
|
||||
keyBuilder << "_float";
|
||||
else
|
||||
keyBuilder << "_half";
|
||||
|
||||
return keyBuilder.str();
|
||||
}
|
||||
|
||||
@ -637,9 +673,13 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
|
||||
|
||||
if (swizzled_weights_umat.empty())
|
||||
swizzled_weights_umat.create(1, (int)alignSize(num_output_, 16) * channels_ *
|
||||
kernel_h_ * (int)alignSize(kernel_w_, 2), CV_32FC1);
|
||||
kernel_h_ * (int)alignSize(kernel_w_, 2),
|
||||
(use_half_) ? CV_16SC1 : CV_32FC1);
|
||||
|
||||
UMat swizzled_weights_tmp;
|
||||
if (use_half_)
|
||||
swizzled_weights_tmp.create(shape(swizzled_weights_umat), CV_32F);
|
||||
|
||||
ocl::Queue queue = ocl::Queue::getDefault();
|
||||
if (!interleave) {
|
||||
cl_uint argIdx = 0;
|
||||
int32_t channels = channels_ / group_;
|
||||
@ -650,7 +690,10 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
|
||||
return false;
|
||||
|
||||
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
|
||||
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat));
|
||||
if (use_half_)
|
||||
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_tmp));
|
||||
else
|
||||
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat));
|
||||
oclk_copy_weight.set(argIdx++, kernel_w_);
|
||||
oclk_copy_weight.set(argIdx++, kernel_h_);
|
||||
oclk_copy_weight.set(argIdx++, channels);
|
||||
@ -669,7 +712,11 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
|
||||
// assumption: kernel dimesion is 2
|
||||
Mat weightMat = weight.getMat(ACCESS_READ);
|
||||
Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>();
|
||||
Mat swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE);
|
||||
Mat swizzledWeightMat;
|
||||
if (use_half_)
|
||||
swizzledWeightMat = swizzled_weights_tmp.getMat(ACCESS_WRITE);
|
||||
else
|
||||
swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE);
|
||||
Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr<float>();
|
||||
|
||||
int interleavedRows = (kernel_w_ / 2) * 2;
|
||||
@ -694,6 +741,10 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
|
||||
rowAlignment);
|
||||
free(tmpSwizzledWeight);
|
||||
}
|
||||
|
||||
if (use_half_)
|
||||
convertFp16(swizzled_weights_tmp, swizzled_weights_umat);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -727,9 +778,10 @@ void OCL4DNNConvSpatial<float>::CreateSubBuffer(const UMat& buffer, UMat& sub_bu
|
||||
cl_mem sub_mem;
|
||||
cl_buffer_region region;
|
||||
cl_int err;
|
||||
size_t element_size = (use_half_) ? sizeof(short) : sizeof(float);
|
||||
|
||||
region.origin = offset * sizeof(float);
|
||||
region.size = size * sizeof(float);
|
||||
region.origin = offset * element_size;
|
||||
region.size = size * element_size;
|
||||
sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ),
|
||||
write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
|
||||
@ -739,8 +791,9 @@ void OCL4DNNConvSpatial<float>::CreateSubBuffer(const UMat& buffer, UMat& sub_bu
|
||||
return;
|
||||
}
|
||||
|
||||
int step = sizeof(float), rows = size, cols = 1;
|
||||
ocl::convertFromBuffer(sub_mem, step, rows, cols, CV_32FC1, sub_buffer);
|
||||
int step = element_size, rows = size, cols = 1;
|
||||
ocl::convertFromBuffer(sub_mem, step, rows, cols,
|
||||
(use_half_) ? CV_16SC1 : CV_32FC1, sub_buffer);
|
||||
|
||||
//decrease ocl mem refcount
|
||||
clReleaseMemObject(sub_mem);
@ -978,7 +1031,10 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
if (use_half_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
else
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
if (bias_term_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
@ -1018,7 +1074,10 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, image_offset);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
if (use_half_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
else
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
kernel.set(argIdx++, kernel_offset);
if (bias_term_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
@ -1132,14 +1191,27 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
return false;

int32_t sz[4] = {numImages, num_output_, output_h_, output_w_};
top.zeros(4, sz, CV_32FC1);
top.zeros(4, sz, (use_half_) ? CV_16SC1 : CV_32FC1);
bool saved_tuned = tuned_;
tuned_ = false;
convolve(bottom, top, weight, bias, numImages, config);
tuned_ = saved_tuned;

float *data = (float *)top.getMat(ACCESS_READ).ptr<float>();
float *verify_data = (float *)verifyTop.getMat(ACCESS_READ).ptr<float>();
UMat new_top, new_verify_top;
float *data, *verify_data;
if (use_half_)
{
convertFp16(top, new_top);
convertFp16(verifyTop, new_verify_top);

data = (float *)new_top.getMat(ACCESS_READ).ptr<float>();
verify_data = (float *)new_verify_top.getMat(ACCESS_READ).ptr<float>();
}
else
{
data = (float *)top.getMat(ACCESS_READ).ptr<float>();
verify_data = (float *)verifyTop.getMat(ACCESS_READ).ptr<float>();
}

for (int32_t n = 0; n < num_; ++n) {
for (int32_t g = 0; g < group_; ++g) {
@ -1148,9 +1220,19 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
for (int h = 0; h < output_h_ && !verificationFail; h++)
for (int w = 0; w < output_w_; w++) {
size_t offset = output_image_offset + out_ch * output_w_ * output_h_ + h * output_w_ + w;
if (fabs(data[offset] - verify_data[offset]) > 0.1 * fabs(verify_data[offset]) &&
!(fabs(verify_data[offset]) < 1.e-3 &&
fabs(data[offset] - verify_data[offset]) < 1.e-4))

float error_factor = fabs(data[offset] - verify_data[offset]);
if (use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) &&
error_factor > 0.04 && !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4))
{
dbgPrint(printf("test verification failed @ image %d group %d"
"out_ch %d h %d w %d got %G expected %G\n",
n, g, out_ch, h, w, data[offset], verify_data[offset]));
verificationFail = 1;
goto out;
}
else if (!use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) &&
!(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4))
{
dbgPrint(printf("test verification failed @ image %d group %d"
"out_ch %d h %d w %d got %G expected %G\n",
@ -1719,15 +1801,16 @@ void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
if (loadTunedConfig()) // check external storage
return;

UMat benchData(1, numImages * top_dim_, CV_32FC1);
UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1);

calculateBenchmark(bottom, benchData, (use_half_) ? weights_half : weight, bias, numImages);

if (force_auto_tuning_)
{
calculateBenchmark(bottom, benchData, weight, bias, numImages);
setupConvolution(bottom, top, weight, bias, numImages, benchData);
}
else
{
calculateBenchmark(bottom, benchData, weight, bias, numImages);
useFirstAvailable(bottom, top, weight, bias, numImages, benchData);
}
cacheTunedConfig();
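
The verifyResult change is easier to read as a predicate: a value passes if it is within 10% of the reference, if both are effectively zero, or, for fp16 only, if the absolute error stays under 0.04. A sketch of that rule (withinTolerance is an illustrative name, not from the patch):

    #include <cmath>

    static bool withinTolerance(float got, float expected, bool useHalf)
    {
        const float err = std::fabs(got - expected);
        if (std::fabs(expected) < 1e-3f && err < 1e-4f)
            return true;                         // both effectively zero
        if (err <= 0.1f * std::fabs(expected))
            return true;                         // 10% relative tolerance
        return useHalf && err <= 0.04f;          // extra absolute slack for fp16
    }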

@ -56,6 +56,7 @@ OCL4DNNInnerProduct<Dtype>::OCL4DNNInnerProduct(OCL4DNNInnerProductConfig config
K_ = config.K;
phase_test_ = config.phase_test;
image_copied_ = false;
use_half_ = config.use_half;
}

template<typename Dtype>
@ -89,13 +90,24 @@ bool OCL4DNNInnerProduct<Dtype>::Forward(const UMat& bottom,
if (M_ <= max_image_size &&
N_ <= max_image_size &&
K_ <= max_image_size &&
cv::traits::Depth<Dtype>::value == CV_32F &&
ocl::Device::getDefault().intelSubgroupsSupport())
{
ret = ocl4dnnGEMMCommon<Dtype>(transpose_ ? CblasNoTrans : CblasTrans,
M_, N_, K_, bottom, weight, UMat(), top,
max_image_size);
}

if (use_half_ && bias_term_)
{
UMat biasOneMat = UMat::ones(M_, 1, CV_32F);
UMat newbias, tmpTop;

convertFp16(bias, newbias);
convertFp16(top, tmpTop);
cv::gemm(biasOneMat, newbias, 1, tmpTop, 1, tmpTop, 0);
convertFp16(tmpTop, top);
}

return ret;
}
}
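
Since cv::gemm has no fp16 path, the bias here is applied in fp32: both buffers are widened with convertFp16, the 1xN bias row is broadcast over all M rows as a rank-1 GEMM update, and the result is narrowed back. A sketch of the broadcast step alone (variable names are illustrative):

    #include <opencv2/core.hpp>

    // topF32 is M x N, biasF32 is 1 x N; computes topF32 += ones(M,1) * biasF32.
    static void addBiasRows(cv::UMat& topF32, const cv::UMat& biasF32, int M)
    {
        cv::UMat ones = cv::UMat::ones(M, 1, CV_32F);
        cv::gemm(ones, biasF32, 1.0, topF32, 1.0, topF32, 0);
    }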

@ -61,6 +61,7 @@ OCL4DNNLRN<Dtype>::OCL4DNNLRN(OCL4DNNLRNConfig config)
channels_ = config.channels;
height_ = config.height;
width_ = config.width;
use_half_ = config.use_half;
}

template<typename Dtype>
@ -97,8 +98,10 @@ bool OCL4DNNLRN<Dtype>::crossChannelForward(const UMat& bottom, UMat& top)
int32_t n_threads = num_ * height_ * width_;
size_t global_work_size_[1] = {(size_t)n_threads};
String opts = clOptionSupport("-cl-no-subgroup-ifp") ? " -cl-no-subgroup-ifp " : "";
opts += format("-D Dtype=%s", (use_half_) ? "half" : "float");
ocl::Kernel oclk_lrn_fill;
if (!oclk_lrn_fill.create(CL_KERNEL_SELECT("lrn_full_no_scale"), ocl::dnn::ocl4dnn_lrn_oclsrc, opts))
String kname = format("lrn_full_no_scale_%s", (use_half_) ? "half" : "float");
if (!oclk_lrn_fill.create(kname.c_str(), ocl::dnn::ocl4dnn_lrn_oclsrc, opts))
return false;

oclk_lrn_fill.set(argIdx++, n_threads);
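
This is the recurring host-side pattern across these hunks: CL_KERNEL_SELECT goes away, the kernel name gains an explicit _float/_half suffix, and the element type is injected through a -D build option so one .cl source serves both precisions. A hedged sketch (function and source names are illustrative):

    #include <opencv2/core/ocl.hpp>

    static cv::ocl::Kernel makeTypedKernel(const cv::String& clSource, bool useHalf)
    {
        const char* t = useHalf ? "half" : "float";
        cv::String opts  = cv::format("-D Dtype=%s", t);          // type for the kernel body
        cv::String kname = cv::format("lrn_full_no_scale_%s", t); // TEMPLATE()-mangled name
        return cv::ocl::Kernel(kname.c_str(), cv::ocl::ProgramSource(clSource), opts);
    }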

@ -56,6 +56,7 @@ OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
channels_ = config.channels;
pool_method_ = config.pool_method;
avePoolPaddedArea = config.avePoolPaddedArea;
use_half = config.use_half;

for (int i = 0; i < spatial_dims; ++i)
{
@ -105,12 +106,15 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
case LIBDNN_POOLING_METHOD_MAX:
{
bool haveMask = !top_mask.empty();
String kname = haveMask ? "max_pool_forward_mask" : "max_pool_forward";
kname += (use_half) ? "_half" : "_float";
ocl::Kernel oclk_max_pool_forward(
haveMask ? CL_KERNEL_SELECT("max_pool_forward_mask") : CL_KERNEL_SELECT("max_pool_forward"),
kname.c_str(),
ocl::dnn::ocl4dnn_pooling_oclsrc,
format("-D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
format(" -D Dtype=%s -D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
" -D STRIDE_W=%d -D STRIDE_H=%d"
" -D PAD_W=%d -D PAD_H=%d%s",
(use_half) ? "half" : "float",
kernel_w_, kernel_h_,
stride_w_, stride_h_,
pad_w_, pad_h_,
@ -139,11 +143,14 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
{
CV_Assert(top_mask.empty());

ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"),
String kname = format("ave_pool_forward_%s", (use_half) ? "half" : "float");
ocl::Kernel oclk_ave_pool_forward(
kname.c_str(),
ocl::dnn::ocl4dnn_pooling_oclsrc,
format("-D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
format(" -D Dtype=%s -D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
" -D STRIDE_W=%d -D STRIDE_H=%d"
" -D PAD_W=%d -D PAD_H=%d%s",
(use_half) ? "half" : "float",
kernel_w_, kernel_h_,
stride_w_, stride_h_,
pad_w_, pad_h_,
@ -171,7 +178,9 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
{
CV_Assert(top_mask.empty());

ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"),
String kname = format("sto_pool_forward_test_%s", (use_half) ? "half" : "float");
ocl::Kernel oclk_sto_pool_forward(
kname.c_str(),
ocl::dnn::ocl4dnn_pooling_oclsrc,
format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
" -D STRIDE_W=%d -D STRIDE_H=%d",

@ -52,6 +52,7 @@ OCL4DNNSoftmax<Dtype>::OCL4DNNSoftmax(OCL4DNNSoftmaxConfig config)
softmax_axis_ = config.axis;
channels_ = config.channels;
log_softmax_ = config.logsoftmax;
use_half_ = config.use_half;

inner_num_ = 1;
outer_num_ = 1;
@ -91,10 +92,13 @@ bool OCL4DNNSoftmax<Dtype>::Forward(const UMat& bottom, UMat& top)

if (log_softmax_) opts += " -DLOG_SOFTMAX ";
if (use_slm_)
kname = CL_KERNEL_SELECT("softmax_forward_slm");
kname = "softmax_forward_slm";
else
kname = CL_KERNEL_SELECT("softmax_forward");
kname = "softmax_forward";

kname += format("%s", (use_half_) ? "_half" : "_float");
opts += format(" -D Dtype=%s -D DTYPE_MAX=%s", (use_half_) ? "half" : "float",
(use_half_) ? "HALF_MAX" : "FLT_MAX");
if (!oclk_softmax_forward_kernel.create(kname.c_str(), ocl::dnn::softmax_loss_oclsrc, opts))
return false;

@ -40,9 +40,17 @@
//
//M*/

#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define KERNEL_ARG_DTYPE float

#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

__kernel void ReLUForward(const int count, __global const T* in, __global T* out
#ifndef RELU_NO_SLOPE
, T negative_slope
, KERNEL_ARG_DTYPE negative_slope
#endif
) {
int index = get_global_id(0);
@ -55,18 +63,19 @@ __kernel void ReLUForward(const int count, __global const T* in, __global T* out
}

__kernel void ReLU6Forward(const int count, __global const T* in, __global T* out,
const T minValue, const T maxValue)
const KERNEL_ARG_DTYPE minValue, const KERNEL_ARG_DTYPE maxValue)
{
int index = get_global_id(0);
if(index < count)
{
T x = in[index];
out[index] = clamp(x, minValue, maxValue);
out[index] = clamp(x, convert_T(minValue), convert_T(maxValue));
}
}

__kernel void PReLUForward(const int count, const int channels, const int plane_size,
__global const T* in, __global T* out, __global const T* slope_data)
__global const T* in, __global T* out,
__global const KERNEL_ARG_DTYPE* slope_data)
{
int index = get_global_id(0);
int c = (index / plane_size) % channels;
@ -99,8 +108,22 @@ __kernel void AbsValForward(const int n, __global const T* in, __global T* out)
out[index] = fabs(in[index]);
}

__kernel void PowForward(const int n, __global const T* in, __global T* out, const T power, const T scale, const T shift) {
__kernel void PowForward(const int n, __global const T* in, __global T* out,
const KERNEL_ARG_DTYPE power,
const KERNEL_ARG_DTYPE scale,
const KERNEL_ARG_DTYPE shift)
{
int index = get_global_id(0);
if (index < n)
out[index] = pow(shift + scale * in[index], power);
}

__kernel void ELUForward(const int n, __global const T* in, __global T* out)
{
int index = get_global_id(0);
if (index < n)
{
T src = in[index];
out[index] = (src >= 0.f) ? src : exp(src) - 1;
}
}
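
Note what changed in these kernel signatures: only the tensor pointers stay typed T, while scalar parameters (slopes, clamp bounds, pow coefficients) become KERNEL_ARG_DTYPE, that is, always float, and are narrowed in-kernel with convert_T. This keeps a single host-side argument ABI for both precisions. A host-side sketch of setting such arguments (illustrative wrapper, not from the patch):

    #include <opencv2/core/ocl.hpp>

    // 'in' and 'out' may hold half data when built with -D Dtype=half;
    // the slope is still passed as a plain float (KERNEL_ARG_DTYPE).
    static void setReluArgs(cv::ocl::Kernel& k, int count,
                            const cv::UMat& in, cv::UMat& out, float negativeSlope)
    {
        int i = 0;
        i = k.set(i, count);
        i = k.set(i, cv::ocl::KernelArg::PtrReadOnly(in));
        i = k.set(i, cv::ocl::KernelArg::PtrWriteOnly(out));
        k.set(i, negativeSlope); // scalar stays fp32 regardless of tensor dtype
    }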

@ -40,24 +40,27 @@
//
//M*/

#define Dtype float
#define Dtype4 float4
#define Dtype8 float8
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#if NUM == 8
#define load(src, index) vload8(0, src + index)
#define store(vec, dst, index) vstore8(vec, 0, dst + index)
#define vec_type Dtype8
#define float_type float8
#define convert_f convert_float8
#define BATCH_NORM batch_norm8
#elif NUM == 4
#define load(src, index) vload4(0, src + index)
#define store(vec, dst, index) vstore4(vec, 0, dst + index)
#define vec_type Dtype4
#define float_type float4
#define convert_f convert_float4
#define BATCH_NORM batch_norm4
#elif NUM == 1
#define load(src, index) src[index]
#define store(vec, dst, index) dst[index] = vec
#define vec_type Dtype
#define float_type float
#define convert_f convert_float
#define BATCH_NORM batch_norm1
#endif

@ -65,8 +68,8 @@ __kernel void BATCH_NORM(__global const Dtype* src,
const int rows,
const int cols,
const int channels,
__global const Dtype* weight,
__global const Dtype* bias,
__global const float* weight,
__global const float* bias,
__global Dtype* dst)
{
int x = get_global_id(0);
@ -76,9 +79,9 @@ __kernel void BATCH_NORM(__global const Dtype* src,
if (x >= rows || y >= cols)
return;

Dtype w = weight[x % channels];
Dtype b = bias[x % channels];
vec_type src_vec = load(src, index);
vec_type dst_vec = src_vec * w + (vec_type)b;
store(dst_vec, dst, index);
float w = weight[x % channels];
float b = bias[x % channels];
float_type src_vec = convert_f(load(src, index));
float_type dst_vec = src_vec * w + (float_type)b;
store(convert_T(dst_vec), dst, index);
}
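
The batch-norm kernel now widens each loaded vector to float, applies scale and shift in fp32, and narrows only on the store, which keeps half-precision rounding out of the multiply-add. A scalar C++ analogue of the same idea, assuming cv::float16_t is available (it is in recent OpenCV; any fp16 wrapper type would do):

    #include <opencv2/core.hpp>

    static void batchNormHalf(const cv::float16_t* src, cv::float16_t* dst,
                              int n, float w, float b)
    {
        for (int i = 0; i < n; i++)
        {
            float x = (float)src[i];           // widen: convert_f(load(...))
            dst[i] = cv::float16_t(x * w + b); // narrow: store(convert_T(...))
        }
    }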

@ -39,22 +39,29 @@
//
//M*/

__kernel void concat(const int nthreads,
__global const Dtype* in_data,
const int num_concats,
const int concat_size,
const int top_concat_axis,
const int bottom_concat_axis,
const int offset_concat_axis,
__global Dtype* out_data) {
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

for (int index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int total_concat_size = concat_size * bottom_concat_axis;
const int concat_num = index / total_concat_size;
const int concat_index = index % total_concat_size;
const int top_index = concat_index
+ (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
out_data[top_index] = in_data[index];
}
#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)

__kernel void TEMPLATE(concat, Dtype)(const int nthreads,
__global const Dtype* in_data,
const int num_concats,
const int concat_size,
const int top_concat_axis,
const int bottom_concat_axis,
const int offset_concat_axis,
__global Dtype* out_data)
{
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
{
const int total_concat_size = concat_size * bottom_concat_axis;
const int concat_num = index / total_concat_size;
const int concat_index = index % total_concat_size;
const int top_index = concat_index +
(concat_num * top_concat_axis + offset_concat_axis) * concat_size;
out_data[top_index] = in_data[index];
}
}
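
The two-level CONCAT/TEMPLATE macro is what produces the _float/_half kernel names the host now asks for: the indirection forces Dtype to be macro-expanded before the tokens are pasted. The same token-pasting trick in plain C/C++:

    #define CONCAT(A,B) A##_##B
    #define TEMPLATE(name,type) CONCAT(name,type)

    #define Dtype float
    void TEMPLATE(concat, Dtype)(void); // expands to: void concat_float(void);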

@ -40,27 +40,29 @@
//
//M*/

#if APPLY_BIAS
#define BIAS_KERNEL_ARG __global Dtype * biases_base,
#else
#define BIAS_KERNEL_ARG
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#define KERNEL_ARG_DTYPE float
#define TYPE_FLOAT 1
#define TYPE_HALF 2

#if defined(FUSED_CONV_RELU)
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (Dtype)(negative_slope)))
#define FUSED_ARG Dtype negative_slope,
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope)))
#define FUSED_ARG KERNEL_ARG_DTYPE negative_slope,
#elif defined(FUSED_CONV_PRELU)
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (Dtype)(negative_slope[c])))
#define FUSED_ARG __global const Dtype *negative_slope,
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope[c])))
#define FUSED_ARG __global const KERNEL_ARG_DTYPE* negative_slope,
#elif defined(FUSED_CONV_POWER)
#define ACTIVATION_RELU_FUNCTION(x, c) pow(x, power)
#define FUSED_ARG Dtype power,
#define ACTIVATION_RELU_FUNCTION(x, c) pow(x, (Dtype)power)
#define FUSED_ARG KERNEL_ARG_DTYPE power,
#elif defined(FUSED_CONV_TANH)
#define ACTIVATION_RELU_FUNCTION(x, c) tanh(x)
#define FUSED_ARG
#elif defined(FUSED_CONV_RELU6)
#define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), min_value, max_value))
#define FUSED_ARG Dtype min_value, Dtype max_value,
#define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), (Dtype)min_value, (Dtype)max_value))
#define FUSED_ARG KERNEL_ARG_DTYPE min_value, KERNEL_ARG_DTYPE max_value,
#else
#define ACTIVATION_RELU_FUNCTION(x, c) (x)
#define FUSED_ARG
@ -74,6 +76,11 @@
#define ELTWISE_DATA_ARG
#endif

#if APPLY_BIAS
#define BIAS_KERNEL_ARG __global Dtype * biases_base,
#else
#define BIAS_KERNEL_ARG
#endif

#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)
@ -97,6 +104,16 @@
#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))

#if defined(convolve_simd) || defined(Conv_Interleaved)
#if TYPE == TYPE_HALF
#define INT_TYPE ushort
#define INT_TYPE2 ushort2
#define INT_TYPE4 ushort4
#define INT_TYPE8 ushort8
#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read_us2
#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read_us4
#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read_us8
#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read_us
#else
#define INT_TYPE uint
#define INT_TYPE2 uint2
#define INT_TYPE4 uint4
@ -106,6 +123,7 @@
#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read8
#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read
#endif
#endif

#ifdef KERNEL_BASIC

@ -418,6 +436,25 @@ typedef struct float15 { float s0; float s1; float s2; float s3; float s4; float
float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15;
typedef struct float0 { float s0; } float0; //never used but makes compiler happy.

typedef struct half1 { half s0; } half1;
typedef struct half5 { half s0; half s1; half s2; half s3; half s4; } half5;
typedef struct half6 { half s0; half s1; half s2; half s3; half s4; half s5; } half6;
typedef struct half7 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; } half7;
typedef struct half9 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; } half9;
typedef struct half10 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; } half10;
typedef struct half11 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; } half11;
typedef struct half12 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; half sb; } half12;
typedef struct half13 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; half sb; half sc; } half13;
typedef struct half14 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; } half14;
typedef struct half15 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; half se; } half15;
typedef struct half0 { half s0; } half0; //never used but makes compiler happy.

#define OUT_PITCH_X output_width
#define ROW_PITCH input_width

@ -40,9 +40,9 @@
//
//M*/

#define Dtype float
#define Dtype4 float4
#define Dtype8 float8
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

__kernel void op_sum4(__global const Dtype * A,
__global const Dtype * B,
@ -73,20 +73,20 @@ __kernel void op_sum4(__global const Dtype * A,
a2 = vload4(i, src0_read + 2 * A_col_size);
a3 = vload4(i, src0_read + 3 * A_col_size);

dot0 = a0 * coeff1 + b0 * coeff2;
dot1 = a1 * coeff1 + b1 * coeff2;
dot2 = a2 * coeff1 + b2 * coeff2;
dot3 = a3 * coeff1 + b3 * coeff2;
dot0 = a0 * (Dtype4)coeff1 + b0 * (Dtype4)coeff2;
dot1 = a1 * (Dtype4)coeff1 + b1 * (Dtype4)coeff2;
dot2 = a2 * (Dtype4)coeff1 + b2 * (Dtype4)coeff2;
dot3 = a3 * (Dtype4)coeff1 + b3 * (Dtype4)coeff2;
#else
a0 = vload4(i, dst0_read);
a1 = vload4(i, dst0_read + A_col_size);
a2 = vload4(i, dst0_read + 2 * A_col_size);
a3 = vload4(i, dst0_read + 3 * A_col_size);

dot0 = a0 + b0 * coeff2;
dot1 = a1 + b1 * coeff2;
dot2 = a2 + b2 * coeff2;
dot3 = a3 + b3 * coeff2;
dot0 = a0 + b0 * (Dtype4)coeff2;
dot1 = a1 + b1 * (Dtype4)coeff2;
dot2 = a2 + b2 * (Dtype4)coeff2;
dot3 = a3 + b3 * (Dtype4)coeff2;
#endif
vstore4(dot0, i, dst0_read);
vstore4(dot1, i, dst0_read + A_col_size);

1342
modules/dnn/src/opencl/gemm_buffer.cl
Normal file
@ -39,24 +39,42 @@
//
//M*/

#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)

// Types used for parameters, offset computations and so on
#define int_tp int
#define uint_tp unsigned int
#define KERNEL_ARG_DTYPE float
#define TYPE_FLOAT 1
#define TYPE_HALF 2

#if TYPE == TYPE_HALF
#define Dtype half
#define Dtype2 half2
#define Dtype4 half4
#define Dtype8 half8
#define Dtype16 half16

#define as_Dtype as_half
#define as_Dtype2 as_half2
#define as_Dtype4 as_half4
#define as_Dtype8 as_half8
#define as_Dtype16 as_half16
#else
#define Dtype float
#define Dtype2 float2
#define Dtype4 float4
#define Dtype8 float8
#define Dtype16 float16

#define as_Dtype as_float
#define as_Dtype2 as_float2
#define as_Dtype4 as_float4
#define as_Dtype8 as_float8

#define KERNEL_ARG_DTYPE float
#define as_Dtype16 as_float16
#endif

#if defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
@ -67,6 +85,15 @@

// common block to calculate (alpha * AxB + beta * C) and output to destination image.

#if TYPE == TYPE_HALF
#define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read_us8( __image, __coord )
#define SHUFFLE_TYPE2(val) as_ushort2(val)
#define SHUFFLE_TYPE8(val) as_ushort8(val)
#define READ_IMAGE(__image, __coord) read_imageh(__image, sampler, __coord)
#define SIZE_OF_ELEMENT sizeof(ushort)
#define SIMD_SIZE_GEMM 16
#define TILE_N 16
#else
#define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read8( __image, __coord )
#define SHUFFLE_TYPE2(val) val
#define SHUFFLE_TYPE8(val) val
@ -74,11 +101,17 @@
#define SIZE_OF_ELEMENT sizeof(uint)
#define SIMD_SIZE_GEMM 8
#define TILE_N 8
#endif

//#define USE_IMAGE_C
#ifdef USE_IMAGE_C
#if TYPE == TYPE_HALF
#define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read_us8( _C, _coordC ) )
#define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write_us8( _C, _coordC, as_ushort8( _val ) )
#else
#define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read8( _C, _coordC ) )
#define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write8( _C, _coordC, as_uint8( _val ) )
#endif
#define MATC_PARAMETER __read_only image2d_t C, __write_only image2d_t dst
#define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, C, dst, sizeof(uint))
#else
@ -139,10 +172,10 @@
blockC03 += blockAxB03; \
} \
} else { \
blockC00 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
blockC01 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
blockC02 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
blockC03 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); \
blockC00 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
blockC01 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
blockC02 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
blockC03 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); \
if (!ALPHA1) { \
blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \
blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \
@ -172,6 +205,43 @@
intel_sub_group_shuffle( _block.s7, _col ) );

// A's column block multiply B's row block.
#if TYPE == TYPE_HALF
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB00, _blockB01 ) \
{ \
const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \
const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \
const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \
const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \
const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \
const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \
const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \
const Dtype8 acol8 = TRANSPOSE_BLOCK_8( _blockA, 8 ); \
const Dtype8 acol9 = TRANSPOSE_BLOCK_8( _blockA, 9 ); \
const Dtype8 acola = TRANSPOSE_BLOCK_8( _blockA, 10 ); \
const Dtype8 acolb = TRANSPOSE_BLOCK_8( _blockA, 11 ); \
const Dtype8 acolc = TRANSPOSE_BLOCK_8( _blockA, 12 ); \
const Dtype8 acold = TRANSPOSE_BLOCK_8( _blockA, 13 ); \
const Dtype8 acole = TRANSPOSE_BLOCK_8( _blockA, 14 ); \
const Dtype8 acolf = TRANSPOSE_BLOCK_8( _blockA, 15 ); \
_result = mad( (Dtype8)(_blockB00.s0), acol0, _result ); \
_result = mad( (Dtype8)(_blockB00.s1), acol1, _result ); \
_result = mad( (Dtype8)(_blockB00.s2), acol2, _result ); \
_result = mad( (Dtype8)(_blockB00.s3), acol3, _result ); \
_result = mad( (Dtype8)(_blockB00.s4), acol4, _result ); \
_result = mad( (Dtype8)(_blockB00.s5), acol5, _result ); \
_result = mad( (Dtype8)(_blockB00.s6), acol6, _result ); \
_result = mad( (Dtype8)(_blockB00.s7), acol7, _result ); \
_result = mad( (Dtype8)(_blockB01.s0), acol8, _result ); \
_result = mad( (Dtype8)(_blockB01.s1), acol9, _result ); \
_result = mad( (Dtype8)(_blockB01.s2), acola, _result ); \
_result = mad( (Dtype8)(_blockB01.s3), acolb, _result ); \
_result = mad( (Dtype8)(_blockB01.s4), acolc, _result ); \
_result = mad( (Dtype8)(_blockB01.s5), acold, _result ); \
_result = mad( (Dtype8)(_blockB01.s6), acole, _result ); \
_result = mad( (Dtype8)(_blockB01.s7), acolf, _result ); \
}
#else
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \
{ \
const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
@ -191,7 +261,50 @@
_result = mad( (Dtype8)(_blockB.s6), acol6, _result ); \
_result = mad( (Dtype8)(_blockB.s7), acol7, _result ); \
}
#endif

#if TYPE == TYPE_HALF
#define GEMM_NN(ALPHA1, BETA_NOT0) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
__kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
__read_only image2d_t A, \
__read_only image2d_t B, \
MATC_PARAMETER, \
KERNEL_ARG_DTYPE alpha_in, \
KERNEL_ARG_DTYPE beta_in, \
int width0, \
int isFirstColBlock) \
{ \
const Dtype alpha = (Dtype)alpha_in; \
const Dtype beta = (Dtype)beta_in; \
const int group_x = get_group_id(0); \
const int group_y = get_group_id(1); \
Dtype8 blockAxB00 = 0; \
Dtype8 blockAxB01 = 0; \
Dtype8 blockAxB02 = 0; \
Dtype8 blockAxB03 = 0; \
int2 coordA = (int2)( 0, group_y * TILE_M ); \
int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 ); \
do \
{ \
int2 coordBTemp = coordB; \
Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K; \
Dtype8 blockB01 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K; \
int2 coordATemp = coordA; \
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT * 2; \
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, blockB01 ); \
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00, blockB01 ); \
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00, blockB01 ); \
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00, blockB01 ); \
} \
while( coordB.y < width0 ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#else
#define GEMM_NN(ALPHA1, BETA_NOT0) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
@ -231,6 +344,7 @@ __kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
while( coordB.y < width0 ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#endif

GEMM_NN(1, 0) // ALPHA == 1, BETA == 0
GEMM_NN(1, 1) // ALPHA == 1, BETA != 0
@ -264,6 +378,45 @@ GEMM_NN(0, 1) // ALPHA != 1, BETA != 0
_result = mad( (Dtype8)(_blockB.s7), TRANSPOSE_BLOCK_8(_blockA.s7, _col), _result ); \
}

#if TYPE == TYPE_HALF
#define GEMM_TN(ALPHA1, BETA_NOT0) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
__kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
__read_only image2d_t A, \
__read_only image2d_t B, \
MATC_PARAMETER, \
KERNEL_ARG_DTYPE alpha_in, \
KERNEL_ARG_DTYPE beta_in, \
int width0, \
int isFirstColBlock) \
{ \
const Dtype alpha = (Dtype)alpha_in; \
const Dtype beta = (Dtype)beta_in; \
const int group_x = get_group_id(0);\
const int group_y = get_group_id(1);\
Dtype8 blockAxB00 = 0;\
Dtype8 blockAxB01 = 0;\
Dtype8 blockAxB02 = 0;\
Dtype8 blockAxB03 = 0;\
int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 );\
int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 );\
do\
{\
int2 coordBTemp = coordB;\
Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K;\
int2 coordATemp = coordA;\
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 16 * SIZE_OF_ELEMENT;\
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K;\
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0); \
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA00, blockB00, 8); \
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA01, blockB00, 0); \
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA01, blockB00, 8); \
} \
while( coordB.y < width0 ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#else
#define GEMM_TN(ALPHA1, BETA_NOT0) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
@ -303,6 +456,7 @@ __kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
while( coordB.y < width0 ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#endif

GEMM_TN(1, 0) // ALPHA == 1, BETA == 0
GEMM_TN(1, 1) // ALPHA == 1, BETA != 0
@ -324,6 +478,43 @@ GEMM_TN(0, 1) // ALPHA != 1, BETA != 0
intel_sub_group_shuffle( _block.s6, _col), \
intel_sub_group_shuffle( _block.s7, _col) )

#if TYPE == TYPE_HALF
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \
{ \
const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \
const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \
const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \
const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \
const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \
const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \
const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \
const Dtype8 acol8 = TRANSPOSE_BLOCK_8( _blockA, 8 ); \
const Dtype8 acol9 = TRANSPOSE_BLOCK_8( _blockA, 9 ); \
const Dtype8 acola = TRANSPOSE_BLOCK_8( _blockA, 10 ); \
const Dtype8 acolb = TRANSPOSE_BLOCK_8( _blockA, 11 ); \
const Dtype8 acolc = TRANSPOSE_BLOCK_8( _blockA, 12 ); \
const Dtype8 acold = TRANSPOSE_BLOCK_8( _blockA, 13 ); \
const Dtype8 acole = TRANSPOSE_BLOCK_8( _blockA, 14 ); \
const Dtype8 acolf = TRANSPOSE_BLOCK_8( _blockA, 15 ); \
_result = mad( (Dtype8)_blockB.s0, acol0, _result ); \
_result = mad( (Dtype8)_blockB.s1, acol1, _result ); \
_result = mad( (Dtype8)_blockB.s2, acol2, _result ); \
_result = mad( (Dtype8)_blockB.s3, acol3, _result ); \
_result = mad( (Dtype8)_blockB.s4, acol4, _result ); \
_result = mad( (Dtype8)_blockB.s5, acol5, _result ); \
_result = mad( (Dtype8)_blockB.s6, acol6, _result ); \
_result = mad( (Dtype8)_blockB.s7, acol7, _result ); \
_result = mad( (Dtype8)_blockB.s8, acol8, _result ); \
_result = mad( (Dtype8)_blockB.s9, acol9, _result ); \
_result = mad( (Dtype8)_blockB.sa, acola, _result ); \
_result = mad( (Dtype8)_blockB.sb, acolb, _result ); \
_result = mad( (Dtype8)_blockB.sc, acolc, _result ); \
_result = mad( (Dtype8)_blockB.sd, acold, _result ); \
_result = mad( (Dtype8)_blockB.se, acole, _result ); \
_result = mad( (Dtype8)_blockB.sf, acolf, _result ); \
}
#else
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \
{ \
const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
@ -343,7 +534,51 @@ GEMM_TN(0, 1) // ALPHA != 1, BETA != 0
_result = mad( (Dtype8)_blockB.s6, acol6, _result ); \
_result = mad( (Dtype8)_blockB.s7, acol7, _result ); \
}
#endif

#if TYPE == TYPE_HALF
#define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
__kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
__read_only image2d_t A, \
MATB_PARAMETER, \
MATC_PARAMETER, \
KERNEL_ARG_DTYPE alpha_in, \
KERNEL_ARG_DTYPE beta_in, \
int padded_k, \
int k, \
int isFirstColBlock) \
{ \
const Dtype alpha = (Dtype)alpha_in; \
const Dtype beta = (Dtype)beta_in; \
const int group_x = get_group_id(0); \
const int group_y = get_group_id(1); \
Dtype8 blockAxB00 = 0; \
Dtype8 blockAxB01 = 0; \
Dtype8 blockAxB02 = 0; \
Dtype8 blockAxB03 = 0; \
int2 coordA = (int2)( 0, group_y * TILE_M ); \
int2 coordB = (int2)( 0, ( group_x * TILE_N )); \
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \
do \
{ \
Dtype16 blockB00; \
BLOCKB_READ8(blockB00, B, coordB); \
int2 coordATemp = coordA; \
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT * 2; \
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \
} \
while( coordB.x < padded_k / VECSIZE ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#else
#define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
@ -385,12 +620,23 @@ __kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0,Dt
while( coordB.x < padded_k / VECSIZE ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#endif

#if TYPE == TYPE_HALF
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
_blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s89ab = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.scdef = READ_IMAGE(_B, _coordBTemp); _coordB.x += 4;
#else
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
_blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2;
#endif

#define MATB_PARAMETER __read_only image2d_t B

@ -401,12 +647,21 @@ GEMM_NT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0
#undef BLOCKB_READ8
#undef MATB_PARAMETER

#if TYPE == TYPE_HALF
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
const __global float *B_read = (__global float *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \
_blockb = as_Dtype16(as_ushort16(vload8(0, B_read))); \
_coordB.x += TILE_K * 2;
#else
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \
_blockb = vload8(0, B_read); \
_coordB.x += TILE_K;
#endif

#define MATB_PARAMETER __global Dtype *B, int offB, int ldb

@ -417,6 +672,45 @@ GEMM_NT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0
#undef BLOCKB_READ8
#undef MATB_PARAMETER

#if TYPE == TYPE_HALF
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
Dtype4 temp; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s0 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s1 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s2 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s3 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s4 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s5 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s6 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s7 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s8 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s9 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.sa = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.sb = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.sc = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.sd = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.se = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.sf = temp.s0; \
_coordB.x += 16;
#else
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
@ -438,6 +732,7 @@ GEMM_NT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s7 = temp.s0; \
_coordB.x += 8;
#endif

#define MATB_PARAMETER __read_only image2d_t B

@ -483,6 +778,47 @@ GEMM_NT(0, 1, SCALAR, 1) // ALPHA != 1, BETA != 0
_result = mad( (Dtype8)_blockB.s7, acol7, _result ); \
}

#if TYPE == TYPE_HALF
#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
__kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
__read_only image2d_t A, \
MATB_PARAMETER, \
MATC_PARAMETER, \
KERNEL_ARG_DTYPE alpha_in, \
KERNEL_ARG_DTYPE beta_in, \
int padded_k, \
int k, \
int isFirstColBlock) \
{ \
const Dtype alpha = (Dtype)alpha_in; \
const Dtype beta = (Dtype)beta_in; \
const int group_x = get_group_id(0); \
const int group_y = get_group_id(1); \
Dtype8 blockAxB00 = 0; \
Dtype8 blockAxB01 = 0; \
Dtype8 blockAxB02 = 0; \
Dtype8 blockAxB03 = 0; \
int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 ); \
int2 coordB = (int2)( 0, ( group_x * TILE_N )); \
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \
do \
{ \
Dtype8 blockB00; \
BLOCKB_READ8(blockB00, B, coordB); \
int2 coordATemp = coordA; \
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 16 * SIZE_OF_ELEMENT;\
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K;\
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0); \
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA00, blockB00, 8); \
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA01, blockB00, 0); \
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA01, blockB00, 8); \
} \
while( coordB.x < padded_k / VECSIZE ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0);\
}
#else
#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
@ -524,6 +860,7 @@ __kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, D
while( coordB.x < padded_k / VECSIZE ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0);\
}
#endif

#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
@ -540,12 +877,21 @@ GEMM_TT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0
#undef BLOCKB_READ8
#undef MATB_PARAMETER

#if TYPE == TYPE_HALF
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
const __global float *B_read = (__global float *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \
_blockb = as_Dtype8(as_ushort8(vload4(0, B_read))); \
_coordB.x += TILE_K;
#else
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \
_blockb = vload8(0, B_read); \
_coordB.x += TILE_K;
#endif

#define MATB_PARAMETER __global Dtype *B, int offB, int ldb

@ -598,7 +944,7 @@ GEMM_TT(0, 1, SCALAR, 1) // ALPHA != 1, BETA != 0
#undef READ_IMAGE
#undef SIZE_OF_ELEMENT

__kernel void TEMPLATE(gemm_buffer_copy_image_transpose,Dtype)(
__kernel void TEMPLATE(gemm_buffer_copy_image_transpose, Dtype)(
__global Dtype* A,
__write_only image2d_t ImA,
int offA,
@ -611,10 +957,14 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_transpose,Dtype)(
int2 coord_dst = (int2)(gidx, gidy);
__global Dtype* A_off = A + offA;
Dtype srcA = A_off[gidy * ldA + gidx];
#if TYPE == TYPE_HALF
write_imageh(ImA, coord_dst, (Dtype4)srcA);
#else
write_imagef(ImA, coord_dst, (Dtype4)srcA);
#endif
}

__kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose,Dtype)(
__kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)(
__global Dtype* A,
__write_only image2d_t ImA,
int offA,
@ -625,6 +975,14 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose,Dtype)(
const int gidx = get_global_id(0);
const int gidy = get_global_id(1);
int2 coord_dst = (int2)(gidx, gidy);
#if TYPE == TYPE_HALF
if (gidx >= width || gidy >= height) {
write_imageh(ImA, coord_dst, 0);
return;
}
__global Dtype* A_off = A + offA;
write_imageh(ImA, coord_dst, A_off[gidy * ldA + gidx]);
#else
if (gidx >= width || gidy >= height) {
write_imageui(ImA, coord_dst, (uint4)0);
return;
@ -632,4 +990,5 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose,Dtype)(
__global Dtype* A_off = A + offA;
uint4 srcA = convert_uint4(as_uchar4(A_off[gidy * ldA + gidx]));
write_imageui(ImA, coord_dst, srcA);
#endif
}
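
Everything in this new file is compiled twice, once per precision: the host selects TYPE_HALF or TYPE_FLOAT through a build option and asks for the matching mangled kernel name. A hedged sketch of that dispatch (names and structure are illustrative; the real callers live in the ocl4dnn GEMM helpers, which are not part of this hunk):

    #include <opencv2/core/ocl.hpp>

    static cv::ocl::Kernel makeGemmNN(const cv::String& gemmBufferSource,
                                      bool useHalf, bool alpha1, bool betaNot0)
    {
        // TYPE selects the half/float macro blocks inside gemm_buffer.cl.
        cv::String opts = cv::format("-D TYPE=%d", useHalf ? 2 : 1); // TYPE_HALF : TYPE_FLOAT
        cv::String kname = cv::format("gemm_32_1_NN_%d_%d_%s",
                                      alpha1 ? 1 : 0, betaNot0 ? 1 : 0,
                                      useHalf ? "half" : "float");
        return cv::ocl::Kernel(kname.c_str(), cv::ocl::ProgramSource(gemmBufferSource), opts);
    }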

@ -40,16 +40,20 @@
//
//M*/

#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float
#define KERNEL_ARG_DTYPE float

__kernel void TEMPLATE(axpy,Dtype)(const int n, const Dtype alpha, __global const Dtype* x,
__kernel void TEMPLATE(axpy,Dtype)(const int n, const KERNEL_ARG_DTYPE alpha, __global const Dtype* x,
const int offx, __global Dtype* y,
const int offy) {
for (int index = get_global_id(0); index < n; index += get_global_size(0)) {
Dtype src = x[offx + index];
Dtype dst = y[offy + index];
y[offy + index] = alpha * src + dst;
y[offy + index] = convert_Dtype(alpha) * src + dst;
}
}

@ -39,41 +39,45 @@
//
//M*/

#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float
#define KERNEL_ARG_DTYPE float

__kernel void TEMPLATE(matvec_mul4,Dtype)(
__global const float * A,
__global const Dtype * A,
int offA,
unsigned int A_col_size,
unsigned int trail_item,
__global const float * v,
__global const Dtype * v,
int offv,
float alpha,
float beta,
__global float4 * result,
KERNEL_ARG_DTYPE alpha,
KERNEL_ARG_DTYPE beta,
__global Dtype4* result,
int offr,
__local float4 * work)
__local Dtype4* work)
{
unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0);
const __global float *src0_read = A + row_gid * 4 * A_col_size + offA;
const __global float *src1_read = v + offv;
result = (__global float4*)((__global float*)result + offr);
float4 dot0 = (float4)(0.f);
float4 dot1 = (float4)(0.f);
float4 dot2 = (float4)(0.f);
float4 dot3 = (float4)(0.f);
const __global Dtype *src0_read = A + row_gid * 4 * A_col_size + offA;
const __global Dtype *src1_read = v + offv;
result = (__global Dtype4*)((__global Dtype*)result + offr);
Dtype4 dot0 = (Dtype4)(0.f);
Dtype4 dot1 = (Dtype4)(0.f);
Dtype4 dot2 = (Dtype4)(0.f);
Dtype4 dot3 = (Dtype4)(0.f);

unsigned int i = lid;
while( i < A_col_size / 4) {
const float4 a0 = vload4(i, src0_read);
const float4 a1 = vload4(i, src0_read + A_col_size);
const float4 a2 = vload4(i, src0_read + 2 * A_col_size);
const float4 a3 = vload4(i, src0_read + 3 * A_col_size);
const Dtype4 a0 = vload4(i, src0_read);
const Dtype4 a1 = vload4(i, src0_read + A_col_size);
const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);

const float4 b0 = vload4(i, src1_read);
const Dtype4 b0 = vload4(i, src1_read);

dot0 += a0 * b0;
dot1 += a1 * b0;
@ -92,15 +96,15 @@ __kernel void TEMPLATE(matvec_mul4,Dtype)(
{
if(trail_item != 0)
{
const __global float *src0_trail = src0_read + i * 4;
const __global float *src1_trail = src1_read + i * 4;
const __global Dtype *src0_trail = src0_read + i * 4;
const __global Dtype *src1_trail = src1_read + i * 4;
for(unsigned int i = 0; i < trail_item; ++i) {
const float at0 = src0_trail[i];
const float at1 = src0_trail[i + A_col_size];
const float at2 = src0_trail[i + 2 * A_col_size];
const float at3 = src0_trail[i + 3 * A_col_size];
const Dtype at0 = src0_trail[i];
const Dtype at1 = src0_trail[i + A_col_size];
const Dtype at2 = src0_trail[i + 2 * A_col_size];
const Dtype at3 = src0_trail[i + 3 * A_col_size];

const float bt = src1_trail[i];
const Dtype bt = src1_trail[i];

work[lid].s0 += at0 * bt;
work[lid].s1 += at1 * bt;
@ -118,40 +122,40 @@ __kernel void TEMPLATE(matvec_mul4,Dtype)(
}
if(lid == 0) {
if(beta == (Dtype)0)
result[row_gid] = alpha * work[0];
result[row_gid] = convert_Dtype(alpha) * work[0];
else
result[row_gid] = alpha * work[0] + beta * result[row_gid];
result[row_gid] = convert_Dtype(alpha) * work[0] + convert_Dtype(beta) * result[row_gid];
}
}

/* This kernel used for the trailing rows when row_of_A %4 !=0 */
__kernel void TEMPLATE(matvec_mul1,Dtype)(
__global const float * A,
__global const Dtype * A,
int offA,
unsigned int A_col_size,
unsigned int row_offset,
unsigned int trail_item,
__global const float * v,
__global const Dtype * v,
int offv,
float alpha,
float beta,
__global float * result,
KERNEL_ARG_DTYPE alpha,
KERNEL_ARG_DTYPE beta,
__global Dtype * result,
int offr,
__local float * work)
__local Dtype * work)
{
unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0);

const __global float *src0_read = A + (row_offset + row_gid) * A_col_size + offA;
const __global float *src1_read = v + + offv;
const __global Dtype *src0_read = A + (row_offset + row_gid) * A_col_size + offA;
const __global Dtype *src1_read = v + + offv;
result = result + offr;
float4 dot0 = (float4)(0.f);
Dtype4 dot0 = (Dtype4)(0.f);

unsigned int i = lid;
while( i < A_col_size / 4)
{
const float4 a0 = vload4(i, src0_read);
const float4 b0 = vload4(i, src1_read);
const Dtype4 a0 = vload4(i, src0_read);
const Dtype4 b0 = vload4(i, src1_read);

dot0 += a0 * b0;
i += get_local_size(0);
@ -163,11 +167,11 @@ __kernel void TEMPLATE(matvec_mul1,Dtype)(
{
if(trail_item != 0)
{
const __global float *src0_trail = src0_read + i * 4;
const __global float *src1_trail = src1_read + i * 4;
const __global Dtype *src0_trail = src0_read + i * 4;
const __global Dtype *src1_trail = src1_read + i * 4;
for(unsigned int i = 0; i < trail_item; ++i) {
const float at0 = src0_trail[i];
const float bt = src1_trail[i];
const Dtype at0 = src0_trail[i];
const Dtype bt = src1_trail[i];

work[lid] += at0 * bt;
}
@ -182,10 +186,10 @@ __kernel void TEMPLATE(matvec_mul1,Dtype)(

if(lid == 0) {
if(beta == (Dtype)0) {
result[row_gid+row_offset] = alpha * work[0];
result[row_gid+row_offset] = convert_Dtype(alpha) * work[0];
} else {
result[row_gid+row_offset] *= beta;
result[row_gid+row_offset] += alpha * work[0];
result[row_gid+row_offset] *= convert_Dtype(beta);
result[row_gid+row_offset] += convert_Dtype(alpha) * work[0];
}
}
}
|
||||
|
@@ -40,7 +40,11 @@
//
//M*/

#define Dtype float
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#define Dtype float
#define Dtype4 float4
#define Dtype8 float8

@@ -135,17 +139,17 @@ __kernel void MVN(__global const Dtype* src,
store(dst_vec, dst, index);
}

__kernel void MEAN_FUSE(__global const Dtype * A,
__kernel void MEAN_FUSE(__global const T * A,
unsigned int A_col_size,
float alpha,
__global Dtype4 * result,
__global Dtype * B,
__global T4 * mean,
__global Dtype * tmp,
__local Dtype4 * work)
{
unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0);
const __global Dtype *src0_read = A + row_gid * 4 * A_col_size;
__global Dtype *dst0_read = B + row_gid * 4 * A_col_size;
const __global T *src0_read = A + row_gid * 4 * A_col_size;
__global Dtype *dst0_read = tmp + row_gid * 4 * A_col_size;
Dtype4 dot0, dot1, dot2, dot3;
dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);

@@ -153,15 +157,15 @@ __kernel void MEAN_FUSE(__global const Dtype * A,
const Dtype4 b0 = (Dtype4)1.f;
while( i < A_col_size / 4)
{
const Dtype4 a0 = vload4(i, src0_read);
const Dtype4 a1 = vload4(i, src0_read + A_col_size);
const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);
const T4 a0 = vload4(i, src0_read);
const T4 a1 = vload4(i, src0_read + A_col_size);
const T4 a2 = vload4(i, src0_read + 2 * A_col_size);
const T4 a3 = vload4(i, src0_read + 3 * A_col_size);

dot0 += a0;
dot1 += a1;
dot2 += a2;
dot3 += a3;
dot0 += convert_float4(a0);
dot1 += convert_float4(a1);
dot2 += convert_float4(a2);
dot3 += convert_float4(a3);

i += get_local_size(0);
}
@@ -181,22 +185,22 @@ __kernel void MEAN_FUSE(__global const Dtype * A,

if(lid == 0)
{
result[row_gid] = alpha * work[0];
mean[row_gid] = convert_T(alpha * work[0]);
}

Dtype4 sum = work[0] * alpha;
i = lid;
while( i < A_col_size / 4)
{
const Dtype4 a0 = vload4(i, src0_read);
const Dtype4 a1 = vload4(i, src0_read + A_col_size);
const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);
const T4 a0 = vload4(i, src0_read);
const T4 a1 = vload4(i, src0_read + A_col_size);
const T4 a2 = vload4(i, src0_read + 2 * A_col_size);
const T4 a3 = vload4(i, src0_read + 3 * A_col_size);

dot0 = native_powr(a0 - (Dtype4)sum.x, 2);
dot1 = native_powr(a1 - (Dtype4)sum.y, 2);
dot2 = native_powr(a2 - (Dtype4)sum.z, 2);
dot3 = native_powr(a3 - (Dtype4)sum.w, 2);
dot0 = native_powr(convert_float4(a0) - (Dtype4)sum.x, 2);
dot1 = native_powr(convert_float4(a1) - (Dtype4)sum.y, 2);
dot2 = native_powr(convert_float4(a2) - (Dtype4)sum.z, 2);
dot3 = native_powr(convert_float4(a3) - (Dtype4)sum.w, 2);

vstore4(dot0, i, dst0_read);
vstore4(dot1, i, dst0_read + A_col_size);
@@ -208,22 +212,22 @@ __kernel void MEAN_FUSE(__global const Dtype * A,
}

__kernel void MVN_FUSE(__global const Dtype * tmp,
__global const Dtype * A,
__global const Dtype4 * mean,
__global const T * A,
__global const T4 * mean,
unsigned int A_col_size,
const float alpha_val,
const float eps,
const float relu_slope,
__global const Dtype4 * bnorm_weight,
__global const Dtype4 * bnorm_bias,
__global Dtype * B,
__global T * B,
__local Dtype4 * work)
{
unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0);
const __global Dtype *src0_read = tmp + row_gid * 4 * A_col_size;
const __global Dtype *src1_read = A + row_gid * 4 * A_col_size;
__global Dtype *dst0_read = B + row_gid * 4 * A_col_size;
const __global T *src1_read = A + row_gid * 4 * A_col_size;
__global T *dst0_read = B + row_gid * 4 * A_col_size;
Dtype4 dot0, dot1, dot2, dot3;
dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);

@@ -257,7 +261,7 @@ __kernel void MVN_FUSE(__global const Dtype * tmp,
}
barrier(CLK_LOCAL_MEM_FENCE);

Dtype4 mean_val = mean[row_gid];
Dtype4 mean_val = convert_float4(mean[row_gid]);
Dtype4 dev_val = sqrt(work[0] * alpha_val) + (Dtype4)eps;
Dtype4 alpha = (Dtype4)1.f / dev_val;

@@ -271,15 +275,15 @@ __kernel void MVN_FUSE(__global const Dtype * tmp,
i = lid;
while( i < A_col_size / 4)
{
const Dtype4 a0 = vload4(i, src1_read);
const Dtype4 a1 = vload4(i, src1_read + A_col_size);
const Dtype4 a2 = vload4(i, src1_read + 2 * A_col_size);
const Dtype4 a3 = vload4(i, src1_read + 3 * A_col_size);
const T4 a0 = vload4(i, src1_read);
const T4 a1 = vload4(i, src1_read + A_col_size);
const T4 a2 = vload4(i, src1_read + 2 * A_col_size);
const T4 a3 = vload4(i, src1_read + 3 * A_col_size);

dot0 = (a0 - (Dtype4)mean_val.x) * alpha.x;
dot1 = (a1 - (Dtype4)mean_val.y) * alpha.y;
dot2 = (a2 - (Dtype4)mean_val.z) * alpha.z;
dot3 = (a3 - (Dtype4)mean_val.w) * alpha.w;
dot0 = (convert_float4(a0) - (Dtype4)mean_val.x) * alpha.x;
dot1 = (convert_float4(a1) - (Dtype4)mean_val.y) * alpha.y;
dot2 = (convert_float4(a2) - (Dtype4)mean_val.z) * alpha.z;
dot3 = (convert_float4(a3) - (Dtype4)mean_val.w) * alpha.w;

dot0 = dot0 * w.x + (Dtype4)b.x;
dot1 = dot1 * w.y + (Dtype4)b.y;
@@ -300,10 +304,10 @@ __kernel void MVN_FUSE(__global const Dtype * tmp,
dot3 = select(new3, dot3, dot3 > (Dtype4)0.f);
#endif

vstore4(dot0, i, dst0_read);
vstore4(dot1, i, dst0_read + A_col_size);
vstore4(dot2, i, dst0_read + 2 * A_col_size);
vstore4(dot3, i, dst0_read + 3 * A_col_size);
vstore4(convert_T(dot0), i, dst0_read);
vstore4(convert_T(dot1), i, dst0_read + A_col_size);
vstore4(convert_T(dot2), i, dst0_read + 2 * A_col_size);
vstore4(convert_T(dot3), i, dst0_read + 3 * A_col_size);

i += get_local_size(0);
}
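Note the pattern in MEAN_FUSE/MVN_FUSE above: inputs may be stored in fp16 (type T in the kernel), but the mean/variance sums are widened with convert_float4 so accumulation stays in fp32. A CPU sketch of the same idea (the conversion callback stands in for the device-side convert_float4 and is an assumption of this sketch):

#include <cstdint>
#include <vector>

// Accumulate half-precision data in a float accumulator so long sums
// do not lose low-order bits; only storage is fp16.
float meanInFloat(const std::vector<uint16_t>& halfData,
                  float (*half_to_float)(uint16_t))
{
    float sum = 0.f;                 // fp32 accumulator
    for (uint16_t h : halfData)
        sum += half_to_float(h);     // widen each element before adding
    return sum / (float)halfData.size();
}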
@@ -42,14 +42,18 @@

#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float
#define KERNEL_ARG_DTYPE float

#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

__kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global const Dtype* in,
const int num, const int channels,
const int height, const int width, const int size,
const Dtype alpha_over_size, const Dtype k,
const KERNEL_ARG_DTYPE alpha_over_size, const KERNEL_ARG_DTYPE k,
__global Dtype* const out,
const Dtype negative_beta) {
const KERNEL_ARG_DTYPE negative_beta) {
for (int index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
// find out the local offset
@@ -60,11 +64,11 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con
const int step = height * width;
__global const Dtype* in_off = in + offset;
__global Dtype* out_off = out + offset;
Dtype scale_val;
KERNEL_ARG_DTYPE scale_val;
int head = 0;
const int pre_pad = (size - 1) / 2;
const int post_pad = size - pre_pad - 1;
Dtype accum_scale = 0;
KERNEL_ARG_DTYPE accum_scale = 0;
// fill the scale at [n, :, h, w]
// accumulate values
while (head < post_pad && head < channels) {
@@ -79,7 +83,7 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con
* in_off[(head - size) * step];
}
scale_val = k + accum_scale * alpha_over_size;
out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta);
out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((Dtype)scale_val, (Dtype)negative_beta);
++head;
}
// subtract only
@@ -89,7 +93,7 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con
* in_off[(head - size) * step];
}
scale_val = k + accum_scale * alpha_over_size;
out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta);
out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((Dtype)scale_val, (Dtype)negative_beta);
++head;
}
}
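For reference, the across-channel LRN this kernel implements computes out = in * (k + alpha/size * sum(in^2))^(-beta) over a sliding window of channels. A scalar C++ sketch of the same formula (shapes and names are illustrative only):

#include <algorithm>
#include <cmath>
#include <vector>

// Scalar reference for across-channel LRN at one spatial position.
// data[c]: input values across channels; size: channel window width.
std::vector<float> lrnRef(const std::vector<float>& data, int size,
                          float alpha, float beta, float k)
{
    const int C = (int)data.size();
    std::vector<float> out(C);
    for (int c = 0; c < C; ++c)
    {
        float accum = 0.f;
        for (int j = std::max(0, c - (size - 1) / 2);
             j <= std::min(C - 1, c + size / 2); ++j)
            accum += data[j] * data[j];                 // sum of squares in window
        const float scale = k + accum * alpha / size;
        out[c] = data[c] * std::pow(scale, -beta);      // matches native_powr(scale, negative_beta)
    }
    return out;
}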
@@ -42,7 +42,10 @@

#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float

#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#if defined KERNEL_MAX_POOL
|
@ -40,7 +40,9 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#define Dtype float
|
||||
#if defined(cl_khr_fp16)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
__kernel void permute(const int nthreads,
|
||||
__global Dtype* bottom_data,
|
||||
|
@@ -39,17 +39,18 @@
//
//M*/

#define Dtype float
#define Dtype4 float4
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

__kernel void prior_box(const int nthreads,
const Dtype stepX,
const Dtype stepY,
__global const Dtype* _offsetsX,
__global const Dtype* _offsetsY,
const float stepX,
const float stepY,
__global const float* _offsetsX,
__global const float* _offsetsY,
const int offsetsX_size,
__global const Dtype* _widths,
__global const Dtype* _heights,
__global const float* _widths,
__global const float* _heights,
const int widths_size,
__global Dtype* dst,
const int _layerHeight,
@@ -65,7 +66,7 @@ __kernel void prior_box(const int nthreads,

outputPtr = dst + index * 4 * offsetsX_size * widths_size;

Dtype _boxWidth, _boxHeight;
float _boxWidth, _boxHeight;
Dtype4 vec;
for (int i = 0; i < widths_size; ++i)
{
@@ -73,8 +74,8 @@ __kernel void prior_box(const int nthreads,
_boxHeight = _heights[i];
for (int j = 0; j < offsetsX_size; ++j)
{
float center_x = (w + _offsetsX[j]) * stepX;
float center_y = (h + _offsetsY[j]) * stepY;
Dtype center_x = (w + _offsetsX[j]) * (Dtype)stepX;
Dtype center_y = (h + _offsetsY[j]) * (Dtype)stepY;

vec.x = (center_x - _boxWidth * 0.5f) / imgWidth; // xmin
vec.y = (center_y - _boxHeight * 0.5f) / imgHeight; // ymin
@@ -91,7 +92,7 @@ __kernel void prior_box(const int nthreads,
__kernel void set_variance(const int nthreads,
const int offset,
const int variance_size,
__global const Dtype* variance,
__global const float* variance,
__global Dtype* dst)
{
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
@@ -101,7 +102,7 @@ __kernel void set_variance(const int nthreads,
if (variance_size == 1)
var_vec = (Dtype4)(variance[0]);
else
var_vec = vload4(0, variance);
var_vec = convert_T(vload4(0, variance));

vstore4(var_vec, 0, dst + offset + index * 4);
}
@@ -39,6 +39,10 @@
//
//M*/

#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

__kernel void reorg(const int count,
__global const Dtype* src,
const int channels,
@@ -40,9 +40,9 @@
//
//M*/

#define Dtype float
#define Dtype4 float4
#define Dtype8 float8
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

__kernel void slice(__global const Dtype* src,
const int src_plane_size,
@@ -24,6 +24,10 @@
* POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/

#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

__kernel void kernel_channel_max(const int num, const int channels,
const int spatial_dim, __global const T* data, __global T* out) {
int index = get_global_id(0);
@@ -40,12 +44,12 @@ __kernel void kernel_channel_max(const int num, const int channels,

__kernel void kernel_channel_subtract(const int count,
const int num, const int channels,
const int spatial_dim, __global const T* channel_max, __global T* data) {
const int spatial_dim, __global const T* channel_max, __global const T* src, __global T* data) {
int index = get_global_id(0);
if(index < count) {
int n = index / channels / spatial_dim;
int s = index % spatial_dim;
data[index] -= channel_max[n * spatial_dim + s];
data[index] = exp(src[index] - channel_max[n * spatial_dim + s]);
}
}
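The reworked kernel_channel_subtract fuses the subtract-max and exponentiation steps of softmax into one pass. A minimal CPU sketch of the same numerically stable computation (illustrative only, not the kernel's host code):

#include <algorithm>
#include <cmath>
#include <vector>

// Stable softmax over one channel vector: subtract the max before
// exponentiating so exp() never overflows, then normalize.
std::vector<float> softmaxRef(const std::vector<float>& x)
{
    const float m = *std::max_element(x.begin(), x.end());
    std::vector<float> y(x.size());
    float sum = 0.f;
    for (size_t i = 0; i < x.size(); ++i)
    {
        y[i] = std::exp(x[i] - m);   // fused subtract + exp, as in the kernel
        sum += y[i];
    }
    for (float& v : y)
        v /= sum;
    return y;
}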
@@ -42,12 +42,15 @@

#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float

#if defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#endif

#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

__kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int channels,
const int spatial_dim,
__global Dtype* scale,
@@ -60,12 +63,12 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
int n = get_global_id(1);
for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
get_global_size(0), ++s) {
float maxval = -FLT_MAX;
Dtype maxval = -DTYPE_MAX;
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
Dtype tmp = data[(n * channels + c) * spatial_dim + s];
maxval = max((Dtype)tmp, (Dtype)maxval);
}
maxval = sub_group_reduce_max(maxval * 100000);
maxval = sub_group_reduce_max(maxval);
//if (get_sub_group_local_id() == 0)
group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
}
@@ -77,7 +80,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
int s = index / get_max_sub_group_size();
Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0)
scale_tmp[s] = maxval / 100000;
scale_tmp[s] = maxval;
}

barrier(CLK_LOCAL_MEM_FENCE);
@@ -95,7 +98,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
sum += out_tmp[c * spatial_dim + s];
}
sum = sub_group_reduce_add(sum * 100000);
sum = sub_group_reduce_add(sum);
group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -105,7 +108,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
int s = index / get_max_sub_group_size();
Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0)
scale_tmp[s] = sum / 100000;
scale_tmp[s] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);

@@ -130,12 +133,12 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
__global Dtype *group_tmp = scale + spatial_dim * num + n * get_max_sub_group_size() * spatial_dim;
for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
get_global_size(0), ++s) {
float maxval = -FLT_MAX;
Dtype maxval = -DTYPE_MAX;
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
Dtype tmp = data[(n * channels + c) * spatial_dim + s];
maxval = max((Dtype)tmp, (Dtype)maxval);
}
maxval = sub_group_reduce_max(maxval * 100000);
maxval = sub_group_reduce_max(maxval);
//if (get_sub_group_local_id() == 0)
group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
}
@@ -146,7 +149,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
int s = index / get_max_sub_group_size();
Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0)
scale[n * spatial_dim + s] = maxval / 100000;
scale[n * spatial_dim + s] = maxval;
}

barrier(CLK_GLOBAL_MEM_FENCE);
@@ -164,7 +167,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
sum += out[n * channels * spatial_dim + c * spatial_dim + s];
}
sum = sub_group_reduce_add(sum * 100000);
sum = sub_group_reduce_add(sum);
group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
}
barrier(CLK_GLOBAL_MEM_FENCE);
@@ -174,7 +177,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
int s = index / get_max_sub_group_size();
Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0)
scale[n * spatial_dim + s] = sum / 100000;
scale[n * spatial_dim + s] = sum;
}
barrier(CLK_GLOBAL_MEM_FENCE);
@@ -64,6 +64,7 @@

namespace cv { namespace dnn {
CV__DNN_EXPERIMENTAL_NS_BEGIN
#define IS_DNN_OPENCL_TARGET(id) (id == DNN_TARGET_OPENCL || id == DNN_TARGET_OPENCL_FP16)
Mutex& getInitializationMutex();
void initializeLayerFactory();
CV__DNN_EXPERIMENTAL_NS_END
@@ -538,6 +538,37 @@ public:
}
};

// In case of resizing by factor.
class ResizeBilinearSubgraph : public Subgraph
{
public:
ResizeBilinearSubgraph()
{
int input = addNodeToMatch("");

int shape = addNodeToMatch("Shape", input);
int stack = addNodeToMatch("Const");
int stack_1 = addNodeToMatch("Const");
int stack_2 = addNodeToMatch("Const");
int strided_slice = addNodeToMatch("StridedSlice", shape, stack, stack_1, stack_2);
int factorY = addNodeToMatch("Const");
int mul = addNodeToMatch("Mul", strided_slice, factorY);

shape = addNodeToMatch("Shape", input);
stack = addNodeToMatch("Const");
stack_1 = addNodeToMatch("Const");
stack_2 = addNodeToMatch("Const");
strided_slice = addNodeToMatch("StridedSlice", shape, stack, stack_1, stack_2);
int factorX = addNodeToMatch("Const");
int mul_1 = addNodeToMatch("Mul", strided_slice, factorX);

int pack = addNodeToMatch("Pack", mul, mul_1);

addNodeToMatch("ResizeBilinear", input, pack);
setFusedNode("ResizeBilinear", input, factorY, factorX);
}
};

void simplifySubgraphs(tensorflow::GraphDef& net)
{
std::vector<Ptr<Subgraph> > subgraphs;
@@ -551,6 +582,7 @@ void simplifySubgraphs(tensorflow::GraphDef& net)
subgraphs.push_back(Ptr<Subgraph>(new L2NormalizeSubgraph()));
subgraphs.push_back(Ptr<Subgraph>(new DeconvolutionValidKerasSubgraph()));
subgraphs.push_back(Ptr<Subgraph>(new DeconvolutionSameKerasSubgraph()));
subgraphs.push_back(Ptr<Subgraph>(new ResizeBilinearSubgraph()));

int numNodes = net.node_size();
std::vector<int> matchedNodesIds;
@@ -767,6 +767,26 @@ void TFImporter::populateNet(Net dstNet)
}
}
}
else if (type == "Sub")
{
bool haveConst = false;
for(int ii = 0; !haveConst && ii < layer.input_size(); ++ii)
{
Pin input = parsePin(layer.input(ii));
haveConst = value_id.find(input.name) != value_id.end();
}
CV_Assert(haveConst);

layerParams.blobs.resize(1);
blobFromTensor(getConstBlob(layer, value_id), layerParams.blobs[0]);
layerParams.blobs[0] *= -1;

int id = dstNet.addLayer(name, "Shift", layerParams);
layer_id[name] = id;

// one input only
connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
}
else if (type == "MatMul")
{
CV_Assert(layer.input_size() == 2);
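The importer maps TensorFlow's Sub(x, c) with a constant c onto the existing Shift layer by negating the constant, since x - c equals x + (-c) exactly in IEEE arithmetic. A tiny sketch of that identity with cv::Mat (data values are illustrative only):

#include <opencv2/core.hpp>

int main()
{
    cv::Mat x = (cv::Mat_<float>(1, 3) << 1.f, 2.f, 3.f);
    cv::Mat c = (cv::Mat_<float>(1, 3) << 0.5f, 0.5f, 0.5f);

    cv::Mat sub = x - c;        // what the TF graph expresses
    cv::Mat shift = x + (-c);   // what the imported Shift layer computes

    CV_Assert(cv::countNonZero(sub != shift) == 0);  // bitwise-identical results
    return 0;
}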
@@ -147,7 +147,9 @@ TEST_P(DNNTestNetwork, Inception_5h)

TEST_P(DNNTestNetwork, ENet)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE) throw SkipTestException("");
if ((backend == DNN_BACKEND_INFERENCE_ENGINE) ||
    (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16))
throw SkipTestException("");
processNet("dnn/Enet-model-best.net", "", Size(512, 512), "l367_Deconvolution",
target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_enet.yml" :
"dnn/halide_scheduler_enet.yml",
@@ -161,9 +163,11 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe)
throw SkipTestException("");
Mat sample = imread(findDataFile("dnn/street.png", false));
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
float l1 = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.0007 : 0.0;
float lInf = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.011 : 0.0;

processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt",
inp, "detection_out");
inp, "detection_out", "", l1, lInf);
}

TEST_P(DNNTestNetwork, MobileNet_SSD_TensorFlow)
@@ -173,15 +177,17 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_TensorFlow)
throw SkipTestException("");
Mat sample = imread(findDataFile("dnn/street.png", false));
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
float l1 = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.008 : 0.0;
float lInf = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.06 : 0.0;
processNet("dnn/ssd_mobilenet_v1_coco.pb", "dnn/ssd_mobilenet_v1_coco.pbtxt",
inp, "detection_out");
inp, "detection_out", "", l1, lInf);
}

TEST_P(DNNTestNetwork, SSD_VGG16)
{
if (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL ||
backend == DNN_BACKEND_HALIDE && target == DNN_TARGET_CPU ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU)
if ((backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ||
    (backend == DNN_BACKEND_HALIDE && target == DNN_TARGET_CPU) ||
    (backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU))
throw SkipTestException("");
processNet("dnn/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel",
"dnn/ssd_vgg16.prototxt", Size(300, 300), "detection_out");
@@ -236,14 +242,17 @@ TEST_P(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
throw SkipTestException("");
Mat sample = imread(findDataFile("dnn/street.png", false));
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
float l1 = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.008 : 0.0;
float lInf = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.07 : 0.0;
processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "dnn/ssd_inception_v2_coco_2017_11_17.pbtxt",
inp, "detection_out");
inp, "detection_out", "", l1, lInf);
}

TEST_P(DNNTestNetwork, DenseNet_121)
{
if (backend == DNN_BACKEND_HALIDE ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
if ((backend == DNN_BACKEND_HALIDE) ||
    (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ||
    (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16))
throw SkipTestException("");
processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", Size(224, 224), "", "caffe");
}
@@ -258,7 +267,8 @@ const tuple<DNNBackend, DNNTarget> testCases[] = {
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
#endif
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL)
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL_FP16)
};

INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, testing::ValuesIn(testCases));
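The FP16 branches above relax the comparison thresholds, since half precision cannot reproduce fp32 references exactly. A sketch of the kind of metric these tests bound (the real normAssert helper lives in the dnn test utilities; this only illustrates the idea):

#include <opencv2/core.hpp>

// Mean absolute difference and worst-case difference between a result
// and its fp32 reference; fp16 targets get looser bounds for both.
void checkClose(const cv::Mat& ref, const cv::Mat& out,
                double l1Bound, double lInfBound)
{
    double l1 = cv::norm(ref, out, cv::NORM_L1) / ref.total();  // average error
    double lInf = cv::norm(ref, out, cv::NORM_INF);             // worst element
    CV_Assert(l1 <= l1Bound && lInf <= lInfBound);
}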
@@ -104,7 +104,11 @@ TEST_P(Reproducibility_AlexNet, Accuracy)
ASSERT_FALSE(net.empty());
}

net.setPreferableTarget(get<1>(GetParam()));
int targetId = get<1>(GetParam());
const float l1 = 1e-5;
const float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 3e-3 : 1e-4;

net.setPreferableTarget(targetId);

Mat sample = imread(_tf("grace_hopper_227.png"));
ASSERT_TRUE(!sample.empty());
@@ -112,10 +116,11 @@ TEST_P(Reproducibility_AlexNet, Accuracy)
net.setInput(blobFromImage(sample, 1.0f, Size(227, 227), Scalar(), false), "data");
Mat out = net.forward("prob");
Mat ref = blobFromNPY(_tf("caffe_alexnet_prob.npy"));
normAssert(ref, out);
normAssert(ref, out, "", l1, lInf);
}

INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_AlexNet, Combine(testing::Bool(), availableDnnTargets()));
INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_AlexNet, Combine(testing::Bool(),
Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16)));

#if !defined(_WIN32) || defined(_WIN64)
TEST(Reproducibility_FCN, Accuracy)
@@ -176,8 +181,11 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
const string proto = findDataFile("dnn/MobileNetSSD_deploy.prototxt", false);
const string model = findDataFile("dnn/MobileNetSSD_deploy.caffemodel", false);
Net net = readNetFromCaffe(proto, model);
int targetId = GetParam();
const float l1 = (targetId == DNN_TARGET_OPENCL_FP16) ? 1.5e-4 : 1e-5;
const float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 4e-4 : 1e-4;

net.setPreferableTarget(GetParam());
net.setPreferableTarget(targetId);

Mat sample = imread(_tf("street.png"));

@@ -185,8 +193,10 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
net.setInput(inp);
Mat out = net.forward();

const float scores_diff = (targetId == DNN_TARGET_OPENCL_FP16) ? 4e-4 : 1e-5;
const float boxes_iou_diff = (targetId == DNN_TARGET_OPENCL_FP16) ? 5e-3 : 1e-4;
Mat ref = blobFromNPY(_tf("mobilenet_ssd_caffe_out.npy"));
normAssertDetections(ref, out);
normAssertDetections(ref, out, "", 0.0, scores_diff, boxes_iou_diff);

// Check that detections aren't preserved.
inp.setTo(0.0f);
@@ -212,10 +222,12 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
// a single sample in batch. The first numbers of detection vectors are batch id.
outBatch = outBatch.reshape(1, outBatch.total() / 7);
EXPECT_EQ(outBatch.rows, 2 * numDetections);
normAssert(outBatch.rowRange(0, numDetections), ref);
normAssert(outBatch.rowRange(numDetections, 2 * numDetections).colRange(1, 7), ref.colRange(1, 7));
normAssert(outBatch.rowRange(0, numDetections), ref, "", l1, lInf);
normAssert(outBatch.rowRange(numDetections, 2 * numDetections).colRange(1, 7), ref.colRange(1, 7),
"", l1, lInf);
}
INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_MobileNet_SSD, availableDnnTargets());
INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_MobileNet_SSD,
Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16));

typedef testing::TestWithParam<DNNTarget> Reproducibility_ResNet50;
TEST_P(Reproducibility_ResNet50, Accuracy)
@@ -226,6 +238,9 @@ TEST_P(Reproducibility_ResNet50, Accuracy)
int targetId = GetParam();
net.setPreferableTarget(targetId);

float l1 = (targetId == DNN_TARGET_OPENCL_FP16) ? 3e-5 : 1e-5;
float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 6e-3 : 1e-4;

Mat input = blobFromImage(imread(_tf("googlenet_0.png")), 1.0f, Size(224,224), Scalar(), false);
ASSERT_TRUE(!input.empty());

@@ -233,20 +248,21 @@ TEST_P(Reproducibility_ResNet50, Accuracy)
Mat out = net.forward();

Mat ref = blobFromNPY(_tf("resnet50_prob.npy"));
normAssert(ref, out);
normAssert(ref, out, "", l1, lInf);

if (targetId == DNN_TARGET_OPENCL)
if (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16)
{
UMat out_umat;
net.forward(out_umat);
normAssert(ref, out_umat, "out_umat");
normAssert(ref, out_umat, "out_umat", l1, lInf);

std::vector<UMat> out_umats;
net.forward(out_umats);
normAssert(ref, out_umats[0], "out_umat_vector");
normAssert(ref, out_umats[0], "out_umat_vector", l1, lInf);
}
}
INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_ResNet50, availableDnnTargets());
INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_ResNet50,
Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16));

typedef testing::TestWithParam<DNNTarget> Reproducibility_SqueezeNet_v1_1;
TEST_P(Reproducibility_SqueezeNet_v1_1, Accuracy)
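For users, enabling the new FP16 path is a one-line change on a loaded network. A minimal sketch (model paths are placeholders):

#include <opencv2/dnn.hpp>

int main()
{
    // Load any supported model (paths are illustrative).
    cv::dnn::Net net = cv::dnn::readNetFromCaffe("deploy.prototxt", "model.caffemodel");

    // Run OpenCL kernels in half precision; weights are converted once at setup.
    net.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL_FP16);

    int sz[] = {1, 3, 224, 224};
    cv::Mat blob(4, sz, CV_32F, cv::Scalar(0));  // dummy NCHW input
    net.setInput(blob);
    cv::Mat out = net.forward();
    return 0;
}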
@@ -295,26 +295,32 @@ TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8)

INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_nets, availableDnnTargets());

typedef testing::TestWithParam<DNNTarget> Test_TensorFlow_fp16;

TEST_P(Test_TensorFlow_fp16, tests)
{
int targetId = GetParam();
const float l1 = 7e-4;
const float lInf = 1e-2;
runTensorFlowNet("fp16_single_conv", targetId, false, l1, lInf);
runTensorFlowNet("fp16_deconvolution", targetId, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_odd_same", targetId, false, l1, lInf);
runTensorFlowNet("fp16_padding_valid", targetId, false, l1, lInf);
runTensorFlowNet("fp16_eltwise_add_mul", targetId, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_odd_valid", targetId, false, l1, lInf);
runTensorFlowNet("fp16_pad_and_concat", targetId, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_even", targetId, false, l1, lInf);
runTensorFlowNet("fp16_padding_same", targetId, false, l1, lInf);
}

INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_fp16,
Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16));

TEST(Test_TensorFlow, defun)
{
runTensorFlowNet("defun_dropout");
}

TEST(Test_TensorFlow, fp16)
{
const float l1 = 1e-3;
const float lInf = 1e-2;
runTensorFlowNet("fp16_single_conv", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_deconvolution", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_odd_same", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_padding_valid", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_eltwise_add_mul", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_odd_valid", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_pad_and_concat", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_even", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_padding_same", DNN_TARGET_CPU, false, l1, lInf);
}

TEST(Test_TensorFlow, quantized)
{
runTensorFlowNet("uint8_single_conv");
@@ -373,9 +379,24 @@ public:
ResizeBilinearLayer(const LayerParams &params) : Layer(params)
{
CV_Assert(!params.get<bool>("align_corners", false));
CV_Assert(blobs.size() == 1, blobs[0].type() == CV_32SC1);
outHeight = blobs[0].at<int>(0, 0);
outWidth = blobs[0].at<int>(0, 1);
CV_Assert(!blobs.empty());

for (size_t i = 0; i < blobs.size(); ++i)
    CV_Assert(blobs[i].type() == CV_32SC1);

if (blobs.size() == 1)
{
CV_Assert(blobs[0].total() == 2);
outHeight = blobs[0].at<int>(0, 0);
outWidth = blobs[0].at<int>(0, 1);
}
else
{
CV_Assert(blobs.size() == 2, blobs[0].total() == 1, blobs[1].total() == 1);
factorHeight = blobs[0].at<int>(0, 0);
factorWidth = blobs[1].at<int>(0, 0);
outHeight = outWidth = 0;
}
}

static Ptr<Layer> create(LayerParams& params)
@@ -391,12 +412,21 @@ public:
std::vector<int> outShape(4);
outShape[0] = inputs[0][0]; // batch size
outShape[1] = inputs[0][1]; // number of channels
outShape[2] = outHeight;
outShape[3] = outWidth;
outShape[2] = outHeight != 0 ? outHeight : (inputs[0][2] * factorHeight);
outShape[3] = outWidth != 0 ? outWidth : (inputs[0][3] * factorWidth);
outputs.assign(1, outShape);
return false;
}

virtual void finalize(const std::vector<Mat*>& inputs, std::vector<Mat> &outputs) CV_OVERRIDE
{
if (!outWidth && !outHeight)
{
outHeight = outputs[0].size[2];
outWidth = outputs[0].size[3];
}
}

// This implementation is based on a reference implementation from
// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
virtual void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals) CV_OVERRIDE
@@ -447,13 +477,51 @@ private:
return x + size[3] * (y + size[2] * (c + size[1] * b));
}

int outWidth, outHeight;
int outWidth, outHeight, factorWidth, factorHeight;
};

TEST(Test_TensorFlow, resize_bilinear)
{
CV_DNN_REGISTER_LAYER_CLASS(ResizeBilinear, ResizeBilinearLayer);
runTensorFlowNet("resize_bilinear");
runTensorFlowNet("resize_bilinear_factor");
LayerFactory::unregisterLayer("ResizeBilinear");
}

// inp = cv.imread('opencv_extra/testdata/cv/ximgproc/sources/08.png')
// inp = inp[:,:,[2, 1, 0]].astype(np.float32).reshape(1, 512, 512, 3)
// outs = sess.run([sess.graph.get_tensor_by_name('feature_fusion/Conv_7/Sigmoid:0'),
//                  sess.graph.get_tensor_by_name('feature_fusion/concat_3:0')],
//                 feed_dict={'input_images:0': inp})
// scores = np.ascontiguousarray(outs[0].transpose(0, 3, 1, 2))
// geometry = np.ascontiguousarray(outs[1].transpose(0, 3, 1, 2))
// np.save('east_text_detection.scores.npy', scores)
// np.save('east_text_detection.geometry.npy', geometry)
TEST(Test_TensorFlow, EAST_text_detection)
{
CV_DNN_REGISTER_LAYER_CLASS(ResizeBilinear, ResizeBilinearLayer);
std::string netPath = findDataFile("dnn/frozen_east_text_detection.pb", false);
std::string imgPath = findDataFile("cv/ximgproc/sources/08.png", false);
std::string refScoresPath = findDataFile("dnn/east_text_detection.scores.npy", false);
std::string refGeometryPath = findDataFile("dnn/east_text_detection.geometry.npy", false);

Net net = readNet(findDataFile("dnn/frozen_east_text_detection.pb", false));

Mat img = imread(imgPath);
Mat inp = blobFromImage(img, 1.0, Size(), Scalar(123.68, 116.78, 103.94), true, false);
net.setInput(inp);

std::vector<Mat> outs;
std::vector<String> outNames(2);
outNames[0] = "feature_fusion/Conv_7/Sigmoid";
outNames[1] = "feature_fusion/concat_3";
net.forward(outs, outNames);

Mat scores = outs[0];
Mat geometry = outs[1];

normAssert(scores, blobFromNPY(refScoresPath), "scores");
normAssert(geometry, blobFromNPY(refGeometryPath), "geometry", 5e-5, 1e-3);
LayerFactory::unregisterLayer("ResizeBilinear");
}
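The custom ResizeBilinearLayer above now supports resize-by-factor as well as an explicit output size. The core interpolation it performs, reduced to one channel, could look like this (a sketch under the align_corners=false convention, not the test's actual code):

#include <algorithm>
#include <opencv2/core.hpp>

// Bilinear sampling of a single-channel float image at fractional (fx, fy).
static float sampleBilinear(const cv::Mat& img, float fx, float fy)
{
    int x0 = (int)fx, y0 = (int)fy;
    int x1 = std::min(x0 + 1, img.cols - 1);     // clamp at the border
    int y1 = std::min(y0 + 1, img.rows - 1);
    float ax = fx - x0, ay = fy - y0;            // fractional weights
    return (1 - ay) * ((1 - ax) * img.at<float>(y0, x0) + ax * img.at<float>(y0, x1))
         +      ay  * ((1 - ax) * img.at<float>(y1, x0) + ax * img.at<float>(y1, x1));
}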
@@ -503,7 +503,7 @@ namespace cv{
// +-+-+-+
// |p|q|r|
// +-+-+-+
// |x|
// |x|
// +-+
const int w = imgLabels.cols, h = imgLabels.rows;

@@ -548,7 +548,7 @@ namespace cv{
// +-+-+-+
// |-|q|-|
// +-+-+-+
// |x|
// |x|
// +-+
const int w = imgLabels.cols, h = imgLabels.rows;

@@ -2473,9 +2473,9 @@ namespace cv{
// |P -|Q -|R -|
// |- -|- -|- -|
// +---+---+---+
// |X -|
// |- -|
// +---+
// |X -|
// |- -|
// +---+
const int w = imgLabels.cols, h = imgLabels.rows;

for (int r = chunksSizeAndLabels[0]; r < h; r = chunksSizeAndLabels[r]){
@@ -219,13 +219,15 @@ int rotatedRectangleIntersection( const RotatedRect& rect1, const RotatedRect& r
}
}

// Get rid of dupes
// Get rid of dupes and order points.
for( int i = 0; i < (int)intersection.size()-1; i++ )
{
float dx1 = intersection[i + 1].x - intersection[i].x;
float dy1 = intersection[i + 1].y - intersection[i].y;
for( size_t j = i+1; j < intersection.size(); j++ )
{
float dx = intersection[i].x - intersection[j].x;
float dy = intersection[i].y - intersection[j].y;
float dx = intersection[j].x - intersection[i].x;
float dy = intersection[j].y - intersection[i].y;
double d2 = dx*dx + dy*dy; // can be a really small number, need double here

if( d2 < samePointEps*samePointEps )
@@ -235,6 +237,12 @@ int rotatedRectangleIntersection( const RotatedRect& rect1, const RotatedRect& r
intersection.pop_back();
j--; // restart check
}
else if (dx1 * dy - dy1 * dx < 0)
{
std::swap(intersection[i + 1], intersection[j]);
dx1 = dx;
dy1 = dy;
}
}
}
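The new `dx1 * dy - dy1 * dx < 0` test is the z-component of a 2D cross product: it asks whether candidate point j lies clockwise of the current edge and, if so, swaps it forward so the intersection vertices come out with a consistent winding. A standalone sketch of the predicate:

#include <opencv2/core.hpp>

// z-component of the cross product of (b - a) and (c - a):
// > 0 means c is counter-clockwise of edge a->b, < 0 means clockwise.
static double crossZ(const cv::Point2f& a, const cv::Point2f& b, const cv::Point2f& c)
{
    double dx1 = b.x - a.x, dy1 = b.y - a.y;
    double dx2 = c.x - a.x, dy2 = c.y - a.y;
    return dx1 * dy2 - dy1 * dx2;
}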
@ -66,8 +66,27 @@ private:
|
||||
void test7();
|
||||
void test8();
|
||||
void test9();
|
||||
void test10();
|
||||
void test11();
|
||||
void test12();
|
||||
void test13();
|
||||
void test14();
|
||||
};
|
||||
|
||||
static void compare(const std::vector<Point2f>& test, const std::vector<Point2f>& target)
|
||||
{
|
||||
ASSERT_EQ(test.size(), target.size());
|
||||
ASSERT_TRUE(test.size() < 4 || isContourConvex(test));
|
||||
ASSERT_TRUE(target.size() < 4 || isContourConvex(target));
|
||||
for( size_t i = 0; i < test.size(); i++ )
|
||||
{
|
||||
double dx = test[i].x - target[i].x;
|
||||
double dy = test[i].y - target[i].y;
|
||||
double r = sqrt(dx*dx + dy*dy);
|
||||
ASSERT_LT(r, ACCURACY);
|
||||
}
|
||||
}
|
||||
|
||||
void CV_RotatedRectangleIntersectionTest::run(int)
|
||||
{
|
||||
// See pics/intersection.png for the scenarios we are testing
|
||||
@ -92,28 +111,20 @@ void CV_RotatedRectangleIntersectionTest::run(int)
|
||||
test7();
|
||||
test8();
|
||||
test9();
|
||||
test10();
|
||||
test11();
|
||||
test12();
|
||||
test13();
|
||||
test14();
|
||||
}
|
||||
|
||||
void CV_RotatedRectangleIntersectionTest::test1()
|
||||
{
|
||||
// no intersection
|
||||
|
||||
RotatedRect rect1, rect2;
|
||||
|
||||
rect1.center.x = 0;
|
||||
rect1.center.y = 0;
|
||||
rect1.size.width = 2;
|
||||
rect1.size.height = 2;
|
||||
rect1.angle = 12.0f;
|
||||
|
||||
rect2.center.x = 10;
|
||||
rect2.center.y = 10;
|
||||
rect2.size.width = 2;
|
||||
rect2.size.height = 2;
|
||||
rect2.angle = 34.0f;
|
||||
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 12.0f);
|
||||
RotatedRect rect2(Point2f(10, 10), Size2f(2, 2), 34.0f);
|
||||
|
||||
vector<Point2f> vertices;
|
||||
|
||||
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
|
||||
|
||||
CV_Assert(ret == INTERSECT_NONE);
|
||||
@ -123,375 +134,243 @@ void CV_RotatedRectangleIntersectionTest::test1()
|
||||
void CV_RotatedRectangleIntersectionTest::test2()
|
||||
{
|
||||
// partial intersection, rectangles translated
|
||||
|
||||
RotatedRect rect1, rect2;
|
||||
|
||||
rect1.center.x = 0;
|
||||
rect1.center.y = 0;
|
||||
rect1.size.width = 2;
|
||||
rect1.size.height = 2;
|
||||
rect1.angle = 0;
|
||||
|
||||
rect2.center.x = 1;
|
||||
rect2.center.y = 1;
|
||||
rect2.size.width = 2;
|
||||
rect2.size.height = 2;
|
||||
rect2.angle = 0;
|
||||
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
|
||||
RotatedRect rect2(Point2f(1, 1), Size2f(2, 2), 0.0f);
|
||||
|
||||
vector<Point2f> vertices;
|
||||
|
||||
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
|
||||
|
||||
CV_Assert(ret == INTERSECT_PARTIAL);
|
||||
CV_Assert(vertices.size() == 4);
|
||||
|
||||
vector<Point2f> possibleVertices(4);
|
||||
|
||||
possibleVertices[0] = Point2f(0.0f, 0.0f);
|
||||
possibleVertices[1] = Point2f(1.0f, 1.0f);
|
||||
possibleVertices[2] = Point2f(0.0f, 1.0f);
|
||||
possibleVertices[3] = Point2f(1.0f, 0.0f);
|
||||
|
||||
for( size_t i = 0; i < vertices.size(); i++ )
|
||||
{
|
||||
double bestR = DBL_MAX;
|
||||
|
||||
for( size_t j = 0; j < possibleVertices.size(); j++ )
|
||||
{
|
||||
double dx = vertices[i].x - possibleVertices[j].x;
|
||||
double dy = vertices[i].y - possibleVertices[j].y;
|
||||
double r = sqrt(dx*dx + dy*dy);
|
||||
|
||||
bestR = std::min(bestR, r);
|
||||
}
|
||||
|
||||
CV_Assert(bestR < ACCURACY);
|
||||
}
|
||||
vector<Point2f> targetVertices(4);
|
||||
targetVertices[0] = Point2f(1.0f, 0.0f);
|
||||
targetVertices[1] = Point2f(1.0f, 1.0f);
|
||||
targetVertices[2] = Point2f(0.0f, 1.0f);
|
||||
targetVertices[3] = Point2f(0.0f, 0.0f);
|
||||
compare(vertices, targetVertices);
|
||||
}
|
||||
|
||||
void CV_RotatedRectangleIntersectionTest::test3()
|
||||
{
|
||||
// partial intersection, rectangles rotated 45 degree on the corner, forms a triangle intersection
|
||||
RotatedRect rect1, rect2;
|
||||
|
||||
rect1.center.x = 0;
|
||||
rect1.center.y = 0;
|
||||
rect1.size.width = 2;
|
||||
rect1.size.height = 2;
|
||||
rect1.angle = 0;
|
||||
|
||||
rect2.center.x = 1;
|
||||
rect2.center.y = 1;
|
||||
rect2.size.width = sqrt(2.0f);
|
||||
rect2.size.height = 20;
|
||||
rect2.angle = 45.0f;
|
||||
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
|
||||
RotatedRect rect2(Point2f(1, 1), Size2f(sqrt(2.0f), 20), 45.0f);
|
||||
|
||||
vector<Point2f> vertices;
|
||||
|
||||
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
|
||||
|
||||
CV_Assert(ret == INTERSECT_PARTIAL);
|
||||
CV_Assert(vertices.size() == 3);
|
||||
|
||||
vector<Point2f> possibleVertices(3);
|
||||
|
||||
possibleVertices[0] = Point2f(1.0f, 1.0f);
|
||||
possibleVertices[1] = Point2f(0.0f, 1.0f);
|
||||
possibleVertices[2] = Point2f(1.0f, 0.0f);
|
||||
|
||||
for( size_t i = 0; i < vertices.size(); i++ )
|
||||
{
|
||||
double bestR = DBL_MAX;
|
||||
|
||||
for( size_t j = 0; j < possibleVertices.size(); j++ )
|
||||
{
|
||||
double dx = vertices[i].x - possibleVertices[j].x;
|
||||
double dy = vertices[i].y - possibleVertices[j].y;
|
||||
double r = sqrt(dx*dx + dy*dy);
|
||||
|
||||
bestR = std::min(bestR, r);
|
||||
}
|
||||
|
||||
CV_Assert(bestR < ACCURACY);
|
||||
}
|
||||
vector<Point2f> targetVertices(3);
|
||||
targetVertices[0] = Point2f(1.0f, 0.0f);
|
||||
targetVertices[1] = Point2f(1.0f, 1.0f);
|
||||
targetVertices[2] = Point2f(0.0f, 1.0f);
|
||||
compare(vertices, targetVertices);
|
||||
}
|
||||
|
||||
void CV_RotatedRectangleIntersectionTest::test4()
|
||||
{
|
||||
// full intersection, rectangles of same size directly on top of each other
|
||||
|
||||
RotatedRect rect1, rect2;
|
||||
|
||||
rect1.center.x = 0;
|
||||
rect1.center.y = 0;
|
||||
rect1.size.width = 2;
|
||||
rect1.size.height = 2;
|
||||
rect1.angle = 0;
|
||||
|
||||
rect2.center.x = 0;
|
||||
rect2.center.y = 0;
|
||||
rect2.size.width = 2;
|
||||
rect2.size.height = 2;
|
||||
rect2.angle = 0;
|
||||
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
|
||||
RotatedRect rect2(Point2f(0, 0), Size2f(2, 2), 0.0f);
|
||||
|
||||
vector<Point2f> vertices;
|
||||
|
||||
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
|
||||
|
||||
CV_Assert(ret == INTERSECT_FULL);
|
||||
CV_Assert(vertices.size() == 4);
|
||||
|
||||
vector<Point2f> possibleVertices(4);
|
||||
|
||||
possibleVertices[0] = Point2f(-1.0f, 1.0f);
|
||||
possibleVertices[1] = Point2f(1.0f, -1.0f);
|
||||
possibleVertices[2] = Point2f(-1.0f, -1.0f);
|
||||
possibleVertices[3] = Point2f(1.0f, 1.0f);
|
||||
|
||||
for( size_t i = 0; i < vertices.size(); i++ )
|
||||
{
|
||||
double bestR = DBL_MAX;
|
||||
|
||||
for( size_t j = 0; j < possibleVertices.size(); j++ )
|
||||
{
|
||||
double dx = vertices[i].x - possibleVertices[j].x;
|
||||
double dy = vertices[i].y - possibleVertices[j].y;
|
||||
double r = sqrt(dx*dx + dy*dy);
|
||||
|
||||
bestR = std::min(bestR, r);
|
||||
}
|
||||
|
||||
CV_Assert(bestR < ACCURACY);
|
||||
}
|
||||
vector<Point2f> targetVertices(4);
|
||||
targetVertices[0] = Point2f(-1.0f, 1.0f);
|
||||
targetVertices[1] = Point2f(-1.0f, -1.0f);
|
||||
targetVertices[2] = Point2f(1.0f, -1.0f);
|
||||
targetVertices[3] = Point2f(1.0f, 1.0f);
|
||||
compare(vertices, targetVertices);
|
||||
}
|
||||
|
||||
void CV_RotatedRectangleIntersectionTest::test5()
|
||||
{
|
||||
// partial intersection, rectangle on top rotated 45 degrees
|
||||
|
||||
RotatedRect rect1, rect2;
|
||||
|
||||
rect1.center.x = 0;
|
||||
rect1.center.y = 0;
|
||||
rect1.size.width = 2;
|
||||
rect1.size.height = 2;
|
||||
rect1.angle = 0;
|
||||
|
||||
rect2.center.x = 0;
|
||||
rect2.center.y = 0;
|
||||
rect2.size.width = 2;
|
||||
rect2.size.height = 2;
|
||||
rect2.angle = 45.0f;
|
||||
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
|
||||
RotatedRect rect2(Point2f(0, 0), Size2f(2, 2), 45.0f);
|
||||
|
||||
vector<Point2f> vertices;
|
||||
|
||||
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
|
||||
|
||||
CV_Assert(ret == INTERSECT_PARTIAL);
|
||||
CV_Assert(vertices.size() == 8);
|
||||
|
||||
vector<Point2f> possibleVertices(8);
|
||||
|
||||
possibleVertices[0] = Point2f(-1.0f, -0.414214f);
|
||||
possibleVertices[1] = Point2f(-1.0f, 0.414214f);
|
||||
possibleVertices[2] = Point2f(-0.414214f, -1.0f);
|
||||
possibleVertices[3] = Point2f(0.414214f, -1.0f);
|
||||
possibleVertices[4] = Point2f(1.0f, -0.414214f);
|
||||
possibleVertices[5] = Point2f(1.0f, 0.414214f);
|
||||
possibleVertices[6] = Point2f(0.414214f, 1.0f);
|
||||
possibleVertices[7] = Point2f(-0.414214f, 1.0f);
|
||||
|
||||
for( size_t i = 0; i < vertices.size(); i++ )
|
||||
{
|
||||
double bestR = DBL_MAX;
|
||||
|
||||
for( size_t j = 0; j < possibleVertices.size(); j++ )
|
||||
{
|
||||
double dx = vertices[i].x - possibleVertices[j].x;
|
||||
double dy = vertices[i].y - possibleVertices[j].y;
|
||||
double r = sqrt(dx*dx + dy*dy);
|
||||
|
||||
bestR = std::min(bestR, r);
|
||||
}
|
||||
|
||||
CV_Assert(bestR < ACCURACY);
|
||||
}
|
||||
vector<Point2f> targetVertices(8);
|
||||
targetVertices[0] = Point2f(-1.0f, -0.414214f);
|
||||
targetVertices[1] = Point2f(-0.414214f, -1.0f);
|
||||
targetVertices[2] = Point2f(0.414214f, -1.0f);
|
||||
targetVertices[3] = Point2f(1.0f, -0.414214f);
|
||||
targetVertices[4] = Point2f(1.0f, 0.414214f);
|
||||
targetVertices[5] = Point2f(0.414214f, 1.0f);
|
||||
targetVertices[6] = Point2f(-0.414214f, 1.0f);
|
||||
targetVertices[7] = Point2f(-1.0f, 0.414214f);
|
||||
compare(vertices, targetVertices);
|
||||
}
|
||||
|
||||
void CV_RotatedRectangleIntersectionTest::test6()
|
||||
{
|
||||
// 6 - partial intersection, rectangle on top of different size
|
||||
|
||||
RotatedRect rect1, rect2;
|
||||
|
||||
rect1.center.x = 0;
|
||||
rect1.center.y = 0;
|
||||
rect1.size.width = 2;
|
||||
rect1.size.height = 2;
|
||||
rect1.angle = 0;
|
||||
|
||||
rect2.center.x = 0;
|
||||
rect2.center.y = 0;
|
||||
rect2.size.width = 2;
|
||||
rect2.size.height = 10;
|
||||
rect2.angle = 0;
|
||||
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
|
||||
RotatedRect rect2(Point2f(0, 0), Size2f(2, 10), 0.0f);
|
||||
|
||||
vector<Point2f> vertices;
|
||||
|
||||
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
|
||||
|
||||
CV_Assert(ret == INTERSECT_PARTIAL);
|
||||
CV_Assert(vertices.size() == 4);
|
||||
|
||||
vector<Point2f> possibleVertices(4);
|
||||
|
||||
possibleVertices[0] = Point2f(1.0f, 1.0f);
|
||||
possibleVertices[1] = Point2f(1.0f, -1.0f);
|
||||
possibleVertices[2] = Point2f(-1.0f, -1.0f);
|
||||
possibleVertices[3] = Point2f(-1.0f, 1.0f);
|
||||
|
||||
for( size_t i = 0; i < vertices.size(); i++ )
|
||||
{
|
||||
double bestR = DBL_MAX;
|
||||
|
||||
for( size_t j = 0; j < possibleVertices.size(); j++ )
|
||||
{
|
||||
double dx = vertices[i].x - possibleVertices[j].x;
|
||||
double dy = vertices[i].y - possibleVertices[j].y;
|
||||
double r = sqrt(dx*dx + dy*dy);
|
||||
|
||||
bestR = std::min(bestR, r);
|
||||
}
|
||||
|
||||
CV_Assert(bestR < ACCURACY);
|
||||
}
|
||||
vector<Point2f> targetVertices(4);
|
||||
targetVertices[0] = Point2f(-1.0f, -1.0f);
|
||||
targetVertices[1] = Point2f(1.0f, -1.0f);
|
||||
targetVertices[2] = Point2f(1.0f, 1.0f);
|
||||
targetVertices[3] = Point2f(-1.0f, 1.0f);
|
||||
compare(vertices, targetVertices);
|
||||
}
|
||||
|
||||
void CV_RotatedRectangleIntersectionTest::test7()
|
||||
{
|
||||
// full intersection, rectangle fully enclosed in the other
|
||||
|
||||
RotatedRect rect1, rect2;
|
||||
|
||||
rect1.center.x = 0;
|
||||
rect1.center.y = 0;
|
||||
rect1.size.width = 12.34f;
|
||||
rect1.size.height = 56.78f;
|
||||
rect1.angle = 0;
|
||||
|
||||
rect2.center.x = 0;
|
||||
rect2.center.y = 0;
|
||||
rect2.size.width = 2;
|
||||
rect2.size.height = 2;
|
||||
rect2.angle = 0;
|
||||
RotatedRect rect1(Point2f(0, 0), Size2f(12.34f, 56.78f), 0.0f);
|
||||
RotatedRect rect2(Point2f(0, 0), Size2f(2, 2), 0.0f);
|
||||
|
||||
vector<Point2f> vertices;
|
||||
|
||||
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
|
||||
|
||||
CV_Assert(ret == INTERSECT_FULL);
|
||||
CV_Assert(vertices.size() == 4);
|
||||
|
||||
vector<Point2f> possibleVertices(4);
|
||||
|
||||
possibleVertices[0] = Point2f(1.0f, 1.0f);
|
||||
possibleVertices[1] = Point2f(1.0f, -1.0f);
|
||||
possibleVertices[2] = Point2f(-1.0f, -1.0f);
|
||||
possibleVertices[3] = Point2f(-1.0f, 1.0f);
|
||||
|
||||
for( size_t i = 0; i < vertices.size(); i++ )
|
||||
{
|
||||
double bestR = DBL_MAX;
|
||||
|
||||
for( size_t j = 0; j < possibleVertices.size(); j++ )
|
||||
{
|
||||
double dx = vertices[i].x - possibleVertices[j].x;
|
||||
double dy = vertices[i].y - possibleVertices[j].y;
|
||||
double r = sqrt(dx*dx + dy*dy);
|
||||
|
||||
bestR = std::min(bestR, r);
|
||||
}
|
||||
|
||||
CV_Assert(bestR < ACCURACY);
|
||||
}
|
||||
vector<Point2f> targetVertices(4);
|
||||
targetVertices[0] = Point2f(-1.0f, 1.0f);
|
||||
targetVertices[1] = Point2f(-1.0f, -1.0f);
|
||||
targetVertices[2] = Point2f(1.0f, -1.0f);
|
||||
targetVertices[3] = Point2f(1.0f, 1.0f);
|
||||
compare(vertices, targetVertices);
|
||||
}
|
||||
|
||||
void CV_RotatedRectangleIntersectionTest::test8()
|
||||
{
|
||||
// full intersection, rectangle fully enclosed in the other
|
||||
|
||||
RotatedRect rect1, rect2;
|
||||
|
||||
rect1.center.x = 0;
|
||||
rect1.center.y = 0;
|
||||
rect1.size.width = 2;
|
||||
rect1.size.height = 2;
|
||||
rect1.angle = 0;
|
||||
|
||||
rect2.center.x = 2;
|
||||
rect2.center.y = 2;
|
||||
rect2.size.width = 2;
|
||||
rect2.size.height = 2;
|
||||
rect2.angle = 0;
|
||||
// intersection by a single vertex
|
||||
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
|
||||
RotatedRect rect2(Point2f(2, 2), Size2f(2, 2), 0.0f);
|
||||
|
||||
vector<Point2f> vertices;
|
||||
|
||||
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
|
||||
|
||||
CV_Assert(ret == INTERSECT_PARTIAL);
|
||||
CV_Assert(vertices.size() == 1);
|
||||
|
||||
double dx = vertices[0].x - 1;
|
||||
double dy = vertices[0].y - 1;
|
||||
double r = sqrt(dx*dx + dy*dy);
|
||||
|
||||
CV_Assert(r < ACCURACY);
|
||||
compare(vertices, vector<Point2f>(1, Point2f(1.0f, 1.0f)));
|
||||
}
|
||||
|
||||
void CV_RotatedRectangleIntersectionTest::test9()
|
||||
{
|
||||
// full intersection, rectangle fully enclosed in the other
|
||||
|
||||
RotatedRect rect1, rect2;
|
||||
|
||||
rect1.center.x = 0;
|
||||
rect1.center.y = 0;
|
||||
rect1.size.width = 2;
|
||||
rect1.size.height = 2;
|
||||
rect1.angle = 0;
|
||||
|
||||
rect2.center.x = 2;
|
||||
rect2.center.y = 0;
|
||||
rect2.size.width = 2;
|
||||
rect2.size.height = 123.45f;
|
||||
rect2.angle = 0;
|
||||
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
|
||||
RotatedRect rect2(Point2f(2, 0), Size2f(2, 123.45f), 0.0f);
|
||||
|
||||
vector<Point2f> vertices;
|
||||
|
||||
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
|
||||
|
||||
CV_Assert(ret == INTERSECT_PARTIAL);
|
||||
    CV_Assert(vertices.size() == 2);

    vector<Point2f> possibleVertices(2);
    possibleVertices[0] = Point2f(1.0f, 1.0f);
    possibleVertices[1] = Point2f(1.0f, -1.0f);
    for( size_t i = 0; i < vertices.size(); i++ )
    {
        double bestR = DBL_MAX;
        for( size_t j = 0; j < possibleVertices.size(); j++ )
        {
            double dx = vertices[i].x - possibleVertices[j].x;
            double dy = vertices[i].y - possibleVertices[j].y;
            double r = sqrt(dx*dx + dy*dy);
            bestR = std::min(bestR, r);
        }
        CV_Assert(bestR < ACCURACY);
    }
    vector<Point2f> targetVertices(2);
    targetVertices[0] = Point2f(1.0f, -1.0f);
    targetVertices[1] = Point2f(1.0f, 1.0f);
    compare(vertices, targetVertices);
}

void CV_RotatedRectangleIntersectionTest::test10()
{
    // three points of rect2 are inside rect1.
    RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
    RotatedRect rect2(Point2f(0, 0.5), Size2f(1, 1), 45.0f);

    vector<Point2f> vertices;
    int ret = rotatedRectangleIntersection(rect1, rect2, vertices);

    CV_Assert(ret == INTERSECT_PARTIAL);

    vector<Point2f> targetVertices(5);
    targetVertices[0] = Point2f(0.207107f, 1.0f);
    targetVertices[1] = Point2f(-0.207107f, 1.0f);
    targetVertices[2] = Point2f(-0.707107f, 0.5f);
    targetVertices[3] = Point2f(0.0f, -0.207107f);
    targetVertices[4] = Point2f(0.707107f, 0.5f);
    compare(vertices, targetVertices);
}

void CV_RotatedRectangleIntersectionTest::test11()
{
    RotatedRect rect1(Point2f(0, 0), Size2f(4, 2), 0.0f);
    RotatedRect rect2(Point2f(0, 0), Size2f(2, 2), -45.0f);

    vector<Point2f> vertices;
    int ret = rotatedRectangleIntersection(rect1, rect2, vertices);

    CV_Assert(ret == INTERSECT_PARTIAL);

    vector<Point2f> targetVertices(6);
    targetVertices[0] = Point2f(-0.414214f, -1.0f);
    targetVertices[1] = Point2f(0.414213f, -1.0f);
    targetVertices[2] = Point2f(1.41421f, 0.0f);
    targetVertices[3] = Point2f(0.414214f, 1.0f);
    targetVertices[4] = Point2f(-0.414213f, 1.0f);
    targetVertices[5] = Point2f(-1.41421f, 0.0f);
    compare(vertices, targetVertices);
}

void CV_RotatedRectangleIntersectionTest::test12()
{
    RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
    RotatedRect rect2(Point2f(0, 1), Size2f(1, 1), 0.0f);

    vector<Point2f> vertices;
    int ret = rotatedRectangleIntersection(rect1, rect2, vertices);

    CV_Assert(ret == INTERSECT_PARTIAL);

    vector<Point2f> targetVertices(4);
    targetVertices[0] = Point2f(-0.5f, 1.0f);
    targetVertices[1] = Point2f(-0.5f, 0.5f);
    targetVertices[2] = Point2f(0.5f, 0.5f);
    targetVertices[3] = Point2f(0.5f, 1.0f);
    compare(vertices, targetVertices);
}

void CV_RotatedRectangleIntersectionTest::test13()
{
    RotatedRect rect1(Point2f(0, 0), Size2f(1, 3), 0.0f);
    RotatedRect rect2(Point2f(0, 1), Size2f(3, 1), 0.0f);

    vector<Point2f> vertices;
    int ret = rotatedRectangleIntersection(rect1, rect2, vertices);

    CV_Assert(ret == INTERSECT_PARTIAL);

    vector<Point2f> targetVertices(4);
    targetVertices[0] = Point2f(-0.5f, 0.5f);
    targetVertices[1] = Point2f(0.5f, 0.5f);
    targetVertices[2] = Point2f(0.5f, 1.5f);
    targetVertices[3] = Point2f(-0.5f, 1.5f);
    compare(vertices, targetVertices);
}

void CV_RotatedRectangleIntersectionTest::test14()
{
    const int kNumTests = 100;
    const int kWidth = 5;
    const int kHeight = 5;
    RotatedRect rects[2];
    std::vector<Point2f> inter;
    for (int i = 0; i < kNumTests; ++i)
    {
        for (int j = 0; j < 2; ++j)
        {
            rects[j].center = Point2f((float)(rand() % kWidth), (float)(rand() % kHeight));
            rects[j].size = Size2f(rand() % kWidth + 1.0f, rand() % kHeight + 1.0f);
            rects[j].angle = (float)(rand() % 360);
        }
        rotatedRectangleIntersection(rects[0], rects[1], inter);
        ASSERT_TRUE(inter.size() < 4 || isContourConvex(inter));
    }
}
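The rewritten tests above delegate the vertex check to a shared compare() helper instead of repeating the nearest-vertex loop in every test. A minimal sketch of such a helper, assuming the shape below (the real one is defined earlier in test_intersection.cpp and may additionally check convexity or vertex order; ACCURACY is the file's existing tolerance constant):

    static void compare(const std::vector<Point2f>& test, const std::vector<Point2f>& target)
    {
        ASSERT_EQ(test.size(), target.size());
        for (size_t i = 0; i < test.size(); i++)
        {
            // distance from each computed vertex to its closest expected vertex
            double bestR = DBL_MAX;
            for (size_t j = 0; j < target.size(); j++)
            {
                double dx = test[i].x - target[j].x;
                double dy = test[i].y - target[j].y;
                bestR = std::min(bestR, sqrt(dx*dx + dy*dy));
            }
            ASSERT_LT(bestR, ACCURACY);
        }
    }

This mirrors the removed inline loop, so the helper keeps the old tolerance semantics while each test reduces to expected data plus one call.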
@ -420,4 +420,18 @@ void CV_ThreshTest::prepare_to_validation( int /*test_case_idx*/ )
TEST(Imgproc_Threshold, accuracy) { CV_ThreshTest test; test.safe_run(); }

BIGDATA_TEST(Imgproc_Threshold, huge)
{
    Mat m(65000, 40000, CV_8U);
    ASSERT_FALSE(m.isContinuous());

    uint64 i, n = (uint64)m.rows*m.cols;
    for( i = 0; i < n; i++ )
        m.data[i] = (uchar)(i & 255);

    cv::threshold(m, m, 127, 255, cv::THRESH_BINARY);
    int nz = cv::countNonZero(m);  // FIXIT 'int' is not enough here (overflow is possible with other inputs)
    ASSERT_EQ((uint64)nz, n / 2);
}

}} // namespace
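Two details make this test correct as written: the buffer is 65000 × 40000 = 2,600,000,000 bytes (about 2.42 GiB), which is why it uses BIGDATA_TEST and is only meaningful on 64-bit targets; and the fill pattern (i & 255) cycles through 0..255, of which exactly the 128 values above the threshold 127 survive THRESH_BINARY, so with n divisible by 256 the non-zero count is exactly n / 2.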
@ -251,13 +251,15 @@ void Cloning::initVariables(const Mat &destination, const Mat &binaryMask)
    //init of the filters used in the dst
    const int w = destination.cols;
    filter_X.resize(w - 2);
    double scale = CV_PI / (w - 1);
    for(int i = 0 ; i < w-2 ; ++i)
        filter_X[i] = 2.0f * std::cos(static_cast<float>(CV_PI) * (i + 1) / (w - 1));
        filter_X[i] = 2.0f * (float)std::cos(scale * (i + 1));

    const int h = destination.rows;
    filter_Y.resize(h - 2);
    scale = CV_PI / (h - 1);
    for(int j = 0 ; j < h - 2 ; ++j)
        filter_Y[j] = 2.0f * std::cos(static_cast<float>(CV_PI) * (j + 1) / (h - 1));
        filter_Y[j] = 2.0f * (float)std::cos(scale * (j + 1));
}

void Cloning::computeDerivatives(const Mat& destination, const Mat &patch, const Mat &binaryMask)
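The refactored loops hoist the loop-invariant scale = CV_PI / (w - 1) and evaluate cos() in double before a single cast to float, instead of folding a float-truncated CV_PI into every iteration. A small standalone sketch (illustrative, not part of the patch; w = 640 is an arbitrary example size) showing the two forms agree to within float rounding:

    #include <opencv2/core.hpp> // CV_PI
    #include <cmath>
    #include <cstdio>

    int main()
    {
        const int w = 640;
        const double scale = CV_PI / (w - 1);
        for (int i = 0; i < w - 2; ++i)
        {
            // old form: float-precision CV_PI, multiply/divide every iteration
            float oldVal = 2.0f * std::cos(static_cast<float>(CV_PI) * (i + 1) / (w - 1));
            // new form: double-precision cos of a precomputed scale
            float newVal = 2.0f * (float)std::cos(scale * (i + 1));
            if (std::fabs(oldVal - newVal) > 1e-5f)
                std::printf("mismatch at %d: %g vs %g\n", i, oldVal, newVal);
        }
        return 0;
    }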
@ -53,7 +53,7 @@ namespace opencv_test { namespace {
#define SAVE(x)
#endif

static const double numerical_precision = 1000.;
static const double numerical_precision = 0.05; // 95% of pixels should have exact values

TEST(Photo_SeamlessClone_normal, regression)
{
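The old check compared the total L1 error of the whole image against a flat 1000, which scales badly with image size; the new pair of checks bounds the worst single deviation (NORM_INF ≤ 1) and the average error (NORM_L1 ≤ total() * 0.05, i.e. 0.05 per pixel, matching the "95% of pixels exact" comment). A toy illustration of the two norms (hypothetical example, not part of the test suite):

    #include <opencv2/core.hpp>
    #include <iostream>

    int main()
    {
        cv::Mat reference = cv::Mat::zeros(10, 10, CV_8UC3);
        cv::Mat result = reference.clone();
        result.at<cv::Vec3b>(0, 0)[0] = 1; // one channel of one pixel off by 1

        double errorINF = cv::norm(reference, result, cv::NORM_INF); // == 1
        double errorL1  = cv::norm(reference, result, cv::NORM_L1);  // == 1
        // L1 budget: 100 pixels * 0.05 = 5, so this result passes both checks
        std::cout << errorINF << " " << errorL1 << " "
                  << (errorL1 <= reference.total() * 0.05) << std::endl;
        return 0;
    }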
@ -82,8 +82,10 @@ TEST(Photo_SeamlessClone_normal, regression)
    SAVE(result);

    double error = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(error, numerical_precision);
    double errorINF = cvtest::norm(reference, result, NORM_INF);
    EXPECT_LE(errorINF, 1);
    double errorL1 = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size();
}

TEST(Photo_SeamlessClone_mixed, regression)
@ -113,9 +115,10 @@ TEST(Photo_SeamlessClone_mixed, regression)
    Mat reference = imread(reference_path);
    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;

    double error = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(error, numerical_precision);
    double errorINF = cvtest::norm(reference, result, NORM_INF);
    EXPECT_LE(errorINF, 1);
    double errorL1 = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size();
}

TEST(Photo_SeamlessClone_featureExchange, regression)
@ -145,9 +148,10 @@ TEST(Photo_SeamlessClone_featureExchange, regression)
    Mat reference = imread(reference_path);
    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;

    double error = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(error, numerical_precision);
    double errorINF = cvtest::norm(reference, result, NORM_INF);
    EXPECT_LE(errorINF, 1);
    double errorL1 = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size();
}

TEST(Photo_SeamlessClone_colorChange, regression)
@ -171,9 +175,10 @@ TEST(Photo_SeamlessClone_colorChange, regression)
    Mat reference = imread(reference_path);
    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;

    double error = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(error, numerical_precision);
    double errorINF = cvtest::norm(reference, result, NORM_INF);
    EXPECT_LE(errorINF, 1);
    double errorL1 = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size();
}

TEST(Photo_SeamlessClone_illuminationChange, regression)
@ -195,9 +200,12 @@ TEST(Photo_SeamlessClone_illuminationChange, regression)
    SAVE(result);

    Mat reference = imread(reference_path);
    double error = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(error, numerical_precision);
    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;

    double errorINF = cvtest::norm(reference, result, NORM_INF);
    EXPECT_LE(errorINF, 1);
    double errorL1 = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size();
}

TEST(Photo_SeamlessClone_textureFlattening, regression)
@ -221,9 +229,10 @@ TEST(Photo_SeamlessClone_textureFlattening, regression)
    Mat reference = imread(reference_path);
    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;

    double error = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(error, numerical_precision);
    double errorINF = cvtest::norm(reference, result, NORM_INF);
    EXPECT_LE(errorINF, 1);
    double errorL1 = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size();
}

}} // namespace
@ -661,7 +661,7 @@ void MultiBandBlender::blend(InputOutputArray dst, InputOutputArray dst_mask)
    }

    // Set destination Mats to 0 so new image can be blended
    for (size_t i = 0; i < num_bands_ + 1; ++i)
    for (size_t i = 0; i < (size_t)(num_bands_ + 1); ++i)
    {
        gpu_dst_band_weights_[i].setTo(0);
        gpu_dst_pyr_laplace_[i].setTo(Scalar::all(0));
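The only change here is the (size_t) cast on the loop bound: num_bands_ is a signed int (an assumption based on the surrounding blender code), so comparing the size_t counter i against num_bands_ + 1 mixes signed and unsigned operands and trips sign-compare warnings; casting the bound keeps the loop's behaviour while making the comparison uniformly unsigned.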
@ -11,6 +11,7 @@
namespace cvtest {
void checkIppStatus();
extern bool skipUnstableTests;
extern bool runBigDataTests;
extern int testThreads;
}
@ -43,7 +44,7 @@ extern int testThreads;
#undef TEST
#define TEST(test_case_name, test_name) \
#define TEST_(test_case_name, test_name, BODY_IMPL) \
class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public ::testing::Test {\
 public:\
  GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\
@ -65,9 +66,37 @@ extern int testThreads;
          ::testing::Test::TearDownTestCase, \
          new ::testing::internal::TestFactoryImpl<\
              GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\
void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() CV__TEST_BODY_IMPL( #test_case_name "_" #test_name ) \
void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() BODY_IMPL( #test_case_name "_" #test_name ) \
void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::Body()

#define TEST(test_case_name, test_name) TEST_(test_case_name, test_name, CV__TEST_BODY_IMPL)

#define CV__TEST_BIGDATA_BODY_IMPL(name) \
{ \
    if (!cvtest::runBigDataTests) \
    { \
        printf("[     SKIP ] BigData tests are disabled\n"); \
        return; \
    } \
    CV__TRACE_APP_FUNCTION_NAME(name); \
    try { \
        CV__TEST_INIT \
        Body(); \
        CV__TEST_CLEANUP \
    } \
    catch (cvtest::SkipTestException& e) \
    { \
        printf("[     SKIP ] %s\n", e.what()); \
    } \
} \

// Special type of tests which require / use or validate processing of huge amount of data (>= 2Gb)
#if defined(_M_X64) || defined(__x86_64__) || defined(__aarch64__)
#define BIGDATA_TEST(test_case_name, test_name) TEST_(BigData_ ## test_case_name, test_name, CV__TEST_BIGDATA_BODY_IMPL)
#else
#define BIGDATA_TEST(test_case_name, test_name) TEST_(BigData_ ## test_case_name, DISABLED_ ## test_name, CV__TEST_BIGDATA_BODY_IMPL)
#endif

#undef TEST_F
#define TEST_F(test_fixture, test_name)\
class GTEST_TEST_CLASS_NAME_(test_fixture, test_name) : public test_fixture {\
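With these macros in place, a data-heavy test opts in by writing BIGDATA_TEST instead of TEST. A hypothetical usage sketch (assumes the usual OpenCV test module context, i.e. test_precomp.hpp with gtest available):

    // Compiled like a normal test, but CV__TEST_BIGDATA_BODY_IMPL returns early
    // unless BigData tests were enabled on the command line; on non-64-bit
    // targets the macro registers it under a DISABLED_ name instead.
    BIGDATA_TEST(Core_Example, huge_alloc)
    {
        cv::Mat m(65000, 40000, CV_8U); // ~2.4 GiB, needs a 64-bit process
        ASSERT_EQ(65000, m.rows);
    }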
@ -699,6 +699,7 @@ void checkIppStatus()
}

bool skipUnstableTests = false;
bool runBigDataTests = false;
int testThreads = 0;

void parseCustomOptions(int argc, char **argv)
@ -708,6 +709,7 @@ void parseCustomOptions(int argc, char **argv)
        "{ test_seed     |809564     |seed for random numbers generator }"
        "{ test_threads  |-1         |the number of worker threads, if parallel execution is enabled}"
        "{ skip_unstable |false      |skip unstable tests }"
        "{ test_bigdata  |false      |run BigData tests (>=2Gb) }"
        "{ h   help      |false      |print help info }";

    cv::CommandLineParser parser(argc, argv, command_line_keys);
@ -730,6 +732,7 @@ void parseCustomOptions(int argc, char **argv)
    testThreads = parser.get<int>("test_threads");

    skipUnstableTests = parser.get<bool>("skip_unstable");
    runBigDataTests = parser.get<bool>("test_bigdata");
}
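Together with the parsing above, BigData tests stay skipped by default and are enabled per run via the new key, e.g. opencv_test_imgproc --test_bigdata=true (the binary name is illustrative; every module's test executable shares these options).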