Merge remote-tracking branch 'upstream/3.4' into merge-3.4

This commit is contained in:
Alexander Alekhin 2018-05-21 16:20:14 +03:00
commit db88cd1b25
117 changed files with 4451 additions and 1135 deletions

View File

@ -2,37 +2,37 @@ function(download_ippicv root_var)
set(${root_var} "" PARENT_SCOPE) set(${root_var} "" PARENT_SCOPE)
# Commit SHA in the opencv_3rdparty repo # Commit SHA in the opencv_3rdparty repo
set(IPPICV_COMMIT "dfe3162c237af211e98b8960018b564bc209261d") set(IPPICV_COMMIT "bdb7bb85f34a8cb0d35e40a81f58da431aa1557a")
# Define actual ICV versions # Define actual ICV versions
if(APPLE) if(APPLE)
set(OPENCV_ICV_PLATFORM "macosx") set(OPENCV_ICV_PLATFORM "macosx")
set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_mac") set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_mac")
if(X86_64) if(X86_64)
set(OPENCV_ICV_NAME "ippicv_2017u3_mac_intel64_general_20170822.tgz") set(OPENCV_ICV_NAME "ippicv_2017u3_mac_intel64_general_20180518.tgz")
set(OPENCV_ICV_HASH "c1ebb5dfa5b7f54b0c44e1917805a463") set(OPENCV_ICV_HASH "3ae52b9be0fe73dd45bc5e9429cd3732")
else() else()
set(OPENCV_ICV_NAME "ippicv_2017u3_mac_ia32_general_20170822.tgz") set(OPENCV_ICV_NAME "ippicv_2017u3_mac_ia32_general_20180518.tgz")
set(OPENCV_ICV_HASH "49b05a669042753ae75895a445ebd612") set(OPENCV_ICV_HASH "698660b975b62bee3ef6c5af51e97544")
endif() endif()
elseif((UNIX AND NOT ANDROID) OR (UNIX AND ANDROID_ABI MATCHES "x86")) elseif((UNIX AND NOT ANDROID) OR (UNIX AND ANDROID_ABI MATCHES "x86"))
set(OPENCV_ICV_PLATFORM "linux") set(OPENCV_ICV_PLATFORM "linux")
set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_lnx") set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_lnx")
if(X86_64) if(X86_64)
set(OPENCV_ICV_NAME "ippicv_2017u3_lnx_intel64_general_20170822.tgz") set(OPENCV_ICV_NAME "ippicv_2017u3_lnx_intel64_general_20180518.tgz")
set(OPENCV_ICV_HASH "4e0352ce96473837b1d671ce87f17359") set(OPENCV_ICV_HASH "b7cc351267db2d34b9efa1cd22ff0572")
else() else()
set(OPENCV_ICV_NAME "ippicv_2017u3_lnx_ia32_general_20170822.tgz") set(OPENCV_ICV_NAME "ippicv_2017u3_lnx_ia32_general_20180518.tgz")
set(OPENCV_ICV_HASH "dcdb0ba4b123f240596db1840cd59a76") set(OPENCV_ICV_HASH "ea72de74dae3c604eb6348395366e78e")
endif() endif()
elseif(WIN32 AND NOT ARM) elseif(WIN32 AND NOT ARM)
set(OPENCV_ICV_PLATFORM "windows") set(OPENCV_ICV_PLATFORM "windows")
set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_win") set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_win")
if(X86_64) if(X86_64)
set(OPENCV_ICV_NAME "ippicv_2017u3_win_intel64_general_20170822.zip") set(OPENCV_ICV_NAME "ippicv_2017u3_win_intel64_general_20180518.zip")
set(OPENCV_ICV_HASH "0421e642bc7ad741a2236d3ec4190bdd") set(OPENCV_ICV_HASH "915ff92958089ede8ea532d3c4fe7187")
else() else()
set(OPENCV_ICV_NAME "ippicv_2017u3_win_ia32_general_20170822.zip") set(OPENCV_ICV_NAME "ippicv_2017u3_win_ia32_general_20180518.zip")
set(OPENCV_ICV_HASH "8a7680ae352c192de2e2e34936164bd0") set(OPENCV_ICV_HASH "928168c2d99ab284047dfcfb7a821d91")
endif() endif()
else() else()
return() return()

View File

@ -32,11 +32,11 @@ Unspecified error: Can't create layer "layer_name" of type "MyType" in function
To import the model correctly you have to derive a class from cv::dnn::Layer with To import the model correctly you have to derive a class from cv::dnn::Layer with
the following methods: the following methods:
@snippet dnn/custom_layers.cpp A custom layer interface @snippet dnn/custom_layers.hpp A custom layer interface
And register it before the import: And register it before the import:
@snippet dnn/custom_layers.cpp Register a custom layer @snippet dnn/custom_layers.hpp Register a custom layer
@note `MyType` is a type of unimplemented layer from the thrown exception. @note `MyType` is a type of unimplemented layer from the thrown exception.
@ -44,27 +44,27 @@ Let's see what all the methods do:
- Constructor - Constructor
@snippet dnn/custom_layers.cpp MyLayer::MyLayer @snippet dnn/custom_layers.hpp MyLayer::MyLayer
Retrieves hyper-parameters from cv::dnn::LayerParams. If your layer has trainable Retrieves hyper-parameters from cv::dnn::LayerParams. If your layer has trainable
weights they will be already stored in the Layer's member cv::dnn::Layer::blobs. weights they will be already stored in the Layer's member cv::dnn::Layer::blobs.
- A static method `create` - A static method `create`
@snippet dnn/custom_layers.cpp MyLayer::create @snippet dnn/custom_layers.hpp MyLayer::create
This method should create an instance of you layer and return cv::Ptr with it. This method should create an instance of you layer and return cv::Ptr with it.
- Output blobs' shape computation - Output blobs' shape computation
@snippet dnn/custom_layers.cpp MyLayer::getMemoryShapes @snippet dnn/custom_layers.hpp MyLayer::getMemoryShapes
Returns layer's output shapes depends on input shapes. You may request an extra Returns layer's output shapes depends on input shapes. You may request an extra
memory using `internals`. memory using `internals`.
- Run a layer - Run a layer
@snippet dnn/custom_layers.cpp MyLayer::forward @snippet dnn/custom_layers.hpp MyLayer::forward
Implement a layer's logic here. Compute outputs for given inputs. Implement a layer's logic here. Compute outputs for given inputs.
@ -74,7 +74,7 @@ the second invocation of `forward` will has the same data at `outputs` and `inte
- Optional `finalize` method - Optional `finalize` method
@snippet dnn/custom_layers.cpp MyLayer::finalize @snippet dnn/custom_layers.hpp MyLayer::finalize
The chain of methods are the following: OpenCV deep learning engine calls `create` The chain of methods are the following: OpenCV deep learning engine calls `create`
method once then it calls `getMemoryShapes` for an every created layer then you method once then it calls `getMemoryShapes` for an every created layer then you
@ -108,11 +108,11 @@ layer {
This way our implementation can look like: This way our implementation can look like:
@snippet dnn/custom_layers.cpp InterpLayer @snippet dnn/custom_layers.hpp InterpLayer
Next we need to register a new layer type and try to import the model. Next we need to register a new layer type and try to import the model.
@snippet dnn/custom_layers.cpp Register InterpLayer @snippet dnn/custom_layers.hpp Register InterpLayer
## Example: custom layer from TensorFlow ## Example: custom layer from TensorFlow
This is an example of how to import a network with [tf.image.resize_bilinear](https://www.tensorflow.org/versions/master/api_docs/python/tf/image/resize_bilinear) This is an example of how to import a network with [tf.image.resize_bilinear](https://www.tensorflow.org/versions/master/api_docs/python/tf/image/resize_bilinear)
@ -185,11 +185,11 @@ Custom layers import from TensorFlow is designed to put all layer's `attr` into
cv::dnn::LayerParams but input `Const` blobs into cv::dnn::Layer::blobs. cv::dnn::LayerParams but input `Const` blobs into cv::dnn::Layer::blobs.
In our case resize's output shape will be stored in layer's `blobs[0]`. In our case resize's output shape will be stored in layer's `blobs[0]`.
@snippet dnn/custom_layers.cpp ResizeBilinearLayer @snippet dnn/custom_layers.hpp ResizeBilinearLayer
Next we register a layer and try to import the model. Next we register a layer and try to import the model.
@snippet dnn/custom_layers.cpp Register ResizeBilinearLayer @snippet dnn/custom_layers.hpp Register ResizeBilinearLayer
## Define a custom layer in Python ## Define a custom layer in Python
The following example shows how to customize OpenCV's layers in Python. The following example shows how to customize OpenCV's layers in Python.

View File

@ -5,6 +5,8 @@ This section contains tutorials about how to use the built-in graphical user int
- @subpage tutorial_trackbar - @subpage tutorial_trackbar
*Languages:* C++, Java, Python
*Compatibility:* \> OpenCV 2.0 *Compatibility:* \> OpenCV 2.0
*Author:* Ana Huamán *Author:* Ana Huamán

View File

@ -1,11 +1,11 @@
Adding a Trackbar to our applications! {#tutorial_trackbar} Adding a Trackbar to our applications! {#tutorial_trackbar}
====================================== ======================================
- In the previous tutorials (about *linear blending* and the *brightness and contrast - In the previous tutorials (about @ref tutorial_adding_images and the @ref tutorial_basic_linear_transform)
adjustments*) you might have noted that we needed to give some **input** to our programs, such you might have noted that we needed to give some **input** to our programs, such
as \f$\alpha\f$ and \f$beta\f$. We accomplished that by entering this data using the Terminal as \f$\alpha\f$ and \f$beta\f$. We accomplished that by entering this data using the Terminal.
- Well, it is time to use some fancy GUI tools. OpenCV provides some GUI utilities (*highgui.hpp*) - Well, it is time to use some fancy GUI tools. OpenCV provides some GUI utilities (**highgui** module)
for you. An example of this is a **Trackbar** for you. An example of this is a **Trackbar**.
![](images/Adding_Trackbars_Tutorial_Trackbar.png) ![](images/Adding_Trackbars_Tutorial_Trackbar.png)
@ -24,26 +24,73 @@ Code
Let's modify the program made in the tutorial @ref tutorial_adding_images. We will let the user enter the Let's modify the program made in the tutorial @ref tutorial_adding_images. We will let the user enter the
\f$\alpha\f$ value by using the Trackbar. \f$\alpha\f$ value by using the Trackbar.
@add_toggle_cpp
This tutorial code's is shown lines below. You can also download it from This tutorial code's is shown lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp) [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp)
@include cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp @include cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp
@end_toggle
@add_toggle_java
This tutorial code's is shown lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java)
@include java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java
@end_toggle
@add_toggle_python
This tutorial code's is shown lines below. You can also download it from
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py)
@include python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py
@end_toggle
Explanation Explanation
----------- -----------
We only analyze the code that is related to Trackbar: We only analyze the code that is related to Trackbar:
-# First, we load two images, which are going to be blended. - First, we load two images, which are going to be blended.
@add_toggle_cpp
@snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp load @snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp load
@end_toggle
-# To create a trackbar, first we have to create the window in which it is going to be located. So: @add_toggle_java
@snippet java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java load
@end_toggle
@add_toggle_python
@snippet python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py load
@end_toggle
- To create a trackbar, first we have to create the window in which it is going to be located. So:
@add_toggle_cpp
@snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp window @snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp window
@end_toggle
-# Now we can create the Trackbar: @add_toggle_java
@snippet java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java window
@end_toggle
@add_toggle_python
@snippet python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py window
@end_toggle
- Now we can create the Trackbar:
@add_toggle_cpp
@snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp create_trackbar @snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp create_trackbar
@end_toggle
Note the following: @add_toggle_java
@snippet java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java create_trackbar
@end_toggle
@add_toggle_python
@snippet python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py create_trackbar
@end_toggle
Note the following (C++ code):
- Our Trackbar has a label **TrackbarName** - Our Trackbar has a label **TrackbarName**
- The Trackbar is located in the window named **Linear Blend** - The Trackbar is located in the window named **Linear Blend**
- The Trackbar values will be in the range from \f$0\f$ to **alpha_slider_max** (the minimum - The Trackbar values will be in the range from \f$0\f$ to **alpha_slider_max** (the minimum
@ -51,10 +98,21 @@ We only analyze the code that is related to Trackbar:
- The numerical value of Trackbar is stored in **alpha_slider** - The numerical value of Trackbar is stored in **alpha_slider**
- Whenever the user moves the Trackbar, the callback function **on_trackbar** is called - Whenever the user moves the Trackbar, the callback function **on_trackbar** is called
-# Finally, we have to define the callback function **on_trackbar** Finally, we have to define the callback function **on_trackbar** for C++ and Python code, using an anonymous inner class listener in Java
@snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp on_trackbar
Note that: @add_toggle_cpp
@snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp on_trackbar
@end_toggle
@add_toggle_java
@snippet java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java on_trackbar
@end_toggle
@add_toggle_python
@snippet python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py on_trackbar
@end_toggle
Note that (C++ code):
- We use the value of **alpha_slider** (integer) to get a double value for **alpha**. - We use the value of **alpha_slider** (integer) to get a double value for **alpha**.
- **alpha_slider** is updated each time the trackbar is displaced by the user. - **alpha_slider** is updated each time the trackbar is displaced by the user.
- We define *src1*, *src2*, *dist*, *alpha*, *alpha_slider* and *beta* as global variables, - We define *src1*, *src2*, *dist*, *alpha*, *alpha_slider* and *beta* as global variables,

View File

@ -11,9 +11,6 @@ In this tutorial you will learn how to:
- @ref cv::erode - @ref cv::erode
- @ref cv::dilate - @ref cv::dilate
Interesting fact
-----------
@note The explanation below belongs to the book **Learning OpenCV** by Bradski and Kaehler. @note The explanation below belongs to the book **Learning OpenCV** by Bradski and Kaehler.
Morphological Operations Morphological Operations
@ -38,19 +35,14 @@ Morphological Operations
- As the kernel \f$B\f$ is scanned over the image, we compute the maximal pixel value overlapped by - As the kernel \f$B\f$ is scanned over the image, we compute the maximal pixel value overlapped by
\f$B\f$ and replace the image pixel in the anchor point position with that maximal value. As you can \f$B\f$ and replace the image pixel in the anchor point position with that maximal value. As you can
deduce, this maximizing operation causes bright regions within an image to "grow" (therefore the deduce, this maximizing operation causes bright regions within an image to "grow" (therefore the
name *dilation*). Take the above image as an example. Applying dilation we can get: name *dilation*).
- The dilatation operation is: \f$\texttt{dst} (x,y) = \max _{(x',y'): \, \texttt{element} (x',y') \ne0 } \texttt{src} (x+x',y+y')\f$
- Take the above image as an example. Applying dilation we can get:
![](images/Morphology_1_Tutorial_Theory_Dilation.png) ![](images/Morphology_1_Tutorial_Theory_Dilation.png)
The background (bright) dilates around the black regions of the letter. - The bright area of the letter dilates around the black regions of the background.
To better grasp the idea and avoid possible confusion, in this other example we have inverted the original
image such as the object in white is now the letter. We have performed two dilatations with a rectangular
structuring element of size `3x3`.
![Left image: original image inverted, right image: resulting dilatation](images/Morphology_1_Tutorial_Theory_Dilatation_2.png)
The dilatation makes the object in white bigger.
### Erosion ### Erosion
@ -58,31 +50,39 @@ The dilatation makes the object in white bigger.
area of given kernel. area of given kernel.
- As the kernel \f$B\f$ is scanned over the image, we compute the minimal pixel value overlapped by - As the kernel \f$B\f$ is scanned over the image, we compute the minimal pixel value overlapped by
\f$B\f$ and replace the image pixel under the anchor point with that minimal value. \f$B\f$ and replace the image pixel under the anchor point with that minimal value.
- The erosion operation is: \f$\texttt{dst} (x,y) = \min _{(x',y'): \, \texttt{element} (x',y') \ne0 } \texttt{src} (x+x',y+y')\f$
- Analagously to the example for dilation, we can apply the erosion operator to the original image - Analagously to the example for dilation, we can apply the erosion operator to the original image
(shown above). You can see in the result below that the bright areas of the image (the (shown above). You can see in the result below that the bright areas of the image get thinner,
background, apparently), get thinner, whereas the dark zones (the "writing") gets bigger. whereas the dark zones gets bigger.
![](images/Morphology_1_Tutorial_Theory_Erosion.png) ![](images/Morphology_1_Tutorial_Theory_Erosion.png)
In similar manner, the corresponding image results by applying erosion operation on the inverted original image (two erosions
with a rectangular structuring element of size `3x3`):
![Left image: original image inverted, right image: resulting erosion](images/Morphology_1_Tutorial_Theory_Erosion_2.png)
The erosion makes the object in white smaller.
Code Code
---- ----
@add_toggle_cpp
This tutorial's code is shown below. You can also download it This tutorial's code is shown below. You can also download it
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ImgProc/Morphology_1.cpp) [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ImgProc/Morphology_1.cpp)
@include samples/cpp/tutorial_code/ImgProc/Morphology_1.cpp @include samples/cpp/tutorial_code/ImgProc/Morphology_1.cpp
@end_toggle
@add_toggle_java
This tutorial's code is shown below. You can also download it
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ImgProc/erosion_dilatation/MorphologyDemo1.java)
@include samples/java/tutorial_code/ImgProc/erosion_dilatation/MorphologyDemo1.java
@end_toggle
@add_toggle_python
This tutorial's code is shown below. You can also download it
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/imgProc/erosion_dilatation/morphology_1.py)
@include samples/python/tutorial_code/imgProc/erosion_dilatation/morphology_1.py
@end_toggle
Explanation Explanation
----------- -----------
-# Most of the material shown here is trivial (if you have any doubt, please refer to the tutorials in -# Most of the material shown here is trivial (if you have any doubt, please refer to the tutorials in
previous sections). Let's check the general structure of the program: previous sections). Let's check the general structure of the C++ program:
- Load an image (can be BGR or grayscale) - Load an image (can be BGR or grayscale)
- Create two windows (one for dilation output, the other for erosion) - Create two windows (one for dilation output, the other for erosion)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.5 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 410 B

After

Width:  |  Height:  |  Size: 923 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 457 B

After

Width:  |  Height:  |  Size: 844 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.5 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 458 B

After

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 685 B

After

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 558 B

After

Width:  |  Height:  |  Size: 2.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.5 KiB

After

Width:  |  Height:  |  Size: 1.9 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 608 B

After

Width:  |  Height:  |  Size: 2.0 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.3 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 617 B

After

Width:  |  Height:  |  Size: 1.5 KiB

View File

@ -36,15 +36,10 @@ discuss briefly 5 operations offered by OpenCV:
foreground) foreground)
- For instance, check out the example below. The image at the left is the original and the image - For instance, check out the example below. The image at the left is the original and the image
at the right is the result after applying the opening transformation. We can observe that the at the right is the result after applying the opening transformation. We can observe that the
small spaces in the corners of the letter tend to disappear. small dots have disappeared.
![](images/Morphology_2_Tutorial_Theory_Opening.png) ![](images/Morphology_2_Tutorial_Theory_Opening.png)
For the sake of clarity, we have performed the opening operation (`7x7` rectangular structuring element)
on the same original image but inverted such as the object in white is now the letter.
![Left image: original image inverted, right image: resulting opening](images/Morphology_2_Tutorial_Theory_Opening_2.png)
### Closing ### Closing
- It is obtained by the dilation of an image followed by an erosion. - It is obtained by the dilation of an image followed by an erosion.
@ -55,10 +50,6 @@ on the same original image but inverted such as the object in white is now the l
![](images/Morphology_2_Tutorial_Theory_Closing.png) ![](images/Morphology_2_Tutorial_Theory_Closing.png)
On the inverted image, we have performed the closing operation (`7x7` rectangular structuring element):
![Left image: original image inverted, right image: resulting closing](images/Morphology_2_Tutorial_Theory_Closing_2.png)
### Morphological Gradient ### Morphological Gradient
- It is the difference between the dilation and the erosion of an image. - It is the difference between the dilation and the erosion of an image.
@ -88,14 +79,28 @@ On the inverted image, we have performed the closing operation (`7x7` rectangula
Code Code
---- ----
This tutorial code's is shown lines below. You can also download it from @add_toggle_cpp
This tutorial's code is shown below. You can also download it
[here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ImgProc/Morphology_2.cpp) [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ImgProc/Morphology_2.cpp)
@include cpp/tutorial_code/ImgProc/Morphology_2.cpp @include cpp/tutorial_code/ImgProc/Morphology_2.cpp
@end_toggle
@add_toggle_java
This tutorial's code is shown below. You can also download it
[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ImgProc/opening_closing_hats/MorphologyDemo2.java)
@include java/tutorial_code/ImgProc/opening_closing_hats/MorphologyDemo2.java
@end_toggle
@add_toggle_python
This tutorial's code is shown below. You can also download it
[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/imgProc/opening_closing_hats/morphology_2.py)
@include python/tutorial_code/imgProc/opening_closing_hats/morphology_2.py
@end_toggle
Explanation Explanation
----------- -----------
-# Let's check the general structure of the program: -# Let's check the general structure of the C++ program:
- Load an image - Load an image
- Create a window to display results of the Morphological operations - Create a window to display results of the Morphological operations
- Create three Trackbars for the user to enter parameters: - Create three Trackbars for the user to enter parameters:
@ -139,8 +144,8 @@ Explanation
Results Results
------- -------
- After compiling the code above we can execute it giving an image path as an argument. For this - After compiling the code above we can execute it giving an image path as an argument. Results using
tutorial we use as input the image: **baboon.png**: the image: **baboon.png**:
![](images/Morphology_2_Tutorial_Original_Image.jpg) ![](images/Morphology_2_Tutorial_Original_Image.jpg)

View File

@ -305,6 +305,9 @@ public:
//! returns true if GpuMat data is NULL //! returns true if GpuMat data is NULL
bool empty() const; bool empty() const;
//! internal use method: updates the continuity flag
void updateContinuityFlag();
/*! includes several bit-fields: /*! includes several bit-fields:
- the magic signature - the magic signature
- continuity flag - continuity flag

View File

@ -2084,6 +2084,9 @@ public:
static MatAllocator* getDefaultAllocator(); static MatAllocator* getDefaultAllocator();
static void setDefaultAllocator(MatAllocator* allocator); static void setDefaultAllocator(MatAllocator* allocator);
//! internal use method: updates the continuity flag
void updateContinuityFlag();
//! interaction with UMat //! interaction with UMat
UMatData* u; UMatData* u;
@ -2551,6 +2554,9 @@ public:
//! and the standard allocator //! and the standard allocator
static MatAllocator* getStdAllocator(); static MatAllocator* getStdAllocator();
//! internal use method: updates the continuity flag
void updateContinuityFlag();
// black-box container of UMat data // black-box container of UMat data
UMatData* u; UMatData* u;

View File

@ -495,24 +495,20 @@ Mat::Mat(int _rows, int _cols, int _type, void* _data, size_t _step)
if( _step == AUTO_STEP ) if( _step == AUTO_STEP )
{ {
_step = minstep; _step = minstep;
flags |= CONTINUOUS_FLAG;
} }
else else
{ {
CV_DbgAssert( _step >= minstep ); CV_DbgAssert( _step >= minstep );
if (_step % esz1 != 0) if (_step % esz1 != 0)
{ {
CV_Error(Error::BadStep, "Step must be a multiple of esz1"); CV_Error(Error::BadStep, "Step must be a multiple of esz1");
} }
if (_step == minstep || rows == 1)
flags |= CONTINUOUS_FLAG;
} }
step[0] = _step; step[0] = _step;
step[1] = esz; step[1] = esz;
datalimit = datastart + _step * rows; datalimit = datastart + _step * rows;
dataend = datalimit - _step + minstep; dataend = datalimit - _step + minstep;
updateContinuityFlag();
} }
inline inline
@ -528,7 +524,6 @@ Mat::Mat(Size _sz, int _type, void* _data, size_t _step)
if( _step == AUTO_STEP ) if( _step == AUTO_STEP )
{ {
_step = minstep; _step = minstep;
flags |= CONTINUOUS_FLAG;
} }
else else
{ {
@ -538,14 +533,12 @@ Mat::Mat(Size _sz, int _type, void* _data, size_t _step)
{ {
CV_Error(Error::BadStep, "Step must be a multiple of esz1"); CV_Error(Error::BadStep, "Step must be a multiple of esz1");
} }
if (_step == minstep || rows == 1)
flags |= CONTINUOUS_FLAG;
} }
step[0] = _step; step[0] = _step;
step[1] = esz; step[1] = esz;
datalimit = datastart + _step*rows; datalimit = datastart + _step*rows;
dataend = datalimit - _step + minstep; dataend = datalimit - _step + minstep;
updateContinuityFlag();
} }
template<typename _Tp> inline template<typename _Tp> inline

View File

@ -152,7 +152,7 @@ namespace cv { namespace cuda
inline ~NppStreamHandler() inline ~NppStreamHandler()
{ {
nppSetStream(oldStream); cudaStreamSynchronize(oldStream);
} }
private: private:

View File

@ -489,7 +489,7 @@ public class MatTest extends OpenCVTestCase {
public void testIsContinuous() { public void testIsContinuous() {
assertTrue(gray0.isContinuous()); assertTrue(gray0.isContinuous());
Mat subMat = gray0.submat(0, 0, gray0.rows() / 2, gray0.cols() / 2); Mat subMat = gray0.submat(0, gray0.rows() / 2, 0, gray0.cols() / 2);
assertFalse(subMat.isContinuous()); assertFalse(subMat.isContinuous());
} }
@ -937,7 +937,7 @@ public class MatTest extends OpenCVTestCase {
} }
public void testSubmatRect() { public void testSubmatRect() {
Mat submat = gray255.submat(new Rect(5, gray255.rows() / 2, 5, gray255.cols() / 2)); Mat submat = gray255.submat(new Rect(5, 5, gray255.cols() / 2, gray255.rows() / 2));
assertTrue(submat.isSubmatrix()); assertTrue(submat.isSubmatrix());
assertFalse(submat.isContinuous()); assertFalse(submat.isContinuous());

View File

@ -46,6 +46,13 @@
using namespace cv; using namespace cv;
using namespace cv::cuda; using namespace cv::cuda;
void cv::cuda::GpuMat::updateContinuityFlag()
{
int sz[] = { rows, cols };
size_t steps[] = { step, elemSize() };
flags = cv::updateContinuityFlag(flags, 2, sz, steps);
}
cv::cuda::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t step_) : cv::cuda::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t step_) :
flags(Mat::MAGIC_VAL + (type_ & Mat::TYPE_MASK)), rows(rows_), cols(cols_), flags(Mat::MAGIC_VAL + (type_ & Mat::TYPE_MASK)), rows(rows_), cols(cols_),
step(step_), data((uchar*)data_), refcount(0), step(step_), data((uchar*)data_), refcount(0),
@ -57,7 +64,6 @@ cv::cuda::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t st
if (step == Mat::AUTO_STEP) if (step == Mat::AUTO_STEP)
{ {
step = minstep; step = minstep;
flags |= Mat::CONTINUOUS_FLAG;
} }
else else
{ {
@ -65,11 +71,10 @@ cv::cuda::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t st
step = minstep; step = minstep;
CV_DbgAssert( step >= minstep ); CV_DbgAssert( step >= minstep );
flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
} }
dataend += step * (rows - 1) + minstep; dataend += step * (rows - 1) + minstep;
updateContinuityFlag();
} }
cv::cuda::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) : cv::cuda::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) :
@ -83,7 +88,6 @@ cv::cuda::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) :
if (step == Mat::AUTO_STEP) if (step == Mat::AUTO_STEP)
{ {
step = minstep; step = minstep;
flags |= Mat::CONTINUOUS_FLAG;
} }
else else
{ {
@ -91,11 +95,10 @@ cv::cuda::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) :
step = minstep; step = minstep;
CV_DbgAssert( step >= minstep ); CV_DbgAssert( step >= minstep );
flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
} }
dataend += step * (rows - 1) + minstep; dataend += step * (rows - 1) + minstep;
updateContinuityFlag();
} }
cv::cuda::GpuMat::GpuMat(const GpuMat& m, Range rowRange_, Range colRange_) cv::cuda::GpuMat::GpuMat(const GpuMat& m, Range rowRange_, Range colRange_)
@ -127,17 +130,15 @@ cv::cuda::GpuMat::GpuMat(const GpuMat& m, Range rowRange_, Range colRange_)
cols = colRange_.size(); cols = colRange_.size();
data += colRange_.start*elemSize(); data += colRange_.start*elemSize();
flags &= cols < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;
} }
if (rows == 1)
flags |= Mat::CONTINUOUS_FLAG;
if (refcount) if (refcount)
CV_XADD(refcount, 1); CV_XADD(refcount, 1);
if (rows <= 0 || cols <= 0) if (rows <= 0 || cols <= 0)
rows = cols = 0; rows = cols = 0;
updateContinuityFlag();
} }
cv::cuda::GpuMat::GpuMat(const GpuMat& m, Rect roi) : cv::cuda::GpuMat::GpuMat(const GpuMat& m, Rect roi) :
@ -146,16 +147,19 @@ cv::cuda::GpuMat::GpuMat(const GpuMat& m, Rect roi) :
datastart(m.datastart), dataend(m.dataend), datastart(m.datastart), dataend(m.dataend),
allocator(m.allocator) allocator(m.allocator)
{ {
flags &= roi.width < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;
data += roi.x * elemSize(); data += roi.x * elemSize();
CV_Assert( 0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= m.cols && 0 <= roi.y && 0 <= roi.height && roi.y + roi.height <= m.rows ); CV_Assert( 0 <= roi.x && 0 <= roi.width &&
roi.x + roi.width <= m.cols &&
0 <= roi.y && 0 <= roi.height &&
roi.y + roi.height <= m.rows );
if (refcount) if (refcount)
CV_XADD(refcount, 1); CV_XADD(refcount, 1);
if (rows <= 0 || cols <= 0) if (rows <= 0 || cols <= 0)
rows = cols = 0; rows = cols = 0;
updateContinuityFlag();
} }
GpuMat cv::cuda::GpuMat::reshape(int new_cn, int new_rows) const GpuMat cv::cuda::GpuMat::reshape(int new_cn, int new_rows) const
@ -245,11 +249,7 @@ GpuMat& cv::cuda::GpuMat::adjustROI(int dtop, int dbottom, int dleft, int dright
rows = row2 - row1; rows = row2 - row1;
cols = col2 - col1; cols = col2 - col1;
if (esz * cols == step || rows == 1) updateContinuityFlag();
flags |= Mat::CONTINUOUS_FLAG;
else
flags &= ~Mat::CONTINUOUS_FLAG;
return *this; return *this;
} }

View File

@ -201,10 +201,13 @@ void cv::cuda::HostMem::create(int rows_, int cols_, int type_)
if (rows_ > 0 && cols_ > 0) if (rows_ > 0 && cols_ > 0)
{ {
flags = Mat::MAGIC_VAL + Mat::CONTINUOUS_FLAG + type_; flags = Mat::MAGIC_VAL + type_;
rows = rows_; rows = rows_;
cols = cols_; cols = cols_;
step = elemSize() * cols; step = elemSize() * cols;
int sz[] = { rows, cols };
size_t steps[] = { step, CV_ELEM_SIZE(type_) };
flags = updateContinuityFlag(flags, 2, sz, steps);
if (alloc_type == SHARED) if (alloc_type == SHARED)
{ {

View File

@ -594,11 +594,12 @@ namespace
StackAllocator::~StackAllocator() StackAllocator::~StackAllocator()
{ {
cudaStreamSynchronize(stream_);
if (memStack_ != 0) if (memStack_ != 0)
{
cudaStreamSynchronize(stream_);
memStack_->pool->returnMemStack(memStack_); memStack_->pool->returnMemStack(memStack_);
} }
}
size_t alignUp(size_t what, size_t alignment) size_t alignUp(size_t what, size_t alignment)
{ {

View File

@ -262,31 +262,36 @@ void setSize( Mat& m, int _dims, const int* _sz, const size_t* _steps, bool auto
} }
} }
static void updateContinuityFlag(Mat& m) int updateContinuityFlag(int flags, int dims, const int* size, const size_t* step)
{ {
int i, j; int i, j;
for( i = 0; i < m.dims; i++ ) for( i = 0; i < dims; i++ )
{ {
if( m.size[i] > 1 ) if( size[i] > 1 )
break; break;
} }
for( j = m.dims-1; j > i; j-- ) uint64 t = (uint64)size[std::min(i, dims-1)]*CV_MAT_CN(flags);
for( j = dims-1; j > i; j-- )
{ {
if( m.step[j]*m.size[j] < m.step[j-1] ) t *= size[j];
if( step[j]*size[j] < step[j-1] )
break; break;
} }
uint64 t = (uint64)m.step[0]*m.size[0]; if( j <= i && t == (uint64)(int)t )
if( j <= i && t == (size_t)t ) return flags | Mat::CONTINUOUS_FLAG;
m.flags |= Mat::CONTINUOUS_FLAG; return flags & ~Mat::CONTINUOUS_FLAG;
else }
m.flags &= ~Mat::CONTINUOUS_FLAG;
void Mat::updateContinuityFlag()
{
flags = cv::updateContinuityFlag(flags, dims, size.p, step.p);
} }
void finalizeHdr(Mat& m) void finalizeHdr(Mat& m)
{ {
updateContinuityFlag(m); m.updateContinuityFlag();
int d = m.dims; int d = m.dims;
if( d > 2 ) if( d > 2 )
m.rows = m.cols = -1; m.rows = m.cols = -1;
@ -427,7 +432,6 @@ Mat::Mat(const Mat& m, const Range& _rowRange, const Range& _colRange)
&& _colRange.end <= m.cols ); && _colRange.end <= m.cols );
cols = _colRange.size(); cols = _colRange.size();
data += _colRange.start*elemSize(); data += _colRange.start*elemSize();
flags &= cols < m.cols ? ~CONTINUOUS_FLAG : -1;
flags |= SUBMATRIX_FLAG; flags |= SUBMATRIX_FLAG;
} }
} }
@ -437,8 +441,7 @@ Mat::Mat(const Mat& m, const Range& _rowRange, const Range& _colRange)
CV_RETHROW(); CV_RETHROW();
} }
if( rows == 1 ) updateContinuityFlag();
flags |= CONTINUOUS_FLAG;
if( rows <= 0 || cols <= 0 ) if( rows <= 0 || cols <= 0 )
{ {
@ -455,8 +458,6 @@ Mat::Mat(const Mat& m, const Rect& roi)
allocator(m.allocator), u(m.u), size(&rows) allocator(m.allocator), u(m.u), size(&rows)
{ {
CV_Assert( m.dims <= 2 ); CV_Assert( m.dims <= 2 );
flags &= roi.width < m.cols ? ~CONTINUOUS_FLAG : -1;
flags |= roi.height == 1 ? CONTINUOUS_FLAG : 0;
size_t esz = CV_ELEM_SIZE(flags); size_t esz = CV_ELEM_SIZE(flags);
data += roi.x*esz; data += roi.x*esz;
@ -468,6 +469,7 @@ Mat::Mat(const Mat& m, const Rect& roi)
flags |= SUBMATRIX_FLAG; flags |= SUBMATRIX_FLAG;
step[0] = m.step[0]; step[1] = esz; step[0] = m.step[0]; step[1] = esz;
updateContinuityFlag();
if( rows <= 0 || cols <= 0 ) if( rows <= 0 || cols <= 0 )
{ {
@ -522,7 +524,7 @@ Mat::Mat(const Mat& m, const Range* ranges)
flags |= SUBMATRIX_FLAG; flags |= SUBMATRIX_FLAG;
} }
} }
updateContinuityFlag(*this); updateContinuityFlag();
} }
Mat::Mat(const Mat& m, const std::vector<Range>& ranges) Mat::Mat(const Mat& m, const std::vector<Range>& ranges)
@ -548,7 +550,7 @@ Mat::Mat(const Mat& m, const std::vector<Range>& ranges)
flags |= SUBMATRIX_FLAG; flags |= SUBMATRIX_FLAG;
} }
} }
updateContinuityFlag(*this); updateContinuityFlag();
} }
@ -575,10 +577,7 @@ Mat Mat::diag(int d) const
m.size[1] = m.cols = 1; m.size[1] = m.cols = 1;
m.step[0] += (len > 1 ? esz : 0); m.step[0] += (len > 1 ? esz : 0);
if( m.rows > 1 ) m.updateContinuityFlag();
m.flags &= ~CONTINUOUS_FLAG;
else
m.flags |= CONTINUOUS_FLAG;
if( size() != Size(1,1) ) if( size() != Size(1,1) )
m.flags |= SUBMATRIX_FLAG; m.flags |= SUBMATRIX_FLAG;
@ -597,13 +596,6 @@ void Mat::pop_back(size_t nelems)
{ {
size.p[0] -= (int)nelems; size.p[0] -= (int)nelems;
dataend -= nelems*step.p[0]; dataend -= nelems*step.p[0];
/*if( size.p[0] <= 1 )
{
if( dims <= 2 )
flags |= CONTINUOUS_FLAG;
else
updateContinuityFlag(*this);
}*/
} }
} }
@ -618,7 +610,10 @@ void Mat::push_back_(const void* elem)
memcpy(data + r*step.p[0], elem, esz); memcpy(data + r*step.p[0], elem, esz);
size.p[0] = r + 1; size.p[0] = r + 1;
dataend += step.p[0]; dataend += step.p[0];
if( esz < step.p[0] ) uint64 tsz = size.p[0];
for( int i = 1; i < dims; i++ )
tsz *= size.p[i];
if( esz < step.p[0] || tsz != (uint64)(int)tsz )
flags &= ~CONTINUOUS_FLAG; flags &= ~CONTINUOUS_FLAG;
} }
@ -792,10 +787,7 @@ Mat& Mat::adjustROI( int dtop, int dbottom, int dleft, int dright )
data += (row1 - ofs.y)*step + (col1 - ofs.x)*esz; data += (row1 - ofs.y)*step + (col1 - ofs.x)*esz;
rows = row2 - row1; cols = col2 - col1; rows = row2 - row1; cols = col2 - col1;
size.p[0] = rows; size.p[1] = cols; size.p[0] = rows; size.p[1] = cols;
if( esz*cols == step[0] || rows == 1 ) updateContinuityFlag();
flags |= CONTINUOUS_FLAG;
else
flags &= ~CONTINUOUS_FLAG;
return *this; return *this;
} }

View File

@ -120,8 +120,8 @@ static Mat iplImageToMat(const IplImage* img, bool copyData)
} }
m.datalimit = m.datastart + m.step.p[0]*m.rows; m.datalimit = m.datastart + m.step.p[0]*m.rows;
m.dataend = m.datastart + m.step.p[0]*(m.rows-1) + esz*m.cols; m.dataend = m.datastart + m.step.p[0]*(m.rows-1) + esz*m.cols;
m.flags |= (m.cols*esz == m.step.p[0] || m.rows == 1 ? Mat::CONTINUOUS_FLAG : 0);
m.step[1] = esz; m.step[1] = esz;
m.updateContinuityFlag();
if( copyData ) if( copyData )
{ {

View File

@ -5681,8 +5681,6 @@ namespace cv {
// three funcs below are implemented in umatrix.cpp // three funcs below are implemented in umatrix.cpp
void setSize( UMat& m, int _dims, const int* _sz, const size_t* _steps, void setSize( UMat& m, int _dims, const int* _sz, const size_t* _steps,
bool autoSteps = false ); bool autoSteps = false );
void updateContinuityFlag(UMat& m);
void finalizeHdr(UMat& m); void finalizeHdr(UMat& m);
} // namespace cv } // namespace cv

View File

@ -193,6 +193,7 @@ inline Size getContinuousSize( const Mat& m1, const Mat& m2,
void setSize( Mat& m, int _dims, const int* _sz, const size_t* _steps, bool autoSteps=false ); void setSize( Mat& m, int _dims, const int* _sz, const size_t* _steps, bool autoSteps=false );
void finalizeHdr(Mat& m); void finalizeHdr(Mat& m);
int updateContinuityFlag(int flags, int dims, const int* size, const size_t* step);
struct NoVec struct NoVec
{ {

View File

@ -318,32 +318,15 @@ void setSize( UMat& m, int _dims, const int* _sz,
} }
void updateContinuityFlag(UMat& m) void UMat::updateContinuityFlag()
{ {
int i, j; flags = cv::updateContinuityFlag(flags, dims, size.p, step.p);
for( i = 0; i < m.dims; i++ )
{
if( m.size[i] > 1 )
break;
}
for( j = m.dims-1; j > i; j-- )
{
if( m.step[j]*m.size[j] < m.step[j-1] )
break;
}
uint64 total = (uint64)m.step[0]*m.size[0];
if( j <= i && total == (size_t)total )
m.flags |= UMat::CONTINUOUS_FLAG;
else
m.flags &= ~UMat::CONTINUOUS_FLAG;
} }
void finalizeHdr(UMat& m) void finalizeHdr(UMat& m)
{ {
updateContinuityFlag(m); m.updateContinuityFlag();
int d = m.dims; int d = m.dims;
if( d > 2 ) if( d > 2 )
m.rows = m.cols = -1; m.rows = m.cols = -1;
@ -537,12 +520,10 @@ UMat::UMat(const UMat& m, const Range& _rowRange, const Range& _colRange)
CV_Assert( 0 <= _colRange.start && _colRange.start <= _colRange.end && _colRange.end <= m.cols ); CV_Assert( 0 <= _colRange.start && _colRange.start <= _colRange.end && _colRange.end <= m.cols );
cols = _colRange.size(); cols = _colRange.size();
offset += _colRange.start*elemSize(); offset += _colRange.start*elemSize();
flags &= cols < m.cols ? ~CONTINUOUS_FLAG : -1;
flags |= SUBMATRIX_FLAG; flags |= SUBMATRIX_FLAG;
} }
if( rows == 1 ) updateContinuityFlag();
flags |= CONTINUOUS_FLAG;
if( rows <= 0 || cols <= 0 ) if( rows <= 0 || cols <= 0 )
{ {
@ -557,8 +538,6 @@ UMat::UMat(const UMat& m, const Rect& roi)
allocator(m.allocator), usageFlags(m.usageFlags), u(m.u), offset(m.offset + roi.y*m.step[0]), size(&rows) allocator(m.allocator), usageFlags(m.usageFlags), u(m.u), offset(m.offset + roi.y*m.step[0]), size(&rows)
{ {
CV_Assert( m.dims <= 2 ); CV_Assert( m.dims <= 2 );
flags &= roi.width < m.cols ? ~CONTINUOUS_FLAG : -1;
flags |= roi.height == 1 ? CONTINUOUS_FLAG : 0;
size_t esz = CV_ELEM_SIZE(flags); size_t esz = CV_ELEM_SIZE(flags);
offset += roi.x*esz; offset += roi.x*esz;
@ -570,6 +549,7 @@ UMat::UMat(const UMat& m, const Rect& roi)
flags |= SUBMATRIX_FLAG; flags |= SUBMATRIX_FLAG;
step[0] = m.step[0]; step[1] = esz; step[0] = m.step[0]; step[1] = esz;
updateContinuityFlag();
if( rows <= 0 || cols <= 0 ) if( rows <= 0 || cols <= 0 )
{ {
@ -601,7 +581,7 @@ UMat::UMat(const UMat& m, const Range* ranges)
flags |= SUBMATRIX_FLAG; flags |= SUBMATRIX_FLAG;
} }
} }
updateContinuityFlag(*this); updateContinuityFlag();
} }
UMat::UMat(const UMat& m, const std::vector<Range>& ranges) UMat::UMat(const UMat& m, const std::vector<Range>& ranges)
@ -626,7 +606,7 @@ UMat::UMat(const UMat& m, const std::vector<Range>& ranges)
flags |= SUBMATRIX_FLAG; flags |= SUBMATRIX_FLAG;
} }
} }
updateContinuityFlag(*this); updateContinuityFlag();
} }
UMat UMat::diag(int d) const UMat UMat::diag(int d) const
@ -652,10 +632,7 @@ UMat UMat::diag(int d) const
m.size[1] = m.cols = 1; m.size[1] = m.cols = 1;
m.step[0] += (len > 1 ? esz : 0); m.step[0] += (len > 1 ? esz : 0);
if( m.rows > 1 ) m.updateContinuityFlag();
m.flags &= ~CONTINUOUS_FLAG;
else
m.flags |= CONTINUOUS_FLAG;
if( size() != Size(1,1) ) if( size() != Size(1,1) )
m.flags |= SUBMATRIX_FLAG; m.flags |= SUBMATRIX_FLAG;
@ -701,10 +678,7 @@ UMat& UMat::adjustROI( int dtop, int dbottom, int dleft, int dright )
offset += (row1 - ofs.y)*step + (col1 - ofs.x)*esz; offset += (row1 - ofs.y)*step + (col1 - ofs.x)*esz;
rows = row2 - row1; cols = col2 - col1; rows = row2 - row1; cols = col2 - col1;
size.p[0] = rows; size.p[1] = cols; size.p[0] = rows; size.p[1] = cols;
if( esz*cols == step[0] || rows == 1 ) updateContinuityFlag();
flags |= CONTINUOUS_FLAG;
else
flags &= ~CONTINUOUS_FLAG;
return *this; return *this;
} }

View File

@ -522,33 +522,23 @@ protected:
TEST(Core_InputOutput, misc) { CV_MiscIOTest test; test.safe_run(); } TEST(Core_InputOutput, misc) { CV_MiscIOTest test; test.safe_run(); }
/*class CV_BigMatrixIOTest : public cvtest::BaseTest #if 0 // 4+ GB of data, 40+ GB of estimated result size, it is very slow
{ BIGDATA_TEST(Core_InputOutput, huge)
public:
CV_BigMatrixIOTest() {}
~CV_BigMatrixIOTest() {}
protected:
void run(int)
{
try
{ {
RNG& rng = theRNG(); RNG& rng = theRNG();
int N = 1000, M = 1200000; int N = 1000, M = 1200000;
std::cout << "Allocating..." << std::endl;
Mat mat(M, N, CV_32F); Mat mat(M, N, CV_32F);
std::cout << "Initializing..." << std::endl;
rng.fill(mat, RNG::UNIFORM, 0, 1); rng.fill(mat, RNG::UNIFORM, 0, 1);
std::cout << "Writing..." << std::endl;
{
FileStorage fs(cv::tempfile(".xml"), FileStorage::WRITE); FileStorage fs(cv::tempfile(".xml"), FileStorage::WRITE);
fs << "mat" << mat; fs << "mat" << mat;
fs.release(); fs.release();
} }
catch(...)
{
ts->set_failed_test_info(cvtest::TS::FAIL_MISMATCH);
} }
} #endif
};
TEST(Core_InputOutput, huge) { CV_BigMatrixIOTest test; test.safe_run(); }
*/
TEST(Core_globbing, accuracy) TEST(Core_globbing, accuracy)
{ {

View File

@ -1766,4 +1766,26 @@ TEST(Mat_, template_based_ptr)
ASSERT_FLOAT_EQ(66.0f, *(mat.ptr<float>(idx))); ASSERT_FLOAT_EQ(66.0f, *(mat.ptr<float>(idx)));
} }
BIGDATA_TEST(Mat, push_back_regression_4158) // memory usage: ~10.6 Gb
{
Mat result;
Mat tail(100, 500000, CV_32FC2, Scalar(1, 2));
tail.copyTo(result);
for (int i = 1; i < 15; i++)
{
result.push_back(tail);
std::cout << "i = " << i << " result = " << result.size() << " used = " << (uint64)result.total()*result.elemSize()*(1.0 / (1 << 20)) << " Mb"
<< " allocated=" << (uint64)(result.datalimit - result.datastart)*(1.0 / (1 << 20)) << " Mb" << std::endl;
}
for (int i = 0; i < 15; i++)
{
Rect roi(0, tail.rows * i, tail.cols, tail.rows);
int nz = countNonZero(result(roi).reshape(1) == 2);
EXPECT_EQ(tail.total(), (size_t)nz) << "i=" << i;
}
}
}} // namespace }} // namespace

View File

@ -137,12 +137,11 @@ void cv::cuda::meanStdDev(InputArray _src, OutputArray _dst, Stream& stream)
if (!deviceSupports(FEATURE_SET_COMPUTE_13)) if (!deviceSupports(FEATURE_SET_COMPUTE_13))
CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capebility"); CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capebility");
GpuMat src = getInputMat(_src, stream); const GpuMat src = getInputMat(_src, stream);
CV_Assert( src.type() == CV_8UC1 ); CV_Assert( src.type() == CV_8UC1 );
_dst.create(1, 2, CV_64FC1); GpuMat dst = getOutputMat(_dst, 1, 2, CV_64FC1, stream);
GpuMat dst = _dst.getGpuMat();
NppiSize sz; NppiSize sz;
sz.width = src.cols; sz.width = src.cols;

View File

@ -826,6 +826,10 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
CV_OUT std::vector<int>& indices, CV_OUT std::vector<int>& indices,
const float eta = 1.f, const int top_k = 0); const float eta = 1.f, const int top_k = 0);
CV_EXPORTS void NMSBoxes(const std::vector<RotatedRect>& bboxes, const std::vector<float>& scores,
const float score_threshold, const float nms_threshold,
CV_OUT std::vector<int>& indices,
const float eta = 1.f, const int top_k = 0);
//! @} //! @}
CV__DNN_EXPERIMENTAL_NS_END CV__DNN_EXPERIMENTAL_NS_END

View File

@ -121,7 +121,9 @@ PERF_TEST_P_(DNNTestNetwork, Inception_5h)
PERF_TEST_P_(DNNTestNetwork, ENet) PERF_TEST_P_(DNNTestNetwork, ENet)
{ {
if (backend == DNN_BACKEND_INFERENCE_ENGINE) throw SkipTestException(""); if ((backend == DNN_BACKEND_INFERENCE_ENGINE) ||
(backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16))
throw SkipTestException("");
processNet("dnn/Enet-model-best.net", "", "enet.yml", processNet("dnn/Enet-model-best.net", "", "enet.yml",
Mat(cv::Size(512, 256), CV_32FC3)); Mat(cv::Size(512, 256), CV_32FC3));
} }
@ -232,7 +234,8 @@ const tuple<DNNBackend, DNNTarget> testCases[] = {
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16), tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
#endif #endif
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_CPU), tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_CPU),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL) tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL_FP16)
}; };
INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, testing::ValuesIn(testCases)); INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, testing::ValuesIn(testCases));

View File

@ -62,6 +62,10 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
// this option is useful to run valgrind memory errors detection // this option is useful to run valgrind memory errors detection
static bool DNN_DISABLE_MEMORY_OPTIMIZATIONS = utils::getConfigurationParameterBool("OPENCV_DNN_DISABLE_MEMORY_OPTIMIZATIONS", false); static bool DNN_DISABLE_MEMORY_OPTIMIZATIONS = utils::getConfigurationParameterBool("OPENCV_DNN_DISABLE_MEMORY_OPTIMIZATIONS", false);
#ifdef HAVE_OPENCL
static bool DNN_OPENCL_ALLOW_ALL_DEVICES = utils::getConfigurationParameterBool("OPENCV_DNN_OPENCL_ALLOW_ALL_DEVICES", false);
#endif
using std::vector; using std::vector;
using std::map; using std::map;
using std::make_pair; using std::make_pair;
@ -497,7 +501,7 @@ public:
} }
} }
void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool forceCreate) void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool forceCreate, bool use_half)
{ {
if (!DNN_DISABLE_MEMORY_OPTIMIZATIONS && !forceCreate) if (!DNN_DISABLE_MEMORY_OPTIMIZATIONS && !forceCreate)
{ {
@ -538,14 +542,14 @@ public:
{ {
// if dst already has been allocated with total(shape) elements, // if dst already has been allocated with total(shape) elements,
// it won't be recrreated and pointer of dst.data remains the same. // it won't be recrreated and pointer of dst.data remains the same.
dst.create(shape, CV_32F); dst.create(shape, use_half ? CV_16S : CV_32F);
addHost(lp, dst); addHost(lp, dst);
} }
} }
void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes, void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes,
std::vector<LayerPin>& pinsForInternalBlobs, std::vector<LayerPin>& pinsForInternalBlobs,
bool forceCreate = false) bool forceCreate = false, bool use_half = false)
{ {
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
@ -616,7 +620,7 @@ public:
reuse(ld.inputBlobsId[0], blobPin); reuse(ld.inputBlobsId[0], blobPin);
} }
else else
reuseOrCreate(shapes[index], blobPin, *blobs[index], forceCreate); reuseOrCreate(shapes[index], blobPin, *blobs[index], forceCreate, use_half);
} }
} }
} }
@ -654,7 +658,7 @@ static Ptr<BackendWrapper> wrapMat(int backendId, int targetId, cv::Mat& m)
{ {
if (targetId == DNN_TARGET_CPU) if (targetId == DNN_TARGET_CPU)
return Ptr<BackendWrapper>(); return Ptr<BackendWrapper>();
else if (targetId == DNN_TARGET_OPENCL) else if (IS_DNN_OPENCL_TARGET(targetId))
return OpenCLBackendWrapper::create(m); return OpenCLBackendWrapper::create(m);
else else
CV_Error(Error::StsNotImplemented, "Unknown target identifier"); CV_Error(Error::StsNotImplemented, "Unknown target identifier");
@ -719,6 +723,7 @@ struct Net::Impl
bool netWasAllocated; bool netWasAllocated;
bool fusion; bool fusion;
std::vector<int64> layersTimings; std::vector<int64> layersTimings;
Mat output_blob;
Ptr<BackendWrapper> wrap(Mat& host) Ptr<BackendWrapper> wrap(Mat& host)
{ {
@ -735,7 +740,7 @@ struct Net::Impl
Ptr<BackendWrapper> baseBuffer = backendWrappers[data]; Ptr<BackendWrapper> baseBuffer = backendWrappers[data];
if (preferableBackend == DNN_BACKEND_DEFAULT) if (preferableBackend == DNN_BACKEND_DEFAULT)
{ {
CV_Assert(preferableTarget == DNN_TARGET_OPENCL); CV_Assert(IS_DNN_OPENCL_TARGET(preferableTarget));
return OpenCLBackendWrapper::create(baseBuffer, host); return OpenCLBackendWrapper::create(baseBuffer, host);
} }
else if (preferableBackend == DNN_BACKEND_HALIDE) else if (preferableBackend == DNN_BACKEND_HALIDE)
@ -847,12 +852,22 @@ struct Net::Impl
if (!netWasAllocated || this->blobsToKeep != blobsToKeep_) if (!netWasAllocated || this->blobsToKeep != blobsToKeep_)
{ {
if (preferableBackend == DNN_BACKEND_DEFAULT && IS_DNN_OPENCL_TARGET(preferableTarget))
#ifndef HAVE_OPENCL #ifndef HAVE_OPENCL
if (preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_OPENCL)
{ {
CV_LOG_WARNING(NULL, "DNN: OpenCL target is not available in this OpenCV build, switching to CPU.") CV_LOG_WARNING(NULL, "DNN: OpenCL target is not available in this OpenCV build, switching to CPU.");
preferableTarget = DNN_TARGET_CPU; preferableTarget = DNN_TARGET_CPU;
} }
#else
{
if (!DNN_OPENCL_ALLOW_ALL_DEVICES
&& !(ocl::Device::getDefault().isIntel() && ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU) // Current implementation is only valid for Intel GPU (#11494)
)
{
CV_LOG_WARNING(NULL, "DNN: OpenCL target is not supported with current OpenCL device (tested with Intel GPUs only), switching to CPU.");
preferableTarget = DNN_TARGET_CPU;
}
}
#endif #endif
clear(); clear();
@ -1022,7 +1037,7 @@ struct Net::Impl
{ {
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
if (preferableBackend == DNN_BACKEND_DEFAULT) if (preferableBackend == DNN_BACKEND_DEFAULT)
CV_Assert(preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_OPENCL); CV_Assert(preferableTarget == DNN_TARGET_CPU || IS_DNN_OPENCL_TARGET(preferableTarget));
else if (preferableBackend == DNN_BACKEND_HALIDE) else if (preferableBackend == DNN_BACKEND_HALIDE)
initHalideBackend(); initHalideBackend();
else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE) else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
@ -1357,7 +1372,9 @@ struct Net::Impl
std::vector<LayerPin> pinsForInternalBlobs; std::vector<LayerPin> pinsForInternalBlobs;
blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs, blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs,
preferableBackend == DNN_BACKEND_INFERENCE_ENGINE); preferableBackend == DNN_BACKEND_INFERENCE_ENGINE,
preferableBackend == DNN_BACKEND_DEFAULT &&
preferableTarget == DNN_TARGET_OPENCL_FP16);
ld.outputBlobsWrappers.resize(ld.outputBlobs.size()); ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
for (int i = 0; i < ld.outputBlobs.size(); ++i) for (int i = 0; i < ld.outputBlobs.size(); ++i)
{ {
@ -1427,7 +1444,7 @@ struct Net::Impl
// some other layers. // some other layers.
// TODO: OpenCL target support more fusion styles. // TODO: OpenCL target support more fusion styles.
if ( preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_OPENCL && if ( preferableBackend == DNN_BACKEND_DEFAULT && IS_DNN_OPENCL_TARGET(preferableTarget) &&
(!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" && (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
ld.layerInstance->type != "MVN")) ) ld.layerInstance->type != "MVN")) )
continue; continue;
@ -1466,8 +1483,8 @@ struct Net::Impl
continue; // Go to the next layer. continue; // Go to the next layer.
// For now, OpenCL target support fusion with activation of ReLU/ChannelsPReLU/Power/Tanh // For now, OpenCL target support fusion with activation of ReLU/ChannelsPReLU/Power/Tanh
if ( preferableTarget != DNN_TARGET_OPENCL || if ( !IS_DNN_OPENCL_TARGET(preferableTarget) ||
(preferableTarget == DNN_TARGET_OPENCL && (IS_DNN_OPENCL_TARGET(preferableTarget) &&
nextData && nextData &&
((nextData->type == "ReLU") || ((nextData->type == "ReLU") ||
(nextData->type == "ChannelsPReLU") || (nextData->type == "ChannelsPReLU") ||
@ -1490,7 +1507,7 @@ struct Net::Impl
ld.outputBlobs = layers[lpNext.lid].outputBlobs; ld.outputBlobs = layers[lpNext.lid].outputBlobs;
ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers; ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;
if ( preferableTarget == DNN_TARGET_OPENCL ) if ( IS_DNN_OPENCL_TARGET(preferableTarget) )
{ {
if ( !activData->consumers.empty() ) if ( !activData->consumers.empty() )
{ {
@ -1502,7 +1519,7 @@ struct Net::Impl
} }
// fuse convlution layer followed by eltwise + relu // fuse convlution layer followed by eltwise + relu
if ( preferableTarget == DNN_TARGET_OPENCL ) if ( IS_DNN_OPENCL_TARGET(preferableTarget) )
{ {
Ptr<EltwiseLayer> nextEltwiseLayer; Ptr<EltwiseLayer> nextEltwiseLayer;
if( nextData ) if( nextData )
@ -1715,6 +1732,13 @@ struct Net::Impl
for(int i = 0; i < layers[0].outputBlobs.size(); i++) for(int i = 0; i < layers[0].outputBlobs.size(); i++)
{ {
CV_Assert(layers[0].outputBlobs[i].total()); CV_Assert(layers[0].outputBlobs[i].total());
if (layers[0].outputBlobs[i].depth() == CV_32F &&
preferableBackend == DNN_BACKEND_DEFAULT &&
preferableTarget == DNN_TARGET_OPENCL_FP16)
{
Mat mat = layers[0].outputBlobs[i].clone();
convertFp16(mat, layers[0].outputBlobs[i]);
}
inputShapes.push_back(shape(layers[0].outputBlobs[i])); inputShapes.push_back(shape(layers[0].outputBlobs[i]));
} }
LayersShapesMap layersShapes; LayersShapesMap layersShapes;
@ -1760,7 +1784,7 @@ struct Net::Impl
{ {
if( !ld.skip ) if( !ld.skip )
{ {
if (preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_OPENCL) if (preferableBackend == DNN_BACKEND_DEFAULT && IS_DNN_OPENCL_TARGET(preferableTarget))
{ {
std::vector<UMat> umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers); std::vector<UMat> umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
layer->forward(OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers), layer->forward(OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers),
@ -1925,6 +1949,13 @@ struct Net::Impl
// Transfer data to CPU if it's require. // Transfer data to CPU if it's require.
ld.outputBlobsWrappers[pin.oid]->copyToHost(); ld.outputBlobsWrappers[pin.oid]->copyToHost();
} }
if (ld.outputBlobs[pin.oid].depth() == CV_16S)
{
convertFp16(ld.outputBlobs[pin.oid], output_blob);
return output_blob;
}
else
return ld.outputBlobs[pin.oid]; return ld.outputBlobs[pin.oid];
} }
@ -2068,7 +2099,7 @@ void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
if (outputBlobs.isUMat()) if (outputBlobs.isUMat())
{ {
outputBlobs.assign(ld.outputBlobs[pin.oid].getUMat(ACCESS_RW)); outputBlobs.assign(impl->getBlob(layerName).getUMat(ACCESS_RW));
} }
else if (outputBlobs.isMat()) else if (outputBlobs.isMat())
{ {
@ -2084,17 +2115,33 @@ void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
ld.outputBlobsWrappers[i]->copyToHost(); ld.outputBlobsWrappers[i]->copyToHost();
} }
} }
if (ld.outputBlobs[0].depth() == CV_32F)
{
std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj(); std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
outputvec = ld.outputBlobs; outputvec = ld.outputBlobs;
} else {
std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
outputvec.resize(ld.outputBlobs.size());
for (int i = 0; i < outputvec.size(); i++)
convertFp16(ld.outputBlobs[i], outputvec[i]);
}
} }
else if (outputBlobs.isUMatVector()) else if (outputBlobs.isUMatVector())
{ {
std::vector<UMat> & outputvec = *(std::vector<UMat> *)outputBlobs.getObj(); std::vector<UMat> & outputvec = *(std::vector<UMat> *)outputBlobs.getObj();
if (impl->preferableBackend == DNN_BACKEND_DEFAULT && if (impl->preferableBackend == DNN_BACKEND_DEFAULT &&
impl->preferableTarget == DNN_TARGET_OPENCL) IS_DNN_OPENCL_TARGET(impl->preferableTarget))
{ {
if (impl->preferableTarget == DNN_TARGET_OPENCL)
outputvec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers); outputvec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
else if (impl->preferableTarget == DNN_TARGET_OPENCL_FP16)
{
std::vector<UMat> out_vec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
outputvec.resize(out_vec.size());
for (int i = 0; i < out_vec.size(); i++)
convertFp16(out_vec[i], outputvec[i]);
}
} }
else else
{ {
@ -2182,6 +2229,16 @@ void Net::setPreferableTarget(int targetId)
if( impl->preferableTarget != targetId ) if( impl->preferableTarget != targetId )
{ {
impl->preferableTarget = targetId; impl->preferableTarget = targetId;
if (IS_DNN_OPENCL_TARGET(targetId))
{
#ifndef HAVE_OPENCL
impl->preferableTarget = DNN_TARGET_CPU;
#else
bool fp16 = ocl::Device::getDefault().isExtensionSupported("cl_khr_fp16");
if (!fp16 && targetId == DNN_TARGET_OPENCL_FP16)
impl->preferableTarget = DNN_TARGET_OPENCL;
#endif
}
impl->netWasAllocated = false; impl->netWasAllocated = false;
impl->clear(); impl->clear();
} }
@ -2210,7 +2267,17 @@ void Net::setInput(InputArray blob, const String& name)
ld.outputBlobs.resize( std::max(pin.oid+1, (int)ld.requiredOutputs.size()) ); ld.outputBlobs.resize( std::max(pin.oid+1, (int)ld.requiredOutputs.size()) );
ld.outputBlobsWrappers.resize(ld.outputBlobs.size()); ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
MatShape prevShape = shape(ld.outputBlobs[pin.oid]); MatShape prevShape = shape(ld.outputBlobs[pin.oid]);
Mat blob_ = blob.getMat(); Mat blob_;
if (impl->preferableBackend == DNN_BACKEND_DEFAULT &&
impl->preferableTarget == DNN_TARGET_OPENCL_FP16)
{
Mat blob_mat = blob.getMat();
convertFp16(blob_mat, blob_);
}
else
{
blob_ = blob.getMat();
}
bool oldShape = prevShape == shape(blob_); bool oldShape = prevShape == shape(blob_);
if (oldShape) if (oldShape)
{ {
@ -2735,6 +2802,43 @@ void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
if (preferableTarget == DNN_TARGET_OPENCL_FP16 && inputs_arr.depth() == CV_16S)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
std::vector<UMat> internals;
std::vector<UMat> orig_inputs;
std::vector<UMat> orig_outputs;
std::vector<UMat> orig_internals;
inputs_arr.getUMatVector(orig_inputs);
outputs_arr.getUMatVector(orig_outputs);
internals_arr.getUMatVector(orig_internals);
inputs.resize(orig_inputs.size());
for (size_t i = 0; i < orig_inputs.size(); i++)
convertFp16(orig_inputs[i], inputs[i]);
outputs.resize(orig_outputs.size());
for (size_t i = 0; i < orig_outputs.size(); i++)
outputs[i].create(shape(orig_outputs[i]), CV_32F);
internals.resize(orig_internals.size());
for (size_t i = 0; i < orig_internals.size(); i++)
internals[i].create(shape(orig_internals[i]), CV_32F);
forward(inputs, outputs, internals);
for (size_t i = 0; i < outputs.size(); i++)
convertFp16(outputs[i], orig_outputs[i]);
// sync results back
outputs_arr.assign(orig_outputs);
internals_arr.assign(orig_internals);
return;
}
std::vector<Mat> inpvec; std::vector<Mat> inpvec;
std::vector<Mat> outputs; std::vector<Mat> outputs;
std::vector<Mat> internals; std::vector<Mat> internals;

View File

@ -120,12 +120,16 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inputs_.depth() == CV_16S);
inputs_.getUMatVector(inputs); inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs); outputs_.getUMatVector(outputs);
CV_Assert(blobs.size() >= 2); CV_Assert(blobs.size() >= 2);
CV_Assert(inputs.size() == 1); CV_Assert(inputs.size() == 1);
if (use_half && inputs[0].dims == 2)
return false;
if (umat_weight.empty()) if (umat_weight.empty())
{ {
umat_weight = weights_.getUMat(ACCESS_READ); umat_weight = weights_.getUMat(ACCESS_READ);
@ -139,6 +143,7 @@ public:
int rows = inpBlob.dims > 2 ? inpBlob.size[2] : 1; int rows = inpBlob.dims > 2 ? inpBlob.size[2] : 1;
int cols = inpBlob.dims > 2 ? inpBlob.size[3] : 1; int cols = inpBlob.dims > 2 ? inpBlob.size[3] : 1;
String opts = (use_half) ? " -DDtype=half" : " -DDtype=float";
for (size_t ii = 0; ii < outputs.size(); ii++) for (size_t ii = 0; ii < outputs.size(); ii++)
{ {
if (inpBlob.dims == 2) if (inpBlob.dims == 2)
@ -154,8 +159,12 @@ public:
UMat src = inputs[ii].reshape(1, s.size(), &s[0]); UMat src = inputs[ii].reshape(1, s.size(), &s[0]);
UMat dst = outputs[ii].reshape(1, s.size(), &s[0]); UMat dst = outputs[ii].reshape(1, s.size(), &s[0]);
int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1); int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
String buildopt = format("-DNUM=%d", number); String buildopt = format("-DNUM=%d", number) + opts;
String kname = format("batch_norm%d", number); String kname = format("batch_norm%d", number);
if (number == 1)
buildopt += format(" -Dconvert_T=convert_%s", use_half ? "half" : "float");
else
buildopt += format(" -Dconvert_T=convert_%s%d", use_half ? "half" : "float", number);
ocl::Kernel kernel(kname.c_str(), ocl::dnn::batchnorm_oclsrc, buildopt); ocl::Kernel kernel(kname.c_str(), ocl::dnn::batchnorm_oclsrc, buildopt);
if (kernel.empty()) if (kernel.empty())
return false; return false;
@ -181,7 +190,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

View File

@ -95,7 +95,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

View File

@ -128,14 +128,14 @@ public:
for( i = 0; i < ninputs; i++ ) for( i = 0; i < ninputs; i++ )
{ {
Mat& inp = *inputs[i]; Mat& inp = *inputs[i];
CV_Assert( inp.isContinuous() && inp.type() == CV_32F && CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S) &&
inp.dims == 4 && inp.size[0] == output.size[0] && inp.dims == 4 && inp.size[0] == output.size[0] &&
inp.size[2] == output.size[2] && inp.size[2] == output.size[2] &&
inp.size[3] == output.size[3] ); inp.size[3] == output.size[3] );
nchannels += inp.size[1]; nchannels += inp.size[1];
} }
CV_Assert( nchannels == output.size[1] ); CV_Assert( nchannels == output.size[1] );
CV_Assert( output.isContinuous() && output.type() == CV_32F ); CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S) );
cc.chptrs.resize(nchannels*batchsz); cc.chptrs.resize(nchannels*batchsz);
@ -186,6 +186,7 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
@ -199,11 +200,12 @@ public:
int num_concats = total(shape(inputs[0]), 0, cAxis); int num_concats = total(shape(inputs[0]), 0, cAxis);
int offset_concat_axis = 0; int offset_concat_axis = 0;
UMat& outMat = outputs[0]; UMat& outMat = outputs[0];
String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0].type()) + String(" "); String buildopt = format(" -DDtype=%s", (use_half) ? "half" : "float");
String kname = format("concat_%s", use_half ? "half" : "float");
for (size_t i = 0; i < inputs.size(); i++) for (size_t i = 0; i < inputs.size(); i++)
{ {
ocl::Kernel kernel("concat", ocl::dnn::concat_oclsrc, buildopt); ocl::Kernel kernel(kname.c_str(), ocl::dnn::concat_oclsrc, buildopt);
if (kernel.empty()) if (kernel.empty())
return false; return false;
@ -235,7 +237,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

View File

@ -94,7 +94,7 @@ public:
CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height); CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height);
const Mat &input = *inputs[0]; const Mat &input = *inputs[0];
CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F)); CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F || input.type() == CV_16S));
for (size_t i = 0; i < inputs.size(); i++) for (size_t i = 0; i < inputs.size(); i++)
{ {
CV_Assert(inputs[i]->type() == input.type()); CV_Assert(inputs[i]->type() == input.type());
@ -288,7 +288,7 @@ public:
newActiv = true; newActiv = true;
activType = OCL4DNN_CONV_FUSED_ACTIV_NONE; activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
if (preferableTarget == DNN_TARGET_OPENCL) if (IS_DNN_OPENCL_TARGET(preferableTarget))
{ {
Ptr<PowerLayer> activ_power = activ.dynamicCast<PowerLayer>(); Ptr<PowerLayer> activ_power = activ.dynamicCast<PowerLayer>();
if (!activ_power.empty()) if (!activ_power.empty())
@ -842,6 +842,7 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
@ -860,6 +861,7 @@ public:
config.dilation = dilation; config.dilation = dilation;
config.group = inputs[0].size[1] / umat_blobs[0].size[1]; config.group = inputs[0].size[1] / umat_blobs[0].size[1];
config.bias_term = (hasBias()) ? true : false; config.bias_term = (hasBias()) ? true : false;
config.use_half = use_half;
convolutionOp = Ptr<OCL4DNNConvSpatial<float> >(new OCL4DNNConvSpatial<float>(config)); convolutionOp = Ptr<OCL4DNNConvSpatial<float> >(new OCL4DNNConvSpatial<float>(config));
} }
@ -964,7 +966,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
@ -1360,6 +1362,9 @@ public:
std::vector<UMat> outputs; std::vector<UMat> outputs;
std::vector<UMat> internals; std::vector<UMat> internals;
if (inputs_.depth() == CV_16S)
return false;
inputs_.getUMatVector(inputs); inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs); outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals); internals_.getUMatVector(internals);
@ -1450,7 +1455,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

View File

@ -307,8 +307,24 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
if (use_half)
{
std::vector<UMat> orig_inputs;
std::vector<UMat> orig_outputs;
inps.getUMatVector(orig_inputs);
outs.getUMatVector(orig_outputs);
inputs.resize(orig_inputs.size());
for (size_t i = 0; i < orig_inputs.size(); i++)
convertFp16(orig_inputs[i], inputs[i]);
}
else
{
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
}
std::vector<LabelBBox> allDecodedBBoxes; std::vector<LabelBBox> allDecodedBBoxes;
std::vector<Mat> allConfidenceScores; std::vector<Mat> allConfidenceScores;
@ -342,6 +358,12 @@ public:
{ {
// Set confidences to zeros. // Set confidences to zeros.
Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)}; Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)};
if (use_half)
{
std::vector<UMat> orig_outputs;
outs.getUMatVector(orig_outputs);
orig_outputs[0](ranges).setTo(0);
} else
outputs[0](ranges).setTo(0); outputs[0](ranges).setTo(0);
return true; return true;
} }
@ -360,9 +382,23 @@ public:
} }
CV_Assert(count == numKept); CV_Assert(count == numKept);
} }
if (use_half)
{
UMat half_umat;
convertFp16(umat, half_umat);
std::vector<UMat> orig_outputs;
outs.getUMatVector(orig_outputs);
orig_outputs.clear();
orig_outputs.push_back(half_umat);
outs.assign(orig_outputs);
} else {
outputs.clear(); outputs.clear();
outputs.push_back(umat); outputs.push_back(umat);
outs.assign(outputs); outs.assign(outputs);
}
return true; return true;
} }
#endif #endif
@ -372,7 +408,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

View File

@ -176,7 +176,7 @@ public:
{ {
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_OCL_RUN((this->preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(this->preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
func.applyOCL(inputs_arr, outputs_arr, internals_arr)) func.applyOCL(inputs_arr, outputs_arr, internals_arr))
@ -223,7 +223,12 @@ public:
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
static String oclGetTMacro(const UMat &m) static String oclGetTMacro(const UMat &m)
{ {
return String("-DT=") + ocl::typeToStr(m.type()) + String(" "); String str_name = ocl::typeToStr(m.type());
if (str_name == "short")
str_name = "half";
return format("-DT=%s -Dconvert_T=convert_%s ", str_name.c_str(), str_name.c_str());
} }
#endif #endif
@ -516,8 +521,28 @@ struct SigmoidFunctor
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{ {
// TODO: implement OCL version std::vector<UMat> inputs;
return false; std::vector<UMat> outputs;
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
String buildopt = oclGetTMacro(inputs[0]);
for (size_t i = 0; i < inputs.size(); i++)
{
UMat& src = inputs[i];
UMat& dst = outputs[i];
ocl::Kernel kernel("SigmoidForward", ocl::dnn::activations_oclsrc, buildopt);
kernel.set(0, (int)src.total());
kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));
size_t gSize = src.total();
CV_Assert(kernel.run(1, &gSize, NULL, false));
}
return true;
} }
#endif #endif
@ -561,8 +586,28 @@ struct ELUFunctor
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{ {
// TODO: implement OCL version std::vector<UMat> inputs;
return false; std::vector<UMat> outputs;
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
String buildopt = oclGetTMacro(inputs[0]);
for (size_t i = 0; i < inputs.size(); i++)
{
UMat& src = inputs[i];
UMat& dst = outputs[i];
ocl::Kernel kernel("ELUForward", ocl::dnn::activations_oclsrc, buildopt);
kernel.set(0, (int)src.total());
kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));
size_t gSize = src.total();
CV_Assert(kernel.run(1, &gSize, NULL, false));
}
return true;
} }
#endif #endif
@ -604,8 +649,28 @@ struct AbsValFunctor
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{ {
// TODO: implement OCL version std::vector<UMat> inputs;
return false; std::vector<UMat> outputs;
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
String buildopt = oclGetTMacro(inputs[0]);
for (size_t i = 0; i < inputs.size(); i++)
{
UMat& src = inputs[i];
UMat& dst = outputs[i];
ocl::Kernel kernel("AbsValForward", ocl::dnn::activations_oclsrc, buildopt);
kernel.set(0, (int)src.total());
kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));
size_t gSize = src.total();
CV_Assert(kernel.run(1, &gSize, NULL, false));
}
return true;
} }
#endif #endif

View File

@ -271,6 +271,9 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
if (inputs_.depth() == CV_16S && op != SUM)
return false;
inputs_.getUMatVector(inputs); inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs); outputs_.getUMatVector(outputs);
@ -284,10 +287,15 @@ public:
{ {
size_t localsize[] = { 128 }; size_t localsize[] = { 128 };
size_t globalsize[] = { (size_t)channels / 4 * localsize[0] }; size_t globalsize[] = { (size_t)channels / 4 * localsize[0] };
String opts;
if (inputs_.depth() == CV_16S)
opts = " -DDtype=half -DDtype4=half4 -DDtype8=half8";
else
opts = " -DDtype=float -DDtype4=float4 -DDtype8=float8";
for (int i = 0; i < (inputs.size() - 1); ++i) for (int i = 0; i < (inputs.size() - 1); ++i)
{ {
String buildopt = format("-DLOOP=%d", i); String buildopt = format("-DLOOP=%d", i) + opts;
ocl::Kernel kernel("op_sum4", ocl::dnn::eltwise_oclsrc, buildopt); ocl::Kernel kernel("op_sum4", ocl::dnn::eltwise_oclsrc, buildopt);
int idx = 0; int idx = 0;
UMat inpMat = (i == 0) ? inputs[0] : UMat(); UMat inpMat = (i == 0) ? inputs[0] : UMat();
@ -306,6 +314,9 @@ public:
} }
else else
{ {
if (inputs_.depth() == CV_16S)
return false;
float coeff1 = coeffs.empty() ? 1.f : coeffs[0]; float coeff1 = coeffs.empty() ? 1.f : coeffs[0];
float coeff2 = coeffs.empty() ? 1.f : coeffs[1]; float coeff2 = coeffs.empty() ? 1.f : coeffs[1];
UMat mul0, mul1; UMat mul0, mul1;
@ -343,7 +354,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

View File

@ -140,7 +140,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
outputs_arr.isUMatVector() && outputs_arr.isUMatVector() &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

View File

@ -64,6 +64,7 @@ public:
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
Ptr<OCL4DNNInnerProduct<float> > innerProductOp; Ptr<OCL4DNNInnerProduct<float> > innerProductOp;
std::vector<UMat> umat_blobs; std::vector<UMat> umat_blobs;
std::vector<UMat> half_blobs;
#endif #endif
FullyConnectedLayerImpl(const LayerParams& params) FullyConnectedLayerImpl(const LayerParams& params)
@ -277,6 +278,7 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
@ -293,6 +295,17 @@ public:
config.bias_term = bias; config.bias_term = bias;
config.M = outerSize; config.M = outerSize;
config.K = innerSize; config.K = innerSize;
config.use_half = use_half;
if (use_half)
{
half_blobs.resize(umat_blobs.size());
for (int i = 0; i < umat_blobs.size(); i++)
{
if (!umat_blobs[i].empty())
convertFp16(umat_blobs[i], half_blobs[i]);
}
}
innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config)); innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config));
} }
@ -309,13 +322,15 @@ public:
dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]); dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);
dstMat.setTo(0.0f); dstMat.setTo(0.0f);
if (!innerProductOp->Forward(srcMat, umat_blobs[0], (bias) ? umat_blobs[1] : UMat(), dstMat)) if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0],
(bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(),
dstMat))
{ {
ret = false; ret = false;
break; break;
} }
if (bias && (outerSize > 1)) if (!use_half && bias && (outerSize > 1))
{ {
UMat& biases = umat_blobs[1]; UMat& biases = umat_blobs[1];
cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0); cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
@ -353,7 +368,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

View File

@ -106,6 +106,7 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
@ -128,6 +129,7 @@ public:
config.height = inputs[0].size[2]; config.height = inputs[0].size[2];
config.width = inputs[0].size[3]; config.width = inputs[0].size[3];
config.norm_by_size = normBySize; config.norm_by_size = normBySize;
config.use_half = use_half;
lrnOp = Ptr<OCL4DNNLRN<float> >(new OCL4DNNLRN<float>(config)); lrnOp = Ptr<OCL4DNNLRN<float> >(new OCL4DNNLRN<float>(config));
} }
@ -146,7 +148,7 @@ public:
CV_Assert(inputs_arr.total() == outputs_arr.total()); CV_Assert(inputs_arr.total() == outputs_arr.total());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

View File

@ -102,6 +102,9 @@ public:
{ {
UMat bnorm_weight = scale.empty() ? UMat() : scale.getUMat(ACCESS_READ); UMat bnorm_weight = scale.empty() ? UMat() : scale.getUMat(ACCESS_READ);
UMat bnorm_bias = shift.empty() ? UMat() : shift.getUMat(ACCESS_READ); UMat bnorm_bias = shift.empty() ? UMat() : shift.getUMat(ACCESS_READ);
bool use_half = (inputs[0].depth() == CV_16S);
String opts = format(" -DT=%s -DT4=%s -Dconvert_T=%s", use_half ? "half" : "float",
use_half ? "half4" : "float4", use_half ? "convert_half4" : "convert_float4");
int splitDim = (acrossChannels) ? 1 : 2; int splitDim = (acrossChannels) ? 1 : 2;
for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++) for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
@ -111,12 +114,11 @@ public:
int newRows = total(shape(inpMat), 0, splitDim); int newRows = total(shape(inpMat), 0, splitDim);
MatShape s = shape(newRows, inpMat.total() / newRows); MatShape s = shape(newRows, inpMat.total() / newRows);
UMat oneMat = UMat::ones(s[1], 1, CV_32F); UMat meanMat = UMat(s[0], 1, (use_half) ? CV_16S : CV_32F);
UMat meanMat = UMat(s[0], 1, CV_32F);
UMat tmpMat = UMat(s[0], s[1], CV_32F); UMat tmpMat = UMat(s[0], s[1], CV_32F);
float alpha = 1.0f / s[1]; float alpha = 1.0f / s[1];
String buildopt = "-DNUM=4"; String buildopt = "-DNUM=4" + opts;
ocl::Kernel k("mean_fuse4", ocl::dnn::mvn_oclsrc, buildopt); ocl::Kernel k("mean_fuse4", ocl::dnn::mvn_oclsrc, buildopt);
size_t localsize[] = { 128 }; size_t localsize[] = { 128 };
size_t globalsize[] = { (size_t)s[0] / 4 * localsize[0] }; size_t globalsize[] = { (size_t)s[0] / 4 * localsize[0] };
@ -167,13 +169,14 @@ public:
int row_size = total(shape(inputs[0]), 0, splitDim); int row_size = total(shape(inputs[0]), 0, splitDim);
int plane_size = total(shape(inputs[0]), splitDim); int plane_size = total(shape(inputs[0]), splitDim);
if (normVariance && (row_size % 4 == 0) && (plane_size % 4 == 0)) if (normVariance && (row_size % 4 == 0) && (plane_size % 4 == 0))
{ return fast_forward_ocl(inputs, outputs);
bool ret = fast_forward_ocl(inputs, outputs);
return ret; if (inputs[0].depth() == CV_16S)
} return false;
UMat bnorm_weight = scale.empty() ? UMat() : scale.getUMat(ACCESS_READ); UMat bnorm_weight = scale.empty() ? UMat() : scale.getUMat(ACCESS_READ);
UMat bnorm_bias = shift.empty() ? UMat() : shift.getUMat(ACCESS_READ); UMat bnorm_bias = shift.empty() ? UMat() : shift.getUMat(ACCESS_READ);
String opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4");
for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++) for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
{ {
@ -195,7 +198,7 @@ public:
int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1); int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
size_t global[] = { (size_t)s[0], (size_t)(s[1] / number) }; size_t global[] = { (size_t)s[0], (size_t)(s[1] / number) };
String buildopt = format("-DNUM=%d", number); String buildopt = format("-DNUM=%d", number) + opts;
if (normVariance) if (normVariance)
{ {
String kname = format("calc_mean%d", number); String kname = format("calc_mean%d", number);
@ -249,7 +252,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

View File

@ -87,6 +87,9 @@ public:
std::vector<UMat> outputs; std::vector<UMat> outputs;
std::vector<UMat> internals; std::vector<UMat> internals;
if (inputs_.depth() == CV_16S)
return false;
inputs_.getUMatVector(inputs); inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs); outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals); internals_.getUMatVector(internals);
@ -162,7 +165,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

View File

@ -288,9 +288,11 @@ public:
if (!_needsPermute) if (!_needsPermute)
return false; return false;
bool use_half = (inps.depth() == CV_16S);
String opts = format("-DDtype=%s", use_half ? "half" : "float");
for (size_t i = 0; i < inputs.size(); i++) for (size_t i = 0; i < inputs.size(); i++)
{ {
ocl::Kernel kernel("permute", ocl::dnn::permute_oclsrc); ocl::Kernel kernel("permute", ocl::dnn::permute_oclsrc, opts);
kernel.set(0, (int)_count); kernel.set(0, (int)_count);
kernel.set(1, ocl::KernelArg::PtrReadOnly(inputs[i])); kernel.set(1, ocl::KernelArg::PtrReadOnly(inputs[i]));
@ -313,7 +315,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

View File

@ -147,6 +147,7 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
@ -164,6 +165,7 @@ public:
(type == AVE ? LIBDNN_POOLING_METHOD_AVE : (type == AVE ? LIBDNN_POOLING_METHOD_AVE :
LIBDNN_POOLING_METHOD_STO); LIBDNN_POOLING_METHOD_STO);
config.avePoolPaddedArea = avePoolPaddedArea; config.avePoolPaddedArea = avePoolPaddedArea;
config.use_half = use_half;
poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config)); poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
} }
@ -189,7 +191,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

View File

@ -316,6 +316,7 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
@ -340,9 +341,15 @@ public:
heights.copyTo(umat_heights); heights.copyTo(umat_heights);
} }
size_t nthreads = _layerHeight * _layerWidth; String opts;
if (use_half)
opts = "-DDtype=half -DDtype4=half4 -Dconvert_T=convert_half4";
else
opts = "-DDtype=float -DDtype4=float4 -Dconvert_T=convert_float4";
size_t nthreads = _layerHeight * _layerWidth;
ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc, opts);
ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc);
kernel.set(0, (int)nthreads); kernel.set(0, (int)nthreads);
kernel.set(1, (float)_stepX); kernel.set(1, (float)_stepX);
kernel.set(2, (float)_stepY); kernel.set(2, (float)_stepY);
@ -375,7 +382,7 @@ public:
// set the variance. // set the variance.
{ {
ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc); ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc, opts);
int offset = total(shape(outputs[0]), 2); int offset = total(shape(outputs[0]), 2);
size_t nthreads = _layerHeight * _layerWidth * _numPriors; size_t nthreads = _layerHeight * _layerWidth * _numPriors;
kernel.set(0, (int)nthreads); kernel.set(0, (int)nthreads);
@ -395,7 +402,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

View File

@ -158,6 +158,9 @@ public:
std::vector<UMat> outputs; std::vector<UMat> outputs;
std::vector<UMat> internals; std::vector<UMat> internals;
if (inputs_.depth() == CV_16S)
return false;
inputs_.getUMatVector(inputs); inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs); outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals); internals_.getUMatVector(internals);
@ -237,7 +240,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

View File

@ -127,7 +127,7 @@ public:
std::vector<UMat> outputs; std::vector<UMat> outputs;
// TODO: implement a logistic activation to classification scores. // TODO: implement a logistic activation to classification scores.
if (useLogistic) if (useLogistic || inps.depth() == CV_16S)
return false; return false;
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
@ -191,7 +191,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

View File

@ -96,9 +96,10 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0].type()) + String(" "); String buildopt= format("-DDtype=%s ", use_half ? "half" : "float");
for (size_t i = 0; i < inputs.size(); i++) for (size_t i = 0; i < inputs.size(); i++)
{ {
@ -134,7 +135,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

View File

@ -219,7 +219,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

View File

@ -181,6 +181,7 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inputs_.depth() == CV_16S);
inputs_.getUMatVector(inputs); inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs); outputs_.getUMatVector(outputs);
@ -188,6 +189,11 @@ public:
(total(shape(outputs[0]), 2) % 4 != 0)) (total(shape(outputs[0]), 2) % 4 != 0))
return false; return false;
String opts;
if (use_half)
opts = "-DDtype=half -DDtype4=half4 -DDtype8=half8";
else
opts = "-DDtype=float -DDtype4=float4 -DDtype8=float8";
const UMat& inpMat = inputs[0]; const UMat& inpMat = inputs[0];
for (size_t i = 0; i < outputs.size(); i++) for (size_t i = 0; i < outputs.size(); i++)
{ {
@ -196,7 +202,7 @@ public:
int rows = outputs[i].size[2]; int rows = outputs[i].size[2];
int cols = outputs[i].size[3]; int cols = outputs[i].size[3];
ocl::Kernel kernel("slice", ocl::dnn::slice_oclsrc); ocl::Kernel kernel("slice", ocl::dnn::slice_oclsrc, opts);
size_t local[] = { 128 }; size_t local[] = { 128 };
size_t global[] = { (size_t)groups * channels / 4 * local[0] }; size_t global[] = { (size_t)groups * channels / 4 * local[0] };
int idx = 0; int idx = 0;
@ -222,7 +228,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

View File

@ -99,15 +99,16 @@ public:
softmaxOp.release(); softmaxOp.release();
} }
bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays itns) bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
{ {
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
std::vector<UMat> internals; std::vector<UMat> internals;
inps.getUMatVector(inputs); bool use_half = (inputs_.depth() == CV_16S);
outs.getUMatVector(outputs); inputs_.getUMatVector(inputs);
itns.getUMatVector(internals); outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals);
if (softmaxOp.empty()) if (softmaxOp.empty())
{ {
@ -117,6 +118,7 @@ public:
config.axis = axisRaw; config.axis = axisRaw;
config.channels = inputs[0].size[axisRaw]; config.channels = inputs[0].size[axisRaw];
config.logsoftmax = logSoftMax; config.logsoftmax = logSoftMax;
config.use_half = use_half;
softmaxOp = Ptr<OCL4DNNSoftmax<float> >(new OCL4DNNSoftmax<float>(config)); softmaxOp = Ptr<OCL4DNNSoftmax<float> >(new OCL4DNNSoftmax<float>(config));
} }
@ -128,15 +130,13 @@ public:
return true; return true;
UMat& bufMat = internals[0]; UMat& bufMat = internals[0];
src.copyTo(dstMat);
int axis = clamp(axisRaw, src.dims); int axis = clamp(axisRaw, src.dims);
MatShape s = shape(src); MatShape s = shape(src);
size_t outerSize = total(s, 0, axis); size_t outerSize = total(s, 0, axis);
size_t channels = src.size[axis]; size_t channels = src.size[axis];
size_t innerSize = total(s, axis + 1); size_t innerSize = total(s, axis + 1);
String buildOpts = String("-DT=") + ocl::typeToStr(src.type()); String buildOpts = format("-DT=%s", use_half ? "half" : "float");
ocl::Kernel kmax, ksub, ksum, kdiv; ocl::Kernel kmax, ksub, ksum, kdiv;
if (!kmax.create("kernel_channel_max", ocl::dnn::softmax_oclsrc, buildOpts)) if (!kmax.create("kernel_channel_max", ocl::dnn::softmax_oclsrc, buildOpts))
@ -152,38 +152,31 @@ public:
if (!kdiv.create("kernel_channel_div", ocl::dnn::softmax_oclsrc, buildOpts)) if (!kdiv.create("kernel_channel_div", ocl::dnn::softmax_oclsrc, buildOpts))
return false; return false;
size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize();
size_t bufSize = internals[0].total(); size_t bufSize = internals[0].total();
size_t totalSize = src.total(); size_t totalSize = src.total();
// adjust local/global size size_t internal_globalSize[1] = { bufSize };
size_t internal_localSize[1] = { (bufSize == 1) ? 1 : wgSize }; size_t total_globalSize[1] = { totalSize };
size_t internal_globalSize[1] = { divUp(bufSize, (unsigned int)internal_localSize[0]) * internal_localSize[0] };
// adjust local/global size (total)
size_t total_localSize[1] = { (totalSize == 1) ? 1 : wgSize };
size_t total_globalSize[1] = { divUp(totalSize, (unsigned int)total_localSize[0]) * total_localSize[0] };
kmax.args((int)outerSize, (int)channels, (int)innerSize, kmax.args((int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat)); ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrReadWrite(bufMat));
if (!kmax.run(1, internal_globalSize, internal_localSize, false)) if (!kmax.run(1, internal_globalSize, NULL, false))
return false; return false;
ksub.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize, ksub.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat)); ocl::KernelArg::PtrReadOnly(bufMat),
if (!ksub.run(1, total_globalSize, total_localSize, false)) ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(dstMat));
if (!ksub.run(1, total_globalSize, NULL, false))
return false; return false;
cv::exp(dstMat, dstMat);
ksum.args((int)outerSize, (int)channels, (int)innerSize, ksum.args((int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat)); ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat));
if (!ksum.run(1, internal_globalSize, internal_localSize, false)) if (!ksum.run(1, internal_globalSize, NULL, false))
return false; return false;
kdiv.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize, kdiv.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat)); ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat));
if (!kdiv.run(1, total_globalSize, total_localSize, false)) if (!kdiv.run(1, total_globalSize, NULL, false))
return false; return false;
return true; return true;
@ -195,7 +188,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

View File

@ -8,6 +8,8 @@
#include "precomp.hpp" #include "precomp.hpp"
#include "nms.inl.hpp" #include "nms.inl.hpp"
#include <opencv2/imgproc.hpp>
namespace cv namespace cv
{ {
namespace dnn namespace dnn
@ -28,6 +30,27 @@ void NMSBoxes(const std::vector<Rect>& bboxes, const std::vector<float>& scores,
NMSFast_(bboxes, scores, score_threshold, nms_threshold, eta, top_k, indices, rectOverlap); NMSFast_(bboxes, scores, score_threshold, nms_threshold, eta, top_k, indices, rectOverlap);
} }
static inline float rotatedRectIOU(const RotatedRect& a, const RotatedRect& b)
{
std::vector<Point2f> inter;
int res = rotatedRectangleIntersection(a, b, inter);
if (inter.empty() || res == INTERSECT_NONE)
return 0.0f;
if (res == INTERSECT_FULL)
return 1.0f;
float interArea = contourArea(inter);
return interArea / (a.size.area() + b.size.area() - interArea);
}
void NMSBoxes(const std::vector<RotatedRect>& bboxes, const std::vector<float>& scores,
const float score_threshold, const float nms_threshold,
std::vector<int>& indices, const float eta, const int top_k)
{
CV_Assert(bboxes.size() == scores.size(), score_threshold >= 0,
nms_threshold >= 0, eta > 0);
NMSFast_(bboxes, scores, score_threshold, nms_threshold, eta, top_k, indices, rotatedRectIOU);
}
CV__DNN_EXPERIMENTAL_NS_END CV__DNN_EXPERIMENTAL_NS_END
}// dnn }// dnn
}// cv }// cv

View File

@ -59,7 +59,8 @@ struct OCL4DNNConvConfig
stride(1, 1), stride(1, 1),
dilation(1, 1), dilation(1, 1),
group(1), group(1),
bias_term(false) bias_term(false),
use_half(false)
{} {}
MatShape in_shape; MatShape in_shape;
MatShape out_shape; MatShape out_shape;
@ -69,6 +70,7 @@ struct OCL4DNNConvConfig
Size dilation; Size dilation;
int group; // = 1; int group; // = 1;
bool bias_term; // = false; bool bias_term; // = false;
bool use_half; // = false;
}; };
typedef enum { typedef enum {
@ -272,6 +274,8 @@ class OCL4DNNConvSpatial
int32_t group_; int32_t group_;
bool bias_term_; bool bias_term_;
UMat swizzled_weights_umat; UMat swizzled_weights_umat;
UMat weights_half;
UMat bias_half;
UMat bottom_data2_; UMat bottom_data2_;
int32_t bottom_index_; int32_t bottom_index_;
@ -327,6 +331,7 @@ class OCL4DNNConvSpatial
ocl4dnnFusedActiv_t fused_activ_; ocl4dnnFusedActiv_t fused_activ_;
float power_; float power_;
bool fused_eltwise_; bool fused_eltwise_;
bool use_half_;
}; };
typedef enum { typedef enum {
@ -345,7 +350,8 @@ struct OCL4DNNPoolConfig
channels(0), channels(0),
pool_method(LIBDNN_POOLING_METHOD_MAX), pool_method(LIBDNN_POOLING_METHOD_MAX),
global_pooling(false), global_pooling(false),
avePoolPaddedArea(false) avePoolPaddedArea(true),
use_half(false)
{} {}
MatShape in_shape; MatShape in_shape;
MatShape out_shape; MatShape out_shape;
@ -358,6 +364,7 @@ struct OCL4DNNPoolConfig
ocl4dnnPoolingMethod_t pool_method; // = LIBDNN_POOLING_METHOD_MAX; ocl4dnnPoolingMethod_t pool_method; // = LIBDNN_POOLING_METHOD_MAX;
bool global_pooling; // = false; bool global_pooling; // = false;
bool avePoolPaddedArea; bool avePoolPaddedArea;
bool use_half;
}; };
template<typename Dtype> template<typename Dtype>
@ -391,13 +398,14 @@ class OCL4DNNPool
int32_t pooled_height_; int32_t pooled_height_;
int32_t pooled_width_; int32_t pooled_width_;
bool avePoolPaddedArea; bool avePoolPaddedArea;
bool use_half;
}; };
struct OCL4DNNInnerProductConfig struct OCL4DNNInnerProductConfig
{ {
OCL4DNNInnerProductConfig() : OCL4DNNInnerProductConfig() :
num_output(0), M(0), K(0), num_output(0), M(0), K(0),
bias_term(false), transpose(false), phase_test(true) bias_term(false), transpose(false), phase_test(true), use_half(false)
{} {}
int num_output; int num_output;
int M; int M;
@ -405,6 +413,7 @@ struct OCL4DNNInnerProductConfig
bool bias_term; bool bias_term;
bool transpose; // = false; bool transpose; // = false;
bool phase_test; // = true; bool phase_test; // = true;
bool use_half; // = false;
}; };
template<typename Dtype> template<typename Dtype>
@ -428,6 +437,7 @@ class OCL4DNNInnerProduct
bool transpose_; bool transpose_;
bool image_copied_; bool image_copied_;
bool phase_test_; bool phase_test_;
bool use_half_;
}; };
typedef enum { typedef enum {
@ -441,7 +451,7 @@ struct OCL4DNNLRNConfig
lrn_type(LRNParameter_NormRegion_ACROSS_CHANNELS), lrn_type(LRNParameter_NormRegion_ACROSS_CHANNELS),
phase_test(true), phase_test(true),
local_size(0), alpha(0.f), beta(0.f), k(0.f), norm_by_size(false), local_size(0), alpha(0.f), beta(0.f), k(0.f), norm_by_size(false),
batch_size(0), channels(0), height(0), width(0) batch_size(0), channels(0), height(0), width(0), use_half(false)
{} {}
MatShape in_shape; MatShape in_shape;
LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type; LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type;
@ -455,6 +465,7 @@ struct OCL4DNNLRNConfig
int32_t channels; int32_t channels;
int32_t height; int32_t height;
int32_t width; int32_t width;
bool use_half;
}; };
template<typename Dtype> template<typename Dtype>
@ -477,16 +488,18 @@ class OCL4DNNLRN
int32_t height_; int32_t height_;
int32_t width_; int32_t width_;
bool norm_by_size_; bool norm_by_size_;
bool use_half_;
}; };
struct OCL4DNNSoftmaxConfig struct OCL4DNNSoftmaxConfig
{ {
OCL4DNNSoftmaxConfig() : axis(0), channels(0), logsoftmax(false) OCL4DNNSoftmaxConfig() : axis(0), channels(0), logsoftmax(false), use_half(false)
{} {}
MatShape in_shape; MatShape in_shape;
int axis; int axis;
int channels; int channels;
bool logsoftmax; bool logsoftmax;
bool use_half;
}; };
template<typename Dtype> template<typename Dtype>
@ -506,6 +519,7 @@ class OCL4DNNSoftmax
bool use_slm_; bool use_slm_;
bool log_softmax_; bool log_softmax_;
UMat scale_data_; UMat scale_data_;
bool use_half_;
}; };
}}} // namespace cv::dnn::ocl4dnn }}} // namespace cv::dnn::ocl4dnn

View File

@ -48,6 +48,12 @@
namespace cv { namespace dnn { namespace ocl4dnn { namespace cv { namespace dnn { namespace ocl4dnn {
enum gemm_data_type_t
{
TYPE_FLOAT = 1,
TYPE_HALF = 2
};
// Create and copy buffer to image for GEMM's matrix A and B. // Create and copy buffer to image for GEMM's matrix A and B.
// Will return image to caller if the input image is NULL. Otherwise, // Will return image to caller if the input image is NULL. Otherwise,
// will use the image directly. It's caller's responsibility to // will use the image directly. It's caller's responsibility to
@ -60,6 +66,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
int width, int ld) int width, int ld)
{ {
ocl::Image2D image; ocl::Image2D image;
String opts = format("-DTYPE=%d", TYPE_FLOAT);
if (!is_matrix_a && transpose) if (!is_matrix_a && transpose)
{ {
@ -73,7 +80,8 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
UMat mat(height, width, CV_32FC1); UMat mat(height, width, CV_32FC1);
image = ocl::Image2D(mat); image = ocl::Image2D(mat);
ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_transpose_float", ocl::dnn::gemm_image_oclsrc); ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_transpose_float",
ocl::dnn::gemm_image_oclsrc, opts);
size_t global_copy[2]; size_t global_copy[2];
global_copy[0] = width; global_copy[0] = width;
@ -96,7 +104,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
image = ocl::Image2D(mat); image = ocl::Image2D(mat);
ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_no_transpose_float", ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_no_transpose_float",
ocl::dnn::gemm_image_oclsrc); ocl::dnn::gemm_image_oclsrc, opts);
size_t global_copy[2]; size_t global_copy[2];
global_copy[0] = padded_width; global_copy[0] = padded_width;
@ -129,7 +137,7 @@ enum gemm_type_t
GEMM_TYPE_FAST_IMAGE_32_1, GEMM_TYPE_FAST_IMAGE_32_1,
GEMM_TYPE_FAST_IMAGE_32_2, GEMM_TYPE_FAST_IMAGE_32_2,
GEMM_TYPE_FAST_IMAGE_B_IMAGE, GEMM_TYPE_FAST_IMAGE_B_IMAGE,
GEMM_TYPE_MAX GEMM_TYPE_FAST_BUFFER
}; };
template<typename Dtype> template<typename Dtype>
@ -145,6 +153,8 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
CHECK_EQ(gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_32_2 || CHECK_EQ(gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_32_2 ||
gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE, true) << "Invalid fast image gemm type." << std::endl; gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE, true) << "Invalid fast image gemm type." << std::endl;
bool halfPrecisionMode = (A.depth() == CV_16S);
if (is_image_a) if (is_image_a)
{ {
CHECK_EQ(offA, 0) << "Invalid input image offset." << std::endl; CHECK_EQ(offA, 0) << "Invalid input image offset." << std::endl;
@ -157,6 +167,7 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
return false; return false;
} }
String opts = format("-DTYPE=%d", halfPrecisionMode ? TYPE_HALF : TYPE_FLOAT);
int widthA = (TransA == CblasNoTrans) ? K : M; int widthA = (TransA == CblasNoTrans) ? K : M;
int heightA = (TransA == CblasNoTrans) ? M : K; int heightA = (TransA == CblasNoTrans) ? M : K;
int widthB = (TransB == CblasNoTrans) ? N : K; int widthB = (TransB == CblasNoTrans) ? N : K;
@ -178,7 +189,7 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
int blockC_width = blocksize; int blockC_width = blocksize;
int blockC_height = blocksize; int blockC_height = blocksize;
int use_buffer_indicator = 8; int use_buffer_indicator = (halfPrecisionMode) ? 16 : 8;
// To fix the edge problem caused by the sub group block read. // To fix the edge problem caused by the sub group block read.
// we have to pad the image if it's not multiple of tile. // we have to pad the image if it's not multiple of tile.
// just padding one line is enough as the sub group block read // just padding one line is enough as the sub group block read
@ -221,9 +232,13 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
else else
kernel_name += "1"; kernel_name += "1";
if (halfPrecisionMode) {
kernel_name += "_half";
} else {
kernel_name += "_float"; kernel_name += "_float";
}
ocl::Kernel oclk_gemm_float(kernel_name.c_str(), ocl::dnn::gemm_image_oclsrc); ocl::Kernel oclk_gemm_float(kernel_name.c_str(), ocl::dnn::gemm_image_oclsrc, opts);
if (oclk_gemm_float.empty()) if (oclk_gemm_float.empty())
return false; return false;
@ -255,6 +270,10 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
bool padding_A = false; bool padding_A = false;
bool padding_B = false; bool padding_B = false;
if (halfPrecisionMode && is_image_b) {
padding_A = true;
}
if (!is_image_a && !is_image_b) if (!is_image_a && !is_image_b)
{ {
if (M * K < N * K) if (M * K < N * K)
@ -265,6 +284,7 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
if (!is_image_a) if (!is_image_a)
{ {
if (!halfPrecisionMode)
ImA = ocl4dnnGEMMCopyBufferToImage<Dtype>(A, blockA_offset, ImA = ocl4dnnGEMMCopyBufferToImage<Dtype>(A, blockA_offset,
true, TransA != CblasNoTrans, true, TransA != CblasNoTrans,
padding_A, imageA_h, imageA_w, padding_A, imageA_h, imageA_w,
@ -272,6 +292,7 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
} }
if (!is_image_b) if (!is_image_b)
{ {
if (!halfPrecisionMode)
ImB = ocl4dnnGEMMCopyBufferToImage<Dtype>(B, blockB_offset, ImB = ocl4dnnGEMMCopyBufferToImage<Dtype>(B, blockB_offset,
false, false, false, false,
padding_B, imageB_h, imageB_w, padding_B, imageB_h, imageB_w,
@ -283,7 +304,8 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
if (!is_image_a) if (!is_image_a)
{ {
bool padding; bool padding;
padding = !is_image_b; padding = !is_image_b || halfPrecisionMode;
if (!halfPrecisionMode)
ImA = ocl4dnnGEMMCopyBufferToImage<Dtype>(A, blockA_offset, ImA = ocl4dnnGEMMCopyBufferToImage<Dtype>(A, blockA_offset,
true, TransA != CblasNoTrans, true, TransA != CblasNoTrans,
padding, imageA_h, imageA_w, padding, imageA_h, imageA_w,
@ -292,8 +314,10 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
if (!is_image_b && (K % use_buffer_indicator != 0)) if (!is_image_b && (K % use_buffer_indicator != 0))
{ {
if (!halfPrecisionMode)
ImB = ocl4dnnGEMMCopyBufferToImage<Dtype>(B, blockB_offset, ImB = ocl4dnnGEMMCopyBufferToImage<Dtype>(B, blockB_offset,
false, true, false, imageB_h, imageB_w, false, true, false,
imageB_h, imageB_w,
blockB_height, blockB_width, ldB); blockB_height, blockB_width, ldB);
} }
} }
@ -301,14 +325,27 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
size_t global[2]; size_t global[2];
if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE) if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE)
{ {
if (halfPrecisionMode) {
global[0] = (size_t)( blockC_width + 15 ) & ~15;
} else {
global[0] = (size_t)( blockC_width + 7 ) & ~7; global[0] = (size_t)( blockC_width + 7 ) & ~7;
}
} else {
if (halfPrecisionMode) {
global[0] = (size_t)( (blockC_width / 2 ) + 15 ) ^ ~15;
} else { } else {
global[0] = (size_t)( (blockC_width / 2 ) + 7 ) ^ ~7; global[0] = (size_t)( (blockC_width / 2 ) + 7 ) ^ ~7;
} }
}
global[1] = (size_t)(blockC_height + 31) / 32; global[1] = (size_t)(blockC_height + 31) / 32;
size_t local[2]; size_t local[2];
if (halfPrecisionMode)
{
local[0] = 16;
} else {
local[0] = 8; local[0] = 8;
}
local[1] = 1; local[1] = 1;
cl_uint arg_idx = 0; cl_uint arg_idx = 0;
@ -385,6 +422,101 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
return true; return true;
} }
template<typename Dtype>
static bool ocl4dnnFastBufferGEMM(const CBLAS_TRANSPOSE TransA,
const CBLAS_TRANSPOSE TransB, const int32_t M,
const int32_t N, const int32_t K, const Dtype alpha,
const UMat A, const int32_t offA, const UMat B,
const int32_t offB, const Dtype beta, UMat C,
const int32_t offC, enum gemm_type_t gemm_type)
{
CHECK_EQ(gemm_type == GEMM_TYPE_FAST_BUFFER, true)
<< "Invalid fast buffer gemm type." << std::endl;
bool halfPrecisionMode = (A.depth() == CV_16S);
size_t sub_group_size = 8;
bool is_small_batch = (M == 2 || M == 4 || M == 8);
String kernel_name("gemm_buffer_");
if (TransA == CblasNoTrans && TransB == CblasNoTrans) {
kernel_name += "NN";
if (halfPrecisionMode) {
sub_group_size = 16;
}
} else if (TransA == CblasNoTrans && TransB != CblasNoTrans) {
if (M == 2)
kernel_name +="NT_M_2";
else if (M == 4)
kernel_name +="NT_M_4";
else if (M == 8)
kernel_name +="NT_M_8";
else
kernel_name += "NT";
}
if (halfPrecisionMode) {
kernel_name += "_half";
} else {
kernel_name += "_float";
}
String opts = format("-DTYPE=%d", halfPrecisionMode ? TYPE_HALF : TYPE_FLOAT);
ocl::Kernel oclk_gemm_float(kernel_name.c_str(), ocl::dnn::gemm_buffer_oclsrc, opts);
size_t local[2] = {};
size_t global[2] = {};
if (TransA == CblasNoTrans && TransB != CblasNoTrans && is_small_batch) {
if (M == 8)
local[0] = 16;
else if (M == 4)
local[0] = 32;
else
local[0] = 64;
local[1] = 1;
if (M == 8)
global[0] = N * local[0];
else
global[0] = (N + 3) / 4 * local[0];
global[1] = 1;
} else {
size_t lx = sub_group_size;
size_t ly = (TransB != CblasNoTrans && TransA == CblasNoTrans && halfPrecisionMode) ? 2 : 4;
int dx = (TransB != CblasNoTrans && TransA == CblasNoTrans) ? 1 : 4;
int dy = 8;
size_t gx = (size_t)(N + dx - 1) / dx;
size_t gy = (size_t)(M + dy - 1) / dy;
global[0] = (gx + lx - 1) / lx * lx;
global[1] = (gy + ly - 1) / ly * ly;
local[0] = lx;
local[1] = ly;
}
int arg_idx = 0;
oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrReadOnly(A));
oclk_gemm_float.set(arg_idx++, offA);
oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrReadOnly(B));
oclk_gemm_float.set(arg_idx++, offB);
oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrWriteOnly(C));
oclk_gemm_float.set(arg_idx++, offC);
oclk_gemm_float.set(arg_idx++, M);
oclk_gemm_float.set(arg_idx++, N);
oclk_gemm_float.set(arg_idx++, K);
oclk_gemm_float.set(arg_idx++, (float)alpha);
oclk_gemm_float.set(arg_idx++, (float)beta);
bool ret;
if (TransB == CblasNoTrans || TransA != CblasNoTrans) {
int stride = 256;
for (int start_index = 0; start_index < K; start_index += stride) {
oclk_gemm_float.set(arg_idx, start_index);
ret = oclk_gemm_float.run(2, global, local, false);
}
} else {
ret = oclk_gemm_float.run(2, global, local, false);
}
return ret;
}
template<typename Dtype> template<typename Dtype>
bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB, bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB,
const int32_t M, const int32_t N, const int32_t K, const int32_t M, const int32_t N, const int32_t K,
@ -392,7 +524,8 @@ bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB,
const UMat B_image, UMat C, const UMat B_image, UMat C,
const size_t max_image_size) const size_t max_image_size)
{ {
gemm_type_t gemm_type = GEMM_TYPE_FAST_IMAGE_32_1; bool halfPrecisionMode = (A.depth() == CV_16S);
gemm_type_t gemm_type = halfPrecisionMode ? GEMM_TYPE_FAST_BUFFER : GEMM_TYPE_FAST_IMAGE_32_1;
if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 ||
gemm_type == GEMM_TYPE_FAST_IMAGE_32_2) gemm_type == GEMM_TYPE_FAST_IMAGE_32_2)
@ -409,6 +542,11 @@ bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB,
GEMM_TYPE_FAST_IMAGE_B_IMAGE, GEMM_TYPE_FAST_IMAGE_B_IMAGE,
max_image_size); max_image_size);
} }
else if (gemm_type == GEMM_TYPE_FAST_BUFFER)
{
return ocl4dnnFastBufferGEMM<Dtype>(CblasNoTrans, TransB, M, N, K,
1.f, A, 0, B, 0, 0.f, C, 0, gemm_type);
}
return false; return false;
} }
@ -436,10 +574,17 @@ bool ocl4dnnGEMV<float>(const CBLAS_TRANSPOSE TransA,
const int32_t offy) const int32_t offy)
{ {
bool ret = false; bool ret = false;
bool use_half = (A.depth() == CV_16S);
String opts;
if (use_half)
opts = format("-DDtype=%s -DDtype4=%s -Dconvert_Dtype=convert_%s", "half", "half4", "half");
else
opts = format("-DDtype=%s -DDtype4=%s -Dconvert_Dtype=convert_%s", "float", "float4", "float");
if (TransA == CblasNoTrans) if (TransA == CblasNoTrans)
{ {
ocl::Kernel k(CL_KERNEL_SELECT("matvec_mul4"), cv::ocl::dnn::matvec_mul_oclsrc); String kname = format("matvec_mul4_%s", use_half ? "half" : "float");
ocl::Kernel k(kname.c_str(), cv::ocl::dnn::matvec_mul_oclsrc, opts);
if (k.empty()) if (k.empty())
return false; return false;
@ -469,7 +614,8 @@ bool ocl4dnnGEMV<float>(const CBLAS_TRANSPOSE TransA,
if ((row_size % 4) != 0 && ret) if ((row_size % 4) != 0 && ret)
{ {
ocl::Kernel k_1(CL_KERNEL_SELECT("matvec_mul1"), cv::ocl::dnn::matvec_mul_oclsrc); String kname = format("matvec_mul1_%s", use_half ? "half" : "float");
ocl::Kernel k_1(kname.c_str(), cv::ocl::dnn::matvec_mul_oclsrc, opts);
size_t localsize[] = { 128 }; size_t localsize[] = { 128 };
size_t globalsize[] = { row_size % 4 * localsize[0] }; size_t globalsize[] = { row_size % 4 * localsize[0] };
uint row_offset = row_size - (row_size % 4); uint row_offset = row_size - (row_size % 4);
@ -499,7 +645,15 @@ bool ocl4dnnAXPY(const int32_t N, const Dtype alpha,
const UMat X, const int32_t offX, UMat Y, const UMat X, const int32_t offX, UMat Y,
const int32_t offY) const int32_t offY)
{ {
ocl::Kernel oclk_axpy(CL_KERNEL_SELECT("axpy"), cv::ocl::dnn::math_oclsrc); bool use_half = (X.depth() == CV_16S);
String opts;
if (use_half)
opts = "-DDtype=half -DDtype4=half4 -Dconvert_Dtype=convert_half";
else
opts = "-DDtype=float -DDtype4=float4 -Dconvert_Dtype=convert_float";
String kname = format("axpy_%s", use_half ? "half" : "float");
ocl::Kernel oclk_axpy(kname.c_str(), cv::ocl::dnn::math_oclsrc, opts);
if (oclk_axpy.empty()) if (oclk_axpy.empty())
return false; return false;

View File

@ -54,6 +54,7 @@
#include "opencl_kernels_dnn.hpp" #include "opencl_kernels_dnn.hpp"
#include "../include/math_functions.hpp" #include "../include/math_functions.hpp"
#include "../include/default_kernel_config.hpp" #include "../include/default_kernel_config.hpp"
#include "opencv2/dnn/shape_utils.hpp"
#if defined WIN32 || defined _WIN32 #if defined WIN32 || defined _WIN32
#include <windows.h> #include <windows.h>
@ -85,6 +86,7 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
max_value_ = 0; max_value_ = 0;
prev_kernel_type_ = -1; prev_kernel_type_ = -1;
tuned_ = false; tuned_ = false;
use_half_ = config.use_half;
// assumption: spatial dimension is 2. // assumption: spatial dimension is 2.
kernel_h_ = config.kernel.height; kernel_h_ = config.kernel.height;
@ -204,9 +206,30 @@ void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bo
return; return;
} }
typedef enum {
TYPE_FLOAT = 1,
TYPE_HALF = 2
} ocl4dnnConvSpatialType_t;
template<typename Dtype> template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::collectCommonInformation() void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
{ {
if (use_half_)
{
addDef("TYPE", TYPE_HALF);
addDef("Dtype", "half");
addDef("Dtype2", "half2");
addDef("Dtype4", "half4");
addDef("Dtype8", "half8");
addDef("Dtype16", "half16");
addDef("as_Dtype", "as_half");
addDef("as_Dtype2", "as_half2");
addDef("as_Dtype4", "as_half4");
addDef("as_Dtype8", "as_half8");
}
else
{
addDef("TYPE", TYPE_FLOAT);
addDef("Dtype", "float"); addDef("Dtype", "float");
addDef("Dtype2", "float2"); addDef("Dtype2", "float2");
addDef("Dtype4", "float4"); addDef("Dtype4", "float4");
@ -217,6 +240,7 @@ void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
addDef("as_Dtype4", "as_float4"); addDef("as_Dtype4", "as_float4");
addDef("as_Dtype8", "as_float8"); addDef("as_Dtype8", "as_float8");
} }
}
typedef enum { typedef enum {
KERNEL_TYPE_INTEL_IDLF = 2, KERNEL_TYPE_INTEL_IDLF = 2,
@ -477,10 +501,16 @@ bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
fused_eltwise_ = false; fused_eltwise_ = false;
} }
prepareKernel(bottom, top, weight, bias, numImages); if (use_half_ && bias_half.empty() && !bias.empty())
convertFp16((UMat&)bias, bias_half);
if (use_half_ && weights_half.empty())
convertFp16((UMat&)weight, weights_half);
prepareKernel(bottom, top, weight, (use_half_) ? bias_half : bias, numImages);
if (bestKernelConfig.empty()) if (bestKernelConfig.empty())
return false; return false;
return convolve(bottom, top, weight, bias, numImages, bestKernelConfig); return convolve(bottom, top, weight, (use_half_) ? bias_half : bias, numImages, bestKernelConfig);
} }
template<typename Dtype> template<typename Dtype>
@ -556,6 +586,12 @@ std::string OCL4DNNConvSpatial<Dtype>::generateSpecificKey(int32_t type, int32_t
<< "_" << blockWidth << "_" << blockWidth
<< "_" << blockHeight << "_" << blockHeight
<< "_" << blockDepth; << "_" << blockDepth;
if (!use_half_)
keyBuilder << "_float";
else
keyBuilder << "_half";
return keyBuilder.str(); return keyBuilder.str();
} }
@ -637,9 +673,13 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
if (swizzled_weights_umat.empty()) if (swizzled_weights_umat.empty())
swizzled_weights_umat.create(1, (int)alignSize(num_output_, 16) * channels_ * swizzled_weights_umat.create(1, (int)alignSize(num_output_, 16) * channels_ *
kernel_h_ * (int)alignSize(kernel_w_, 2), CV_32FC1); kernel_h_ * (int)alignSize(kernel_w_, 2),
(use_half_) ? CV_16SC1 : CV_32FC1);
UMat swizzled_weights_tmp;
if (use_half_)
swizzled_weights_tmp.create(shape(swizzled_weights_umat), CV_32F);
ocl::Queue queue = ocl::Queue::getDefault();
if (!interleave) { if (!interleave) {
cl_uint argIdx = 0; cl_uint argIdx = 0;
int32_t channels = channels_ / group_; int32_t channels = channels_ / group_;
@ -650,6 +690,9 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
return false; return false;
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
if (use_half_)
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_tmp));
else
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat)); oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat));
oclk_copy_weight.set(argIdx++, kernel_w_); oclk_copy_weight.set(argIdx++, kernel_w_);
oclk_copy_weight.set(argIdx++, kernel_h_); oclk_copy_weight.set(argIdx++, kernel_h_);
@ -669,7 +712,11 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
// assumption: kernel dimesion is 2 // assumption: kernel dimesion is 2
Mat weightMat = weight.getMat(ACCESS_READ); Mat weightMat = weight.getMat(ACCESS_READ);
Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>(); Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>();
Mat swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE); Mat swizzledWeightMat;
if (use_half_)
swizzledWeightMat = swizzled_weights_tmp.getMat(ACCESS_WRITE);
else
swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE);
Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr<float>(); Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr<float>();
int interleavedRows = (kernel_w_ / 2) * 2; int interleavedRows = (kernel_w_ / 2) * 2;
@ -694,6 +741,10 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
rowAlignment); rowAlignment);
free(tmpSwizzledWeight); free(tmpSwizzledWeight);
} }
if (use_half_)
convertFp16(swizzled_weights_tmp, swizzled_weights_umat);
return true; return true;
} }
@ -727,9 +778,10 @@ void OCL4DNNConvSpatial<float>::CreateSubBuffer(const UMat& buffer, UMat& sub_bu
cl_mem sub_mem; cl_mem sub_mem;
cl_buffer_region region; cl_buffer_region region;
cl_int err; cl_int err;
size_t element_size = (use_half_) ? sizeof(short) : sizeof(float);
region.origin = offset * sizeof(float); region.origin = offset * element_size;
region.size = size * sizeof(float); region.size = size * element_size;
sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ), sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ),
write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY, write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY,
CL_BUFFER_CREATE_TYPE_REGION, &region, &err); CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
@ -739,8 +791,9 @@ void OCL4DNNConvSpatial<float>::CreateSubBuffer(const UMat& buffer, UMat& sub_bu
return; return;
} }
int step = sizeof(float), rows = size, cols = 1; int step = element_size, rows = size, cols = 1;
ocl::convertFromBuffer(sub_mem, step, rows, cols, CV_32FC1, sub_buffer); ocl::convertFromBuffer(sub_mem, step, rows, cols,
(use_half_) ? CV_16SC1 : CV_32FC1, sub_buffer);
//decrease ocl mem refcount //decrease ocl mem refcount
clReleaseMemObject(sub_mem); clReleaseMemObject(sub_mem);
@ -978,6 +1031,9 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
cl_uint argIdx = 0; cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
if (use_half_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
else
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
if (bias_term_) if (bias_term_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
@ -1018,6 +1074,9 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, image_offset); kernel.set(argIdx++, image_offset);
if (use_half_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
else
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
kernel.set(argIdx++, kernel_offset); kernel.set(argIdx++, kernel_offset);
if (bias_term_) if (bias_term_)
@ -1132,14 +1191,27 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
return false; return false;
int32_t sz[4] = {numImages, num_output_, output_h_, output_w_}; int32_t sz[4] = {numImages, num_output_, output_h_, output_w_};
top.zeros(4, sz, CV_32FC1); top.zeros(4, sz, (use_half_) ? CV_16SC1 : CV_32FC1);
bool saved_tuned = tuned_; bool saved_tuned = tuned_;
tuned_ = false; tuned_ = false;
convolve(bottom, top, weight, bias, numImages, config); convolve(bottom, top, weight, bias, numImages, config);
tuned_ = saved_tuned; tuned_ = saved_tuned;
float *data = (float *)top.getMat(ACCESS_READ).ptr<float>(); UMat new_top, new_verify_top;
float *verify_data = (float *)verifyTop.getMat(ACCESS_READ).ptr<float>(); float *data, *verify_data;
if (use_half_)
{
convertFp16(top, new_top);
convertFp16(verifyTop, new_verify_top);
data = (float *)new_top.getMat(ACCESS_READ).ptr<float>();
verify_data = (float *)new_verify_top.getMat(ACCESS_READ).ptr<float>();
}
else
{
data = (float *)top.getMat(ACCESS_READ).ptr<float>();
verify_data = (float *)verifyTop.getMat(ACCESS_READ).ptr<float>();
}
for (int32_t n = 0; n < num_; ++n) { for (int32_t n = 0; n < num_; ++n) {
for (int32_t g = 0; g < group_; ++g) { for (int32_t g = 0; g < group_; ++g) {
@ -1148,9 +1220,19 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
for (int h = 0; h < output_h_ && !verificationFail; h++) for (int h = 0; h < output_h_ && !verificationFail; h++)
for (int w = 0; w < output_w_; w++) { for (int w = 0; w < output_w_; w++) {
size_t offset = output_image_offset + out_ch * output_w_ * output_h_ + h * output_w_ + w; size_t offset = output_image_offset + out_ch * output_w_ * output_h_ + h * output_w_ + w;
if (fabs(data[offset] - verify_data[offset]) > 0.1 * fabs(verify_data[offset]) &&
!(fabs(verify_data[offset]) < 1.e-3 && float error_factor = fabs(data[offset] - verify_data[offset]);
fabs(data[offset] - verify_data[offset]) < 1.e-4)) if (use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) &&
error_factor > 0.04 && !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4))
{
dbgPrint(printf("test verification failed @ image %d group %d"
"out_ch %d h %d w %d got %G expected %G\n",
n, g, out_ch, h, w, data[offset], verify_data[offset]));
verificationFail = 1;
goto out;
}
else if (!use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) &&
!(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4))
{ {
dbgPrint(printf("test verification failed @ image %d group %d" dbgPrint(printf("test verification failed @ image %d group %d"
"out_ch %d h %d w %d got %G expected %G\n", "out_ch %d h %d w %d got %G expected %G\n",
@ -1719,15 +1801,16 @@ void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
if (loadTunedConfig()) // check external storage if (loadTunedConfig()) // check external storage
return; return;
UMat benchData(1, numImages * top_dim_, CV_32FC1); UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1);
calculateBenchmark(bottom, benchData, (use_half_) ? weights_half : weight, bias, numImages);
if (force_auto_tuning_) if (force_auto_tuning_)
{ {
calculateBenchmark(bottom, benchData, weight, bias, numImages);
setupConvolution(bottom, top, weight, bias, numImages, benchData); setupConvolution(bottom, top, weight, bias, numImages, benchData);
} }
else else
{ {
calculateBenchmark(bottom, benchData, weight, bias, numImages);
useFirstAvailable(bottom, top, weight, bias, numImages, benchData); useFirstAvailable(bottom, top, weight, bias, numImages, benchData);
} }
cacheTunedConfig(); cacheTunedConfig();

View File

@ -56,6 +56,7 @@ OCL4DNNInnerProduct<Dtype>::OCL4DNNInnerProduct(OCL4DNNInnerProductConfig config
K_ = config.K; K_ = config.K;
phase_test_ = config.phase_test; phase_test_ = config.phase_test;
image_copied_ = false; image_copied_ = false;
use_half_ = config.use_half;
} }
template<typename Dtype> template<typename Dtype>
@ -89,13 +90,24 @@ bool OCL4DNNInnerProduct<Dtype>::Forward(const UMat& bottom,
if (M_ <= max_image_size && if (M_ <= max_image_size &&
N_ <= max_image_size && N_ <= max_image_size &&
K_ <= max_image_size && K_ <= max_image_size &&
cv::traits::Depth<Dtype>::value == CV_32F &&
ocl::Device::getDefault().intelSubgroupsSupport()) ocl::Device::getDefault().intelSubgroupsSupport())
{ {
ret = ocl4dnnGEMMCommon<Dtype>(transpose_ ? CblasNoTrans : CblasTrans, ret = ocl4dnnGEMMCommon<Dtype>(transpose_ ? CblasNoTrans : CblasTrans,
M_, N_, K_, bottom, weight, UMat(), top, M_, N_, K_, bottom, weight, UMat(), top,
max_image_size); max_image_size);
} }
if (use_half_ && bias_term_)
{
UMat biasOneMat = UMat::ones(M_, 1, CV_32F);
UMat newbias, tmpTop;
convertFp16(bias, newbias);
convertFp16(top, tmpTop);
cv::gemm(biasOneMat, newbias, 1, tmpTop, 1, tmpTop, 0);
convertFp16(tmpTop, top);
}
return ret; return ret;
} }
} }

View File

@ -61,6 +61,7 @@ OCL4DNNLRN<Dtype>::OCL4DNNLRN(OCL4DNNLRNConfig config)
channels_ = config.channels; channels_ = config.channels;
height_ = config.height; height_ = config.height;
width_ = config.width; width_ = config.width;
use_half_ = config.use_half;
} }
template<typename Dtype> template<typename Dtype>
@ -97,8 +98,10 @@ bool OCL4DNNLRN<Dtype>::crossChannelForward(const UMat& bottom, UMat& top)
int32_t n_threads = num_ * height_ * width_; int32_t n_threads = num_ * height_ * width_;
size_t global_work_size_[1] = {(size_t)n_threads}; size_t global_work_size_[1] = {(size_t)n_threads};
String opts = clOptionSupport("-cl-no-subgroup-ifp") ? " -cl-no-subgroup-ifp " : ""; String opts = clOptionSupport("-cl-no-subgroup-ifp") ? " -cl-no-subgroup-ifp " : "";
opts += format("-D Dtype=%s", (use_half_) ? "half" : "float");
ocl::Kernel oclk_lrn_fill; ocl::Kernel oclk_lrn_fill;
if (!oclk_lrn_fill.create(CL_KERNEL_SELECT("lrn_full_no_scale"), ocl::dnn::ocl4dnn_lrn_oclsrc, opts)) String kname = format("lrn_full_no_scale_%s", (use_half_) ? "half" : "float");
if (!oclk_lrn_fill.create(kname.c_str(), ocl::dnn::ocl4dnn_lrn_oclsrc, opts))
return false; return false;
oclk_lrn_fill.set(argIdx++, n_threads); oclk_lrn_fill.set(argIdx++, n_threads);

View File

@ -56,6 +56,7 @@ OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
channels_ = config.channels; channels_ = config.channels;
pool_method_ = config.pool_method; pool_method_ = config.pool_method;
avePoolPaddedArea = config.avePoolPaddedArea; avePoolPaddedArea = config.avePoolPaddedArea;
use_half = config.use_half;
for (int i = 0; i < spatial_dims; ++i) for (int i = 0; i < spatial_dims; ++i)
{ {
@ -105,12 +106,15 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
case LIBDNN_POOLING_METHOD_MAX: case LIBDNN_POOLING_METHOD_MAX:
{ {
bool haveMask = !top_mask.empty(); bool haveMask = !top_mask.empty();
String kname = haveMask ? "max_pool_forward_mask" : "max_pool_forward";
kname += (use_half) ? "_half" : "_float";
ocl::Kernel oclk_max_pool_forward( ocl::Kernel oclk_max_pool_forward(
haveMask ? CL_KERNEL_SELECT("max_pool_forward_mask") : CL_KERNEL_SELECT("max_pool_forward"), kname.c_str(),
ocl::dnn::ocl4dnn_pooling_oclsrc, ocl::dnn::ocl4dnn_pooling_oclsrc,
format("-D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d" format(" -D Dtype=%s -D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
" -D STRIDE_W=%d -D STRIDE_H=%d" " -D STRIDE_W=%d -D STRIDE_H=%d"
" -D PAD_W=%d -D PAD_H=%d%s", " -D PAD_W=%d -D PAD_H=%d%s",
(use_half) ? "half" : "float",
kernel_w_, kernel_h_, kernel_w_, kernel_h_,
stride_w_, stride_h_, stride_w_, stride_h_,
pad_w_, pad_h_, pad_w_, pad_h_,
@ -139,11 +143,14 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
{ {
CV_Assert(top_mask.empty()); CV_Assert(top_mask.empty());
ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"), String kname = format("ave_pool_forward_%s", (use_half) ? "half" : "float");
ocl::Kernel oclk_ave_pool_forward(
kname.c_str(),
ocl::dnn::ocl4dnn_pooling_oclsrc, ocl::dnn::ocl4dnn_pooling_oclsrc,
format("-D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d" format(" -D Dtype=%s -D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
" -D STRIDE_W=%d -D STRIDE_H=%d" " -D STRIDE_W=%d -D STRIDE_H=%d"
" -D PAD_W=%d -D PAD_H=%d%s", " -D PAD_W=%d -D PAD_H=%d%s",
(use_half) ? "half" : "float",
kernel_w_, kernel_h_, kernel_w_, kernel_h_,
stride_w_, stride_h_, stride_w_, stride_h_,
pad_w_, pad_h_, pad_w_, pad_h_,
@ -171,7 +178,9 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
{ {
CV_Assert(top_mask.empty()); CV_Assert(top_mask.empty());
ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"), String kname = format("sto_pool_forward_test_%s", (use_half) ? "half" : "float");
ocl::Kernel oclk_sto_pool_forward(
kname.c_str(),
ocl::dnn::ocl4dnn_pooling_oclsrc, ocl::dnn::ocl4dnn_pooling_oclsrc,
format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d" format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
" -D STRIDE_W=%d -D STRIDE_H=%d", " -D STRIDE_W=%d -D STRIDE_H=%d",

View File

@ -52,6 +52,7 @@ OCL4DNNSoftmax<Dtype>::OCL4DNNSoftmax(OCL4DNNSoftmaxConfig config)
softmax_axis_ = config.axis; softmax_axis_ = config.axis;
channels_ = config.channels; channels_ = config.channels;
log_softmax_ = config.logsoftmax; log_softmax_ = config.logsoftmax;
use_half_ = config.use_half;
inner_num_ = 1; inner_num_ = 1;
outer_num_ = 1; outer_num_ = 1;
@ -91,10 +92,13 @@ bool OCL4DNNSoftmax<Dtype>::Forward(const UMat& bottom, UMat& top)
if (log_softmax_) opts += " -DLOG_SOFTMAX "; if (log_softmax_) opts += " -DLOG_SOFTMAX ";
if (use_slm_) if (use_slm_)
kname = CL_KERNEL_SELECT("softmax_forward_slm"); kname = "softmax_forward_slm";
else else
kname = CL_KERNEL_SELECT("softmax_forward"); kname = "softmax_forward";
kname += format("%s", (use_half_) ? "_half" : "_float");
opts += format(" -D Dtype=%s -D DTYPE_MAX=%s", (use_half_) ? "half" : "float",
(use_half_) ? "HALF_MAX" : "FLT_MAX");
if (!oclk_softmax_forward_kernel.create(kname.c_str(), ocl::dnn::softmax_loss_oclsrc, opts)) if (!oclk_softmax_forward_kernel.create(kname.c_str(), ocl::dnn::softmax_loss_oclsrc, opts))
return false; return false;

View File

@ -40,9 +40,17 @@
// //
//M*/ //M*/
#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define KERNEL_ARG_DTYPE float
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void ReLUForward(const int count, __global const T* in, __global T* out __kernel void ReLUForward(const int count, __global const T* in, __global T* out
#ifndef RELU_NO_SLOPE #ifndef RELU_NO_SLOPE
, T negative_slope , KERNEL_ARG_DTYPE negative_slope
#endif #endif
) { ) {
int index = get_global_id(0); int index = get_global_id(0);
@ -55,18 +63,19 @@ __kernel void ReLUForward(const int count, __global const T* in, __global T* out
} }
__kernel void ReLU6Forward(const int count, __global const T* in, __global T* out, __kernel void ReLU6Forward(const int count, __global const T* in, __global T* out,
const T minValue, const T maxValue) const KERNEL_ARG_DTYPE minValue, const KERNEL_ARG_DTYPE maxValue)
{ {
int index = get_global_id(0); int index = get_global_id(0);
if(index < count) if(index < count)
{ {
T x = in[index]; T x = in[index];
out[index] = clamp(x, minValue, maxValue); out[index] = clamp(x, convert_T(minValue), convert_T(maxValue));
} }
} }
__kernel void PReLUForward(const int count, const int channels, const int plane_size, __kernel void PReLUForward(const int count, const int channels, const int plane_size,
__global const T* in, __global T* out, __global const T* slope_data) __global const T* in, __global T* out,
__global const KERNEL_ARG_DTYPE* slope_data)
{ {
int index = get_global_id(0); int index = get_global_id(0);
int c = (index / plane_size) % channels; int c = (index / plane_size) % channels;
@ -99,8 +108,22 @@ __kernel void AbsValForward(const int n, __global const T* in, __global T* out)
out[index] = fabs(in[index]); out[index] = fabs(in[index]);
} }
__kernel void PowForward(const int n, __global const T* in, __global T* out, const T power, const T scale, const T shift) { __kernel void PowForward(const int n, __global const T* in, __global T* out,
const KERNEL_ARG_DTYPE power,
const KERNEL_ARG_DTYPE scale,
const KERNEL_ARG_DTYPE shift)
{
int index = get_global_id(0); int index = get_global_id(0);
if (index < n) if (index < n)
out[index] = pow(shift + scale * in[index], power); out[index] = pow(shift + scale * in[index], power);
} }
__kernel void ELUForward(const int n, __global const T* in, __global T* out)
{
int index = get_global_id(0);
if (index < n)
{
T src = in[index];
out[index] = (src >= 0.f) ? src : exp(src) - 1;
}
}

View File

@ -40,24 +40,27 @@
// //
//M*/ //M*/
#define Dtype float #if defined(cl_khr_fp16)
#define Dtype4 float4 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define Dtype8 float8 #endif
#if NUM == 8 #if NUM == 8
#define load(src, index) vload8(0, src + index) #define load(src, index) vload8(0, src + index)
#define store(vec, dst, index) vstore8(vec, 0, dst + index) #define store(vec, dst, index) vstore8(vec, 0, dst + index)
#define vec_type Dtype8 #define float_type float8
#define convert_f convert_float8
#define BATCH_NORM batch_norm8 #define BATCH_NORM batch_norm8
#elif NUM == 4 #elif NUM == 4
#define load(src, index) vload4(0, src + index) #define load(src, index) vload4(0, src + index)
#define store(vec, dst, index) vstore4(vec, 0, dst + index) #define store(vec, dst, index) vstore4(vec, 0, dst + index)
#define vec_type Dtype4 #define float_type float4
#define convert_f convert_float4
#define BATCH_NORM batch_norm4 #define BATCH_NORM batch_norm4
#elif NUM == 1 #elif NUM == 1
#define load(src, index) src[index] #define load(src, index) src[index]
#define store(vec, dst, index) dst[index] = vec #define store(vec, dst, index) dst[index] = vec
#define vec_type Dtype #define float_type float
#define convert_f convert_float
#define BATCH_NORM batch_norm1 #define BATCH_NORM batch_norm1
#endif #endif
@ -65,8 +68,8 @@ __kernel void BATCH_NORM(__global const Dtype* src,
const int rows, const int rows,
const int cols, const int cols,
const int channels, const int channels,
__global const Dtype* weight, __global const float* weight,
__global const Dtype* bias, __global const float* bias,
__global Dtype* dst) __global Dtype* dst)
{ {
int x = get_global_id(0); int x = get_global_id(0);
@ -76,9 +79,9 @@ __kernel void BATCH_NORM(__global const Dtype* src,
if (x >= rows || y >= cols) if (x >= rows || y >= cols)
return; return;
Dtype w = weight[x % channels]; float w = weight[x % channels];
Dtype b = bias[x % channels]; float b = bias[x % channels];
vec_type src_vec = load(src, index); float_type src_vec = convert_f(load(src, index));
vec_type dst_vec = src_vec * w + (vec_type)b; float_type dst_vec = src_vec * w + (float_type)b;
store(dst_vec, dst, index); store(convert_T(dst_vec), dst, index);
} }

View File

@ -39,22 +39,29 @@
// //
//M*/ //M*/
__kernel void concat(const int nthreads, #if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
__kernel void TEMPLATE(concat, Dtype)(const int nthreads,
__global const Dtype* in_data, __global const Dtype* in_data,
const int num_concats, const int num_concats,
const int concat_size, const int concat_size,
const int top_concat_axis, const int top_concat_axis,
const int bottom_concat_axis, const int bottom_concat_axis,
const int offset_concat_axis, const int offset_concat_axis,
__global Dtype* out_data) { __global Dtype* out_data)
{
for (int index = get_global_id(0); index < nthreads; for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
index += get_global_size(0)) { {
const int total_concat_size = concat_size * bottom_concat_axis; const int total_concat_size = concat_size * bottom_concat_axis;
const int concat_num = index / total_concat_size; const int concat_num = index / total_concat_size;
const int concat_index = index % total_concat_size; const int concat_index = index % total_concat_size;
const int top_index = concat_index const int top_index = concat_index +
+ (concat_num * top_concat_axis + offset_concat_axis) * concat_size; (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
out_data[top_index] = in_data[index]; out_data[top_index] = in_data[index];
} }
} }

View File

@ -40,27 +40,29 @@
// //
//M*/ //M*/
#if APPLY_BIAS #if defined(cl_khr_fp16)
#define BIAS_KERNEL_ARG __global Dtype * biases_base, #pragma OPENCL EXTENSION cl_khr_fp16 : enable
#else
#define BIAS_KERNEL_ARG
#endif #endif
#define KERNEL_ARG_DTYPE float
#define TYPE_FLOAT 1
#define TYPE_HALF 2
#if defined(FUSED_CONV_RELU) #if defined(FUSED_CONV_RELU)
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (Dtype)(negative_slope))) #define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope)))
#define FUSED_ARG Dtype negative_slope, #define FUSED_ARG KERNEL_ARG_DTYPE negative_slope,
#elif defined(FUSED_CONV_PRELU) #elif defined(FUSED_CONV_PRELU)
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (Dtype)(negative_slope[c]))) #define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope[c])))
#define FUSED_ARG __global const Dtype *negative_slope, #define FUSED_ARG __global const KERNEL_ARG_DTYPE* negative_slope,
#elif defined(FUSED_CONV_POWER) #elif defined(FUSED_CONV_POWER)
#define ACTIVATION_RELU_FUNCTION(x, c) pow(x, power) #define ACTIVATION_RELU_FUNCTION(x, c) pow(x, (Dtype)power)
#define FUSED_ARG Dtype power, #define FUSED_ARG KERNEL_ARG_DTYPE power,
#elif defined(FUSED_CONV_TANH) #elif defined(FUSED_CONV_TANH)
#define ACTIVATION_RELU_FUNCTION(x, c) tanh(x) #define ACTIVATION_RELU_FUNCTION(x, c) tanh(x)
#define FUSED_ARG #define FUSED_ARG
#elif defined(FUSED_CONV_RELU6) #elif defined(FUSED_CONV_RELU6)
#define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), min_value, max_value)) #define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), (Dtype)min_value, (Dtype)max_value))
#define FUSED_ARG Dtype min_value, Dtype max_value, #define FUSED_ARG KERNEL_ARG_DTYPE min_value, KERNEL_ARG_DTYPE max_value,
#else #else
#define ACTIVATION_RELU_FUNCTION(x, c) (x) #define ACTIVATION_RELU_FUNCTION(x, c) (x)
#define FUSED_ARG #define FUSED_ARG
@ -74,6 +76,11 @@
#define ELTWISE_DATA_ARG #define ELTWISE_DATA_ARG
#endif #endif
#if APPLY_BIAS
#define BIAS_KERNEL_ARG __global Dtype * biases_base,
#else
#define BIAS_KERNEL_ARG
#endif
#define __CAT(x, y) x##y #define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y) #define CAT(x, y) __CAT(x, y)
@ -97,6 +104,16 @@
#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT)) #define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))
#if defined(convolve_simd) || defined(Conv_Interleaved) #if defined(convolve_simd) || defined(Conv_Interleaved)
#if TYPE == TYPE_HALF
#define INT_TYPE ushort
#define INT_TYPE2 ushort2
#define INT_TYPE4 ushort4
#define INT_TYPE8 ushort8
#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read_us2
#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read_us4
#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read_us8
#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read_us
#else
#define INT_TYPE uint #define INT_TYPE uint
#define INT_TYPE2 uint2 #define INT_TYPE2 uint2
#define INT_TYPE4 uint4 #define INT_TYPE4 uint4
@ -106,6 +123,7 @@
#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read8 #define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read8
#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read #define SUB_GROUP_BLOCK_READ intel_sub_group_block_read
#endif #endif
#endif
#ifdef KERNEL_BASIC #ifdef KERNEL_BASIC
@ -418,6 +436,25 @@ typedef struct float15 { float s0; float s1; float s2; float s3; float s4; float
float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15; float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15;
typedef struct float0 { float s0; } float0; //never used but makes compiler happy. typedef struct float0 { float s0; } float0; //never used but makes compiler happy.
typedef struct half1 { half s0; } half1;
typedef struct half5 { half s0; half s1; half s2; half s3; half s4; } half5;
typedef struct half6 { half s0; half s1; half s2; half s3; half s4; half s5; } half6;
typedef struct half7 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; } half7;
typedef struct half9 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; } half9;
typedef struct half10 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; } half10;
typedef struct half11 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; } half11;
typedef struct half12 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; half sb; } half12;
typedef struct half13 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; half sb; half sc; } half13;
typedef struct half14 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; } half14;
typedef struct half15 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; half se; } half15;
typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
#define OUT_PITCH_X output_width #define OUT_PITCH_X output_width
#define ROW_PITCH input_width #define ROW_PITCH input_width

View File

@ -40,9 +40,9 @@
// //
//M*/ //M*/
#define Dtype float #if defined(cl_khr_fp16)
#define Dtype4 float4 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define Dtype8 float8 #endif
__kernel void op_sum4(__global const Dtype * A, __kernel void op_sum4(__global const Dtype * A,
__global const Dtype * B, __global const Dtype * B,
@ -73,20 +73,20 @@ __kernel void op_sum4(__global const Dtype * A,
a2 = vload4(i, src0_read + 2 * A_col_size); a2 = vload4(i, src0_read + 2 * A_col_size);
a3 = vload4(i, src0_read + 3 * A_col_size); a3 = vload4(i, src0_read + 3 * A_col_size);
dot0 = a0 * coeff1 + b0 * coeff2; dot0 = a0 * (Dtype4)coeff1 + b0 * (Dtype4)coeff2;
dot1 = a1 * coeff1 + b1 * coeff2; dot1 = a1 * (Dtype4)coeff1 + b1 * (Dtype4)coeff2;
dot2 = a2 * coeff1 + b2 * coeff2; dot2 = a2 * (Dtype4)coeff1 + b2 * (Dtype4)coeff2;
dot3 = a3 * coeff1 + b3 * coeff2; dot3 = a3 * (Dtype4)coeff1 + b3 * (Dtype4)coeff2;
#else #else
a0 = vload4(i, dst0_read); a0 = vload4(i, dst0_read);
a1 = vload4(i, dst0_read + A_col_size); a1 = vload4(i, dst0_read + A_col_size);
a2 = vload4(i, dst0_read + 2 * A_col_size); a2 = vload4(i, dst0_read + 2 * A_col_size);
a3 = vload4(i, dst0_read + 3 * A_col_size); a3 = vload4(i, dst0_read + 3 * A_col_size);
dot0 = a0 + b0 * coeff2; dot0 = a0 + b0 * (Dtype4)coeff2;
dot1 = a1 + b1 * coeff2; dot1 = a1 + b1 * (Dtype4)coeff2;
dot2 = a2 + b2 * coeff2; dot2 = a2 + b2 * (Dtype4)coeff2;
dot3 = a3 + b3 * coeff2; dot3 = a3 + b3 * (Dtype4)coeff2;
#endif #endif
vstore4(dot0, i, dst0_read); vstore4(dot0, i, dst0_read);
vstore4(dot1, i, dst0_read + A_col_size); vstore4(dot1, i, dst0_read + A_col_size);

File diff suppressed because it is too large Load Diff

View File

@ -39,24 +39,42 @@
// //
//M*/ //M*/
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define CONCAT(A,B) A##_##B #define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type) #define TEMPLATE(name,type) CONCAT(name,type)
// Types used for parameters, offset computations and so on #define KERNEL_ARG_DTYPE float
#define int_tp int #define TYPE_FLOAT 1
#define uint_tp unsigned int #define TYPE_HALF 2
#if TYPE == TYPE_HALF
#define Dtype half
#define Dtype2 half2
#define Dtype4 half4
#define Dtype8 half8
#define Dtype16 half16
#define as_Dtype as_half
#define as_Dtype2 as_half2
#define as_Dtype4 as_half4
#define as_Dtype8 as_half8
#define as_Dtype16 as_half16
#else
#define Dtype float #define Dtype float
#define Dtype2 float2 #define Dtype2 float2
#define Dtype4 float4 #define Dtype4 float4
#define Dtype8 float8 #define Dtype8 float8
#define Dtype16 float16
#define as_Dtype as_float #define as_Dtype as_float
#define as_Dtype2 as_float2 #define as_Dtype2 as_float2
#define as_Dtype4 as_float4 #define as_Dtype4 as_float4
#define as_Dtype8 as_float8 #define as_Dtype8 as_float8
#define as_Dtype16 as_float16
#define KERNEL_ARG_DTYPE float #endif
#if defined(cl_intel_subgroups) #if defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable #pragma OPENCL EXTENSION cl_intel_subgroups : enable
@ -67,6 +85,15 @@
// common block to calculate (alpha * AxB + beta * C) and output to destination image. // common block to calculate (alpha * AxB + beta * C) and output to destination image.
#if TYPE == TYPE_HALF
#define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read_us8( __image, __coord )
#define SHUFFLE_TYPE2(val) as_ushort2(val)
#define SHUFFLE_TYPE8(val) as_ushort8(val)
#define READ_IMAGE(__image, __coord) read_imageh(__image, sampler, __coord)
#define SIZE_OF_ELEMENT sizeof(ushort)
#define SIMD_SIZE_GEMM 16
#define TILE_N 16
#else
#define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read8( __image, __coord ) #define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read8( __image, __coord )
#define SHUFFLE_TYPE2(val) val #define SHUFFLE_TYPE2(val) val
#define SHUFFLE_TYPE8(val) val #define SHUFFLE_TYPE8(val) val
@ -74,11 +101,17 @@
#define SIZE_OF_ELEMENT sizeof(uint) #define SIZE_OF_ELEMENT sizeof(uint)
#define SIMD_SIZE_GEMM 8 #define SIMD_SIZE_GEMM 8
#define TILE_N 8 #define TILE_N 8
#endif
//#define USE_IMAGE_C //#define USE_IMAGE_C
#ifdef USE_IMAGE_C #ifdef USE_IMAGE_C
#if TYPE == TYPE_HALF
#define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read_us8( _C, _coordC ) )
#define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write_us8( _C, _coordC, as_ushort8( _val ) )
#else
#define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read8( _C, _coordC ) ) #define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read8( _C, _coordC ) )
#define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write8( _C, _coordC, as_uint8( _val ) ) #define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write8( _C, _coordC, as_uint8( _val ) )
#endif
#define MATC_PARAMETER __read_only image2d_t C, __write_only image2d_t dst #define MATC_PARAMETER __read_only image2d_t C, __write_only image2d_t dst
#define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, C, dst, sizeof(uint)) #define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, C, dst, sizeof(uint))
#else #else
@ -139,10 +172,10 @@
blockC03 += blockAxB03; \ blockC03 += blockAxB03; \
} \ } \
} else { \ } else { \
blockC00 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ blockC00 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
blockC01 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ blockC01 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
blockC02 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ blockC02 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
blockC03 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); \ blockC03 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); \
if (!ALPHA1) { \ if (!ALPHA1) { \
blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \ blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \
blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \ blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \
@ -172,6 +205,43 @@
intel_sub_group_shuffle( _block.s7, _col ) ); intel_sub_group_shuffle( _block.s7, _col ) );
// A's column block multiply B 's row block. // A's column block multiply B 's row block.
#if TYPE == TYPE_HALF
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB00, _blockB01 ) \
{ \
const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \
const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \
const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \
const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \
const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \
const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \
const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \
const Dtype8 acol8 = TRANSPOSE_BLOCK_8( _blockA, 8 ); \
const Dtype8 acol9 = TRANSPOSE_BLOCK_8( _blockA, 9 ); \
const Dtype8 acola = TRANSPOSE_BLOCK_8( _blockA, 10 ); \
const Dtype8 acolb = TRANSPOSE_BLOCK_8( _blockA, 11 ); \
const Dtype8 acolc = TRANSPOSE_BLOCK_8( _blockA, 12 ); \
const Dtype8 acold = TRANSPOSE_BLOCK_8( _blockA, 13 ); \
const Dtype8 acole = TRANSPOSE_BLOCK_8( _blockA, 14 ); \
const Dtype8 acolf = TRANSPOSE_BLOCK_8( _blockA, 15 ); \
_result = mad( (Dtype8)(_blockB00.s0), acol0, _result ); \
_result = mad( (Dtype8)(_blockB00.s1), acol1, _result ); \
_result = mad( (Dtype8)(_blockB00.s2), acol2, _result ); \
_result = mad( (Dtype8)(_blockB00.s3), acol3, _result ); \
_result = mad( (Dtype8)(_blockB00.s4), acol4, _result ); \
_result = mad( (Dtype8)(_blockB00.s5), acol5, _result ); \
_result = mad( (Dtype8)(_blockB00.s6), acol6, _result ); \
_result = mad( (Dtype8)(_blockB00.s7), acol7, _result ); \
_result = mad( (Dtype8)(_blockB01.s0), acol8, _result ); \
_result = mad( (Dtype8)(_blockB01.s1), acol9, _result ); \
_result = mad( (Dtype8)(_blockB01.s2), acola, _result ); \
_result = mad( (Dtype8)(_blockB01.s3), acolb, _result ); \
_result = mad( (Dtype8)(_blockB01.s4), acolc, _result ); \
_result = mad( (Dtype8)(_blockB01.s5), acold, _result ); \
_result = mad( (Dtype8)(_blockB01.s6), acole, _result ); \
_result = mad( (Dtype8)(_blockB01.s7), acolf, _result ); \
}
#else
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \ #define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \
{ \ { \
const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \ const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
@ -191,7 +261,50 @@
_result = mad( (Dtype8)(_blockB.s6), acol6, _result ); \ _result = mad( (Dtype8)(_blockB.s6), acol6, _result ); \
_result = mad( (Dtype8)(_blockB.s7), acol7, _result ); \ _result = mad( (Dtype8)(_blockB.s7), acol7, _result ); \
} }
#endif
#if TYPE == TYPE_HALF
#define GEMM_NN(ALPHA1, BETA_NOT0) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
__kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
__read_only image2d_t A, \
__read_only image2d_t B, \
MATC_PARAMETER, \
KERNEL_ARG_DTYPE alpha_in, \
KERNEL_ARG_DTYPE beta_in, \
int width0, \
int isFirstColBlock) \
{ \
const Dtype alpha = (Dtype)alpha_in; \
const Dtype beta = (Dtype)beta_in; \
const int group_x = get_group_id(0); \
const int group_y = get_group_id(1); \
Dtype8 blockAxB00 = 0; \
Dtype8 blockAxB01 = 0; \
Dtype8 blockAxB02 = 0; \
Dtype8 blockAxB03 = 0; \
int2 coordA = (int2)( 0, group_y * TILE_M ); \
int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 ); \
do \
{ \
int2 coordBTemp = coordB; \
Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K; \
Dtype8 blockB01 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K; \
int2 coordATemp = coordA; \
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT * 2; \
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, blockB01 ); \
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00, blockB01 ); \
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00, blockB01 ); \
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00, blockB01 ); \
} \
while( coordB.y < width0 ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#else
#define GEMM_NN(ALPHA1, BETA_NOT0) \ #define GEMM_NN(ALPHA1, BETA_NOT0) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ __attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ __attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
@ -231,6 +344,7 @@ __kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
while( coordB.y < width0 ); \ while( coordB.y < width0 ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \ GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
} }
#endif
GEMM_NN(1, 0) // ALPHA == 1, BETA == 0 GEMM_NN(1, 0) // ALPHA == 1, BETA == 0
GEMM_NN(1, 1) // ALPHA == 1, BETA != 0 GEMM_NN(1, 1) // ALPHA == 1, BETA != 0
@ -264,6 +378,45 @@ GEMM_NN(0, 1) // ALPHA != 1, BETA != 0
_result = mad( (Dtype8)(_blockB.s7), TRANSPOSE_BLOCK_8(_blockA.s7, _col), _result ); \ _result = mad( (Dtype8)(_blockB.s7), TRANSPOSE_BLOCK_8(_blockA.s7, _col), _result ); \
} }
#if TYPE == TYPE_HALF
#define GEMM_TN(ALPHA1, BETA_NOT0) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
__kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
__read_only image2d_t A, \
__read_only image2d_t B, \
MATC_PARAMETER, \
KERNEL_ARG_DTYPE alpha_in, \
KERNEL_ARG_DTYPE beta_in, \
int width0, \
int isFirstColBlock) \
{ \
const Dtype alpha = (Dtype)alpha_in; \
const Dtype beta = (Dtype)beta_in; \
const int group_x = get_group_id(0);\
const int group_y = get_group_id(1);\
Dtype8 blockAxB00 = 0;\
Dtype8 blockAxB01 = 0;\
Dtype8 blockAxB02 = 0;\
Dtype8 blockAxB03 = 0;\
int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 );\
int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 );\
do\
{\
int2 coordBTemp = coordB;\
Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K;\
int2 coordATemp = coordA;\
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 16 * SIZE_OF_ELEMENT;\
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K;\
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0); \
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA00, blockB00, 8); \
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA01, blockB00, 0); \
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA01, blockB00, 8); \
} \
while( coordB.y < width0 ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#else
#define GEMM_TN(ALPHA1, BETA_NOT0) \ #define GEMM_TN(ALPHA1, BETA_NOT0) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ __attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ __attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
@ -303,6 +456,7 @@ __kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
while( coordB.y < width0 ); \ while( coordB.y < width0 ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \ GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
} }
#endif
GEMM_TN(1, 0) // ALPHA == 1, BETA == 0 GEMM_TN(1, 0) // ALPHA == 1, BETA == 0
GEMM_TN(1, 1) // ALPHA == 1, BETA != 0 GEMM_TN(1, 1) // ALPHA == 1, BETA != 0
@ -324,6 +478,43 @@ GEMM_TN(0, 1) // ALPHA != 1, BETA != 0
intel_sub_group_shuffle( _block.s6, _col), \ intel_sub_group_shuffle( _block.s6, _col), \
intel_sub_group_shuffle( _block.s7, _col) ) intel_sub_group_shuffle( _block.s7, _col) )
#if TYPE == TYPE_HALF
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \
{ \
const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \
const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \
const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \
const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \
const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \
const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \
const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \
const Dtype8 acol8 = TRANSPOSE_BLOCK_8( _blockA, 8 ); \
const Dtype8 acol9 = TRANSPOSE_BLOCK_8( _blockA, 9 ); \
const Dtype8 acola = TRANSPOSE_BLOCK_8( _blockA, 10 ); \
const Dtype8 acolb = TRANSPOSE_BLOCK_8( _blockA, 11 ); \
const Dtype8 acolc = TRANSPOSE_BLOCK_8( _blockA, 12 ); \
const Dtype8 acold = TRANSPOSE_BLOCK_8( _blockA, 13 ); \
const Dtype8 acole = TRANSPOSE_BLOCK_8( _blockA, 14 ); \
const Dtype8 acolf = TRANSPOSE_BLOCK_8( _blockA, 15 ); \
_result = mad( (Dtype8)_blockB.s0, acol0, _result ); \
_result = mad( (Dtype8)_blockB.s1, acol1, _result ); \
_result = mad( (Dtype8)_blockB.s2, acol2, _result ); \
_result = mad( (Dtype8)_blockB.s3, acol3, _result ); \
_result = mad( (Dtype8)_blockB.s4, acol4, _result ); \
_result = mad( (Dtype8)_blockB.s5, acol5, _result ); \
_result = mad( (Dtype8)_blockB.s6, acol6, _result ); \
_result = mad( (Dtype8)_blockB.s7, acol7, _result ); \
_result = mad( (Dtype8)_blockB.s8, acol8, _result ); \
_result = mad( (Dtype8)_blockB.s9, acol9, _result ); \
_result = mad( (Dtype8)_blockB.sa, acola, _result ); \
_result = mad( (Dtype8)_blockB.sb, acolb, _result ); \
_result = mad( (Dtype8)_blockB.sc, acolc, _result ); \
_result = mad( (Dtype8)_blockB.sd, acold, _result ); \
_result = mad( (Dtype8)_blockB.se, acole, _result ); \
_result = mad( (Dtype8)_blockB.sf, acolf, _result ); \
}
#else
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \ #define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \
{ \ { \
const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \ const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
@ -343,7 +534,51 @@ GEMM_TN(0, 1) // ALPHA != 1, BETA != 0
_result = mad( (Dtype8)_blockB.s6, acol6, _result ); \ _result = mad( (Dtype8)_blockB.s6, acol6, _result ); \
_result = mad( (Dtype8)_blockB.s7, acol7, _result ); \ _result = mad( (Dtype8)_blockB.s7, acol7, _result ); \
} }
#endif
#if TYPE == TYPE_HALF
#define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
__kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
__read_only image2d_t A, \
MATB_PARAMETER, \
MATC_PARAMETER, \
KERNEL_ARG_DTYPE alpha_in, \
KERNEL_ARG_DTYPE beta_in, \
int padded_k, \
int k, \
int isFirstColBlock) \
{ \
const Dtype alpha = (Dtype)alpha_in; \
const Dtype beta = (Dtype)beta_in; \
const int group_x = get_group_id(0); \
const int group_y = get_group_id(1); \
Dtype8 blockAxB00 = 0; \
Dtype8 blockAxB01 = 0; \
Dtype8 blockAxB02 = 0; \
Dtype8 blockAxB03 = 0; \
int2 coordA = (int2)( 0, group_y * TILE_M ); \
int2 coordB = (int2)( 0, ( group_x * TILE_N )); \
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \
do \
{ \
Dtype16 blockB00; \
BLOCKB_READ8(blockB00, B, coordB); \
int2 coordATemp = coordA; \
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT * 2; \
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \
} \
while( coordB.x < padded_k / VECSIZE ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#else
#define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \ #define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ __attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ __attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
@ -385,12 +620,23 @@ __kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0,Dt
while( coordB.x < padded_k / VECSIZE ); \ while( coordB.x < padded_k / VECSIZE ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \ GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
} }
#endif
#if TYPE == TYPE_HALF
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
_blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s89ab = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.scdef = READ_IMAGE(_B, _coordBTemp); _coordB.x += 4;
#else
#define BLOCKB_READ8(_blockb, _B, _coordB) \ #define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \ int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \ _coordBTemp.y += get_local_id(0); \
_blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ _blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2; _blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2;
#endif
#define MATB_PARAMETER __read_only image2d_t B #define MATB_PARAMETER __read_only image2d_t B
@ -401,12 +647,21 @@ GEMM_NT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0
#undef BLOCKB_READ8 #undef BLOCKB_READ8
#undef MATB_PARAMETER #undef MATB_PARAMETER
#if TYPE == TYPE_HALF
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
const __global float *B_read = (__global float *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \
_blockb = as_Dtype16(as_ushort16(vload8(0, B_read))); \
_coordB.x += TILE_K * 2;
#else
#define BLOCKB_READ8(_blockb, _B, _coordB) \ #define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \ int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \ _coordBTemp.y += get_local_id(0); \
const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \ const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \
_blockb = vload8(0, B_read); \ _blockb = vload8(0, B_read); \
_coordB.x += TILE_K; _coordB.x += TILE_K;
#endif
#define MATB_PARAMETER __global Dtype *B, int offB, int ldb #define MATB_PARAMETER __global Dtype *B, int offB, int ldb
@ -417,6 +672,45 @@ GEMM_NT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0
#undef BLOCKB_READ8 #undef BLOCKB_READ8
#undef MATB_PARAMETER #undef MATB_PARAMETER
#if TYPE == TYPE_HALF
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
Dtype4 temp; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s0 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s1 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s2 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s3 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s4 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s5 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s6 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s7 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s8 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s9 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.sa = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.sb = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.sc = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.sd = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.se = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.sf = temp.s0; \
_coordB.x += 16;
#else
#define BLOCKB_READ8(_blockb, _B, _coordB) \ #define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \ int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \ _coordBTemp.y += get_local_id(0); \
@ -438,6 +732,7 @@ GEMM_NT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s7 = temp.s0; \ _blockb.s7 = temp.s0; \
_coordB.x += 8; _coordB.x += 8;
#endif
#define MATB_PARAMETER __read_only image2d_t B #define MATB_PARAMETER __read_only image2d_t B
@ -483,6 +778,47 @@ GEMM_NT(0, 1, SCALAR, 1) // ALPHA != 1, BETA != 0
_result = mad( (Dtype8)_blockB.s7, acol7, _result ); \ _result = mad( (Dtype8)_blockB.s7, acol7, _result ); \
} }
#if TYPE == TYPE_HALF
#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
__kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
__read_only image2d_t A, \
MATB_PARAMETER, \
MATC_PARAMETER, \
KERNEL_ARG_DTYPE alpha_in, \
KERNEL_ARG_DTYPE beta_in, \
int padded_k, \
int k, \
int isFirstColBlock) \
{ \
const Dtype alpha = (Dtype)alpha_in; \
const Dtype beta = (Dtype)beta_in; \
const int group_x = get_group_id(0); \
const int group_y = get_group_id(1); \
Dtype8 blockAxB00 = 0; \
Dtype8 blockAxB01 = 0; \
Dtype8 blockAxB02 = 0; \
Dtype8 blockAxB03 = 0; \
int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 ); \
int2 coordB = (int2)( 0, ( group_x * TILE_N )); \
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \
do \
{ \
Dtype8 blockB00; \
BLOCKB_READ8(blockB00, B, coordB); \
int2 coordATemp = coordA; \
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 16 * SIZE_OF_ELEMENT;\
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K;\
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0); \
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA00, blockB00, 8); \
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA01, blockB00, 0); \
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA01, blockB00, 8); \
} \
while( coordB.x < padded_k / VECSIZE ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0);\
}
#else
#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \ #define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ __attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ __attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
@ -524,6 +860,7 @@ __kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, D
while( coordB.x < padded_k / VECSIZE ); \ while( coordB.x < padded_k / VECSIZE ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0);\ GEMM_OUTPUT(ALPHA1, BETA_NOT0);\
} }
#endif
#define BLOCKB_READ8(_blockb, _B, _coordB) \ #define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \ int2 _coordBTemp = _coordB; \
@ -540,12 +877,21 @@ GEMM_TT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0
#undef BLOCKB_READ8 #undef BLOCKB_READ8
#undef MATB_PARAMETER #undef MATB_PARAMETER
#if TYPE == TYPE_HALF
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
const __global float *B_read = (__global float *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \
_blockb = as_Dtype8(as_ushort8(vload4(0, B_read))); \
_coordB.x += TILE_K;
#else
#define BLOCKB_READ8(_blockb, _B, _coordB) \ #define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \ int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \ _coordBTemp.y += get_local_id(0); \
const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \ const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \
_blockb = vload8(0, B_read); \ _blockb = vload8(0, B_read); \
_coordB.x += TILE_K; _coordB.x += TILE_K;
#endif
#define MATB_PARAMETER __global Dtype *B, int offB, int ldb #define MATB_PARAMETER __global Dtype *B, int offB, int ldb
@ -611,7 +957,11 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_transpose,Dtype)(
int2 coord_dst = (int2)(gidx, gidy); int2 coord_dst = (int2)(gidx, gidy);
__global Dtype* A_off = A + offA; __global Dtype* A_off = A + offA;
Dtype srcA = A_off[gidy * ldA + gidx]; Dtype srcA = A_off[gidy * ldA + gidx];
#if TYPE == TYPE_HALF
write_imageh(ImA, coord_dst, (Dtype4)srcA);
#else
write_imagef(ImA, coord_dst, (Dtype4)srcA); write_imagef(ImA, coord_dst, (Dtype4)srcA);
#endif
} }
__kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)( __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)(
@ -625,6 +975,14 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose,Dtype)(
const int gidx = get_global_id(0); const int gidx = get_global_id(0);
const int gidy = get_global_id(1); const int gidy = get_global_id(1);
int2 coord_dst = (int2)(gidx, gidy); int2 coord_dst = (int2)(gidx, gidy);
#if TYPE == TYPE_HALF
if (gidx >= width || gidy >= height) {
write_imageh(ImA, coord_dst, 0);
return;
}
__global Dtype* A_off = A + offA;
write_imageh(ImA, coord_dst, A_off[gidy * ldA + gidx]);
#else
if (gidx >= width || gidy >= height) { if (gidx >= width || gidy >= height) {
write_imageui(ImA, coord_dst, (uint4)0); write_imageui(ImA, coord_dst, (uint4)0);
return; return;
@ -632,4 +990,5 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose,Dtype)(
__global Dtype* A_off = A + offA; __global Dtype* A_off = A + offA;
uint4 srcA = convert_uint4(as_uchar4(A_off[gidy * ldA + gidx])); uint4 srcA = convert_uint4(as_uchar4(A_off[gidy * ldA + gidx]));
write_imageui(ImA, coord_dst, srcA); write_imageui(ImA, coord_dst, srcA);
#endif
} }

View File

@ -40,16 +40,20 @@
// //
//M*/ //M*/
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define CONCAT(A,B) A##_##B #define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type) #define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float #define KERNEL_ARG_DTYPE float
__kernel void TEMPLATE(axpy,Dtype)(const int n, const Dtype alpha, __global const Dtype* x, __kernel void TEMPLATE(axpy,Dtype)(const int n, const KERNEL_ARG_DTYPE alpha, __global const Dtype* x,
const int offx, __global Dtype* y, const int offx, __global Dtype* y,
const int offy) { const int offy) {
for (int index = get_global_id(0); index < n; index += get_global_size(0)) { for (int index = get_global_id(0); index < n; index += get_global_size(0)) {
Dtype src = x[offx + index]; Dtype src = x[offx + index];
Dtype dst = y[offy + index]; Dtype dst = y[offy + index];
y[offy + index] = alpha * src + dst; y[offy + index] = convert_Dtype(alpha) * src + dst;
} }
} }

View File

@ -39,41 +39,45 @@
// //
//M*/ //M*/
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define CONCAT(A,B) A##_##B #define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type) #define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float #define KERNEL_ARG_DTYPE float
__kernel void TEMPLATE(matvec_mul4,Dtype)( __kernel void TEMPLATE(matvec_mul4,Dtype)(
__global const float * A, __global const Dtype * A,
int offA, int offA,
unsigned int A_col_size, unsigned int A_col_size,
unsigned int trail_item, unsigned int trail_item,
__global const float * v, __global const Dtype * v,
int offv, int offv,
float alpha, KERNEL_ARG_DTYPE alpha,
float beta, KERNEL_ARG_DTYPE beta,
__global float4 * result, __global Dtype4* result,
int offr, int offr,
__local float4 * work) __local Dtype4* work)
{ {
unsigned int row_gid = get_group_id(0); unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0); unsigned int lid = get_local_id(0);
const __global float *src0_read = A + row_gid * 4 * A_col_size + offA; const __global Dtype *src0_read = A + row_gid * 4 * A_col_size + offA;
const __global float *src1_read = v + offv; const __global Dtype *src1_read = v + offv;
result = (__global float4*)((__global float*)result + offr); result = (__global Dtype4*)((__global Dtype*)result + offr);
float4 dot0 = (float4)(0.f); Dtype4 dot0 = (Dtype4)(0.f);
float4 dot1 = (float4)(0.f); Dtype4 dot1 = (Dtype4)(0.f);
float4 dot2 = (float4)(0.f); Dtype4 dot2 = (Dtype4)(0.f);
float4 dot3 = (float4)(0.f); Dtype4 dot3 = (Dtype4)(0.f);
unsigned int i = lid; unsigned int i = lid;
while( i < A_col_size / 4) { while( i < A_col_size / 4) {
const float4 a0 = vload4(i, src0_read); const Dtype4 a0 = vload4(i, src0_read);
const float4 a1 = vload4(i, src0_read + A_col_size); const Dtype4 a1 = vload4(i, src0_read + A_col_size);
const float4 a2 = vload4(i, src0_read + 2 * A_col_size); const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
const float4 a3 = vload4(i, src0_read + 3 * A_col_size); const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);
const float4 b0 = vload4(i, src1_read); const Dtype4 b0 = vload4(i, src1_read);
dot0 += a0 * b0; dot0 += a0 * b0;
dot1 += a1 * b0; dot1 += a1 * b0;
@ -92,15 +96,15 @@ __kernel void TEMPLATE(matvec_mul4,Dtype)(
{ {
if(trail_item != 0) if(trail_item != 0)
{ {
const __global float *src0_trail = src0_read + i * 4; const __global Dtype *src0_trail = src0_read + i * 4;
const __global float *src1_trail = src1_read + i * 4; const __global Dtype *src1_trail = src1_read + i * 4;
for(unsigned int i = 0; i < trail_item; ++i) { for(unsigned int i = 0; i < trail_item; ++i) {
const float at0 = src0_trail[i]; const Dtype at0 = src0_trail[i];
const float at1 = src0_trail[i + A_col_size]; const Dtype at1 = src0_trail[i + A_col_size];
const float at2 = src0_trail[i + 2 * A_col_size]; const Dtype at2 = src0_trail[i + 2 * A_col_size];
const float at3 = src0_trail[i + 3 * A_col_size]; const Dtype at3 = src0_trail[i + 3 * A_col_size];
const float bt = src1_trail[i]; const Dtype bt = src1_trail[i];
work[lid].s0 += at0 * bt; work[lid].s0 += at0 * bt;
work[lid].s1 += at1 * bt; work[lid].s1 += at1 * bt;
@ -118,40 +122,40 @@ __kernel void TEMPLATE(matvec_mul4,Dtype)(
} }
if(lid == 0) { if(lid == 0) {
if(beta == (Dtype)0) if(beta == (Dtype)0)
result[row_gid] = alpha * work[0]; result[row_gid] = convert_Dtype(alpha) * work[0];
else else
result[row_gid] = alpha * work[0] + beta * result[row_gid]; result[row_gid] = convert_Dtype(alpha) * work[0] + convert_Dtype(beta) * result[row_gid];
} }
} }
/* This kernel used for the trailing rows when row_of_A %4 !=0 */ /* This kernel used for the trailing rows when row_of_A %4 !=0 */
__kernel void TEMPLATE(matvec_mul1,Dtype)( __kernel void TEMPLATE(matvec_mul1,Dtype)(
__global const float * A, __global const Dtype * A,
int offA, int offA,
unsigned int A_col_size, unsigned int A_col_size,
unsigned int row_offset, unsigned int row_offset,
unsigned int trail_item, unsigned int trail_item,
__global const float * v, __global const Dtype * v,
int offv, int offv,
float alpha, KERNEL_ARG_DTYPE alpha,
float beta, KERNEL_ARG_DTYPE beta,
__global float * result, __global Dtype * result,
int offr, int offr,
__local float * work) __local Dtype * work)
{ {
unsigned int row_gid = get_group_id(0); unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0); unsigned int lid = get_local_id(0);
const __global float *src0_read = A + (row_offset + row_gid) * A_col_size + offA; const __global Dtype *src0_read = A + (row_offset + row_gid) * A_col_size + offA;
const __global float *src1_read = v + + offv; const __global Dtype *src1_read = v + + offv;
result = result + offr; result = result + offr;
float4 dot0 = (float4)(0.f); Dtype4 dot0 = (Dtype4)(0.f);
unsigned int i = lid; unsigned int i = lid;
while( i < A_col_size / 4) while( i < A_col_size / 4)
{ {
const float4 a0 = vload4(i, src0_read); const Dtype4 a0 = vload4(i, src0_read);
const float4 b0 = vload4(i, src1_read); const Dtype4 b0 = vload4(i, src1_read);
dot0 += a0 * b0; dot0 += a0 * b0;
i += get_local_size(0); i += get_local_size(0);
@ -163,11 +167,11 @@ __kernel void TEMPLATE(matvec_mul1,Dtype)(
{ {
if(trail_item != 0) if(trail_item != 0)
{ {
const __global float *src0_trail = src0_read + i * 4; const __global Dtype *src0_trail = src0_read + i * 4;
const __global float *src1_trail = src1_read + i * 4; const __global Dtype *src1_trail = src1_read + i * 4;
for(unsigned int i = 0; i < trail_item; ++i) { for(unsigned int i = 0; i < trail_item; ++i) {
const float at0 = src0_trail[i]; const Dtype at0 = src0_trail[i];
const float bt = src1_trail[i]; const Dtype bt = src1_trail[i];
work[lid] += at0 * bt; work[lid] += at0 * bt;
} }
@ -182,10 +186,10 @@ __kernel void TEMPLATE(matvec_mul1,Dtype)(
if(lid == 0) { if(lid == 0) {
if(beta == (Dtype)0) { if(beta == (Dtype)0) {
result[row_gid+row_offset] = alpha * work[0]; result[row_gid+row_offset] = convert_Dtype(alpha) * work[0];
} else { } else {
result[row_gid+row_offset] *= beta; result[row_gid+row_offset] *= convert_Dtype(beta);
result[row_gid+row_offset] += alpha * work[0]; result[row_gid+row_offset] += convert_Dtype(alpha) * work[0];
} }
} }
} }

View File

@ -40,6 +40,10 @@
// //
//M*/ //M*/
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define Dtype float #define Dtype float
#define Dtype4 float4 #define Dtype4 float4
#define Dtype8 float8 #define Dtype8 float8
@ -135,17 +139,17 @@ __kernel void MVN(__global const Dtype* src,
store(dst_vec, dst, index); store(dst_vec, dst, index);
} }
__kernel void MEAN_FUSE(__global const Dtype * A, __kernel void MEAN_FUSE(__global const T * A,
unsigned int A_col_size, unsigned int A_col_size,
float alpha, float alpha,
__global Dtype4 * result, __global T4 * mean,
__global Dtype * B, __global Dtype * tmp,
__local Dtype4 * work) __local Dtype4 * work)
{ {
unsigned int row_gid = get_group_id(0); unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0); unsigned int lid = get_local_id(0);
const __global Dtype *src0_read = A + row_gid * 4 * A_col_size; const __global T *src0_read = A + row_gid * 4 * A_col_size;
__global Dtype *dst0_read = B + row_gid * 4 * A_col_size; __global Dtype *dst0_read = tmp + row_gid * 4 * A_col_size;
Dtype4 dot0, dot1, dot2, dot3; Dtype4 dot0, dot1, dot2, dot3;
dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f); dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);
@ -153,15 +157,15 @@ __kernel void MEAN_FUSE(__global const Dtype * A,
const Dtype4 b0 = (Dtype4)1.f; const Dtype4 b0 = (Dtype4)1.f;
while( i < A_col_size / 4) while( i < A_col_size / 4)
{ {
const Dtype4 a0 = vload4(i, src0_read); const T4 a0 = vload4(i, src0_read);
const Dtype4 a1 = vload4(i, src0_read + A_col_size); const T4 a1 = vload4(i, src0_read + A_col_size);
const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size); const T4 a2 = vload4(i, src0_read + 2 * A_col_size);
const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size); const T4 a3 = vload4(i, src0_read + 3 * A_col_size);
dot0 += a0; dot0 += convert_float4(a0);
dot1 += a1; dot1 += convert_float4(a1);
dot2 += a2; dot2 += convert_float4(a2);
dot3 += a3; dot3 += convert_float4(a3);
i += get_local_size(0); i += get_local_size(0);
} }
@ -181,22 +185,22 @@ __kernel void MEAN_FUSE(__global const Dtype * A,
if(lid == 0) if(lid == 0)
{ {
result[row_gid] = alpha * work[0]; mean[row_gid] = convert_T(alpha * work[0]);
} }
Dtype4 sum = work[0] * alpha; Dtype4 sum = work[0] * alpha;
i = lid; i = lid;
while( i < A_col_size / 4) while( i < A_col_size / 4)
{ {
const Dtype4 a0 = vload4(i, src0_read); const T4 a0 = vload4(i, src0_read);
const Dtype4 a1 = vload4(i, src0_read + A_col_size); const T4 a1 = vload4(i, src0_read + A_col_size);
const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size); const T4 a2 = vload4(i, src0_read + 2 * A_col_size);
const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size); const T4 a3 = vload4(i, src0_read + 3 * A_col_size);
dot0 = native_powr(a0 - (Dtype4)sum.x, 2); dot0 = native_powr(convert_float4(a0) - (Dtype4)sum.x, 2);
dot1 = native_powr(a1 - (Dtype4)sum.y, 2); dot1 = native_powr(convert_float4(a1) - (Dtype4)sum.y, 2);
dot2 = native_powr(a2 - (Dtype4)sum.z, 2); dot2 = native_powr(convert_float4(a2) - (Dtype4)sum.z, 2);
dot3 = native_powr(a3 - (Dtype4)sum.w, 2); dot3 = native_powr(convert_float4(a3) - (Dtype4)sum.w, 2);
vstore4(dot0, i, dst0_read); vstore4(dot0, i, dst0_read);
vstore4(dot1, i, dst0_read + A_col_size); vstore4(dot1, i, dst0_read + A_col_size);
@ -208,22 +212,22 @@ __kernel void MEAN_FUSE(__global const Dtype * A,
} }
__kernel void MVN_FUSE(__global const Dtype * tmp, __kernel void MVN_FUSE(__global const Dtype * tmp,
__global const Dtype * A, __global const T * A,
__global const Dtype4 * mean, __global const T4 * mean,
unsigned int A_col_size, unsigned int A_col_size,
const float alpha_val, const float alpha_val,
const float eps, const float eps,
const float relu_slope, const float relu_slope,
__global const Dtype4 * bnorm_weight, __global const Dtype4 * bnorm_weight,
__global const Dtype4 * bnorm_bias, __global const Dtype4 * bnorm_bias,
__global Dtype * B, __global T * B,
__local Dtype4 * work) __local Dtype4 * work)
{ {
unsigned int row_gid = get_group_id(0); unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0); unsigned int lid = get_local_id(0);
const __global Dtype *src0_read = tmp + row_gid * 4 * A_col_size; const __global Dtype *src0_read = tmp + row_gid * 4 * A_col_size;
const __global Dtype *src1_read = A + row_gid * 4 * A_col_size; const __global T *src1_read = A + row_gid * 4 * A_col_size;
__global Dtype *dst0_read = B + row_gid * 4 * A_col_size; __global T *dst0_read = B + row_gid * 4 * A_col_size;
Dtype4 dot0, dot1, dot2, dot3; Dtype4 dot0, dot1, dot2, dot3;
dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f); dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);
@ -257,7 +261,7 @@ __kernel void MVN_FUSE(__global const Dtype * tmp,
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
Dtype4 mean_val = mean[row_gid]; Dtype4 mean_val = convert_float4(mean[row_gid]);
Dtype4 dev_val = sqrt(work[0] * alpha_val) + (Dtype4)eps; Dtype4 dev_val = sqrt(work[0] * alpha_val) + (Dtype4)eps;
Dtype4 alpha = (Dtype4)1.f / dev_val; Dtype4 alpha = (Dtype4)1.f / dev_val;
@ -271,15 +275,15 @@ __kernel void MVN_FUSE(__global const Dtype * tmp,
i = lid; i = lid;
while( i < A_col_size / 4) while( i < A_col_size / 4)
{ {
const Dtype4 a0 = vload4(i, src1_read); const T4 a0 = vload4(i, src1_read);
const Dtype4 a1 = vload4(i, src1_read + A_col_size); const T4 a1 = vload4(i, src1_read + A_col_size);
const Dtype4 a2 = vload4(i, src1_read + 2 * A_col_size); const T4 a2 = vload4(i, src1_read + 2 * A_col_size);
const Dtype4 a3 = vload4(i, src1_read + 3 * A_col_size); const T4 a3 = vload4(i, src1_read + 3 * A_col_size);
dot0 = (a0 - (Dtype4)mean_val.x) * alpha.x; dot0 = (convert_float4(a0) - (Dtype4)mean_val.x) * alpha.x;
dot1 = (a1 - (Dtype4)mean_val.y) * alpha.y; dot1 = (convert_float4(a1) - (Dtype4)mean_val.y) * alpha.y;
dot2 = (a2 - (Dtype4)mean_val.z) * alpha.z; dot2 = (convert_float4(a2) - (Dtype4)mean_val.z) * alpha.z;
dot3 = (a3 - (Dtype4)mean_val.w) * alpha.w; dot3 = (convert_float4(a3) - (Dtype4)mean_val.w) * alpha.w;
dot0 = dot0 * w.x + (Dtype4)b.x; dot0 = dot0 * w.x + (Dtype4)b.x;
dot1 = dot1 * w.y + (Dtype4)b.y; dot1 = dot1 * w.y + (Dtype4)b.y;
@ -300,10 +304,10 @@ __kernel void MVN_FUSE(__global const Dtype * tmp,
dot3 = select(new3, dot3, dot3 > (Dtype4)0.f); dot3 = select(new3, dot3, dot3 > (Dtype4)0.f);
#endif #endif
vstore4(dot0, i, dst0_read); vstore4(convert_T(dot0), i, dst0_read);
vstore4(dot1, i, dst0_read + A_col_size); vstore4(convert_T(dot1), i, dst0_read + A_col_size);
vstore4(dot2, i, dst0_read + 2 * A_col_size); vstore4(convert_T(dot2), i, dst0_read + 2 * A_col_size);
vstore4(dot3, i, dst0_read + 3 * A_col_size); vstore4(convert_T(dot3), i, dst0_read + 3 * A_col_size);
i += get_local_size(0); i += get_local_size(0);
} }

View File

@ -42,14 +42,18 @@
#define CONCAT(A,B) A##_##B #define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type) #define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float #define KERNEL_ARG_DTYPE float
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global const Dtype* in, __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global const Dtype* in,
const int num, const int channels, const int num, const int channels,
const int height, const int width, const int size, const int height, const int width, const int size,
const Dtype alpha_over_size, const Dtype k, const KERNEL_ARG_DTYPE alpha_over_size, const KERNEL_ARG_DTYPE k,
__global Dtype* const out, __global Dtype* const out,
const Dtype negative_beta) { const KERNEL_ARG_DTYPE negative_beta) {
for (int index = get_global_id(0); index < nthreads; for (int index = get_global_id(0); index < nthreads;
index += get_global_size(0)) { index += get_global_size(0)) {
// find out the local offset // find out the local offset
@ -60,11 +64,11 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con
const int step = height * width; const int step = height * width;
__global const Dtype* in_off = in + offset; __global const Dtype* in_off = in + offset;
__global Dtype* out_off = out + offset; __global Dtype* out_off = out + offset;
Dtype scale_val; KERNEL_ARG_DTYPE scale_val;
int head = 0; int head = 0;
const int pre_pad = (size - 1) / 2; const int pre_pad = (size - 1) / 2;
const int post_pad = size - pre_pad - 1; const int post_pad = size - pre_pad - 1;
Dtype accum_scale = 0; KERNEL_ARG_DTYPE accum_scale = 0;
// fill the scale at [n, :, h, w] // fill the scale at [n, :, h, w]
// accumulate values // accumulate values
while (head < post_pad && head < channels) { while (head < post_pad && head < channels) {
@ -79,7 +83,7 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con
* in_off[(head - size) * step]; * in_off[(head - size) * step];
} }
scale_val = k + accum_scale * alpha_over_size; scale_val = k + accum_scale * alpha_over_size;
out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta); out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((Dtype)scale_val, (Dtype)negative_beta);
++head; ++head;
} }
// subtract only // subtract only
@ -89,7 +93,7 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con
* in_off[(head - size) * step]; * in_off[(head - size) * step];
} }
scale_val = k + accum_scale * alpha_over_size; scale_val = k + accum_scale * alpha_over_size;
out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta); out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((Dtype)scale_val, (Dtype)negative_beta);
++head; ++head;
} }
} }

View File

@ -42,7 +42,10 @@
#define CONCAT(A,B) A##_##B #define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type) #define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#if defined KERNEL_MAX_POOL #if defined KERNEL_MAX_POOL

View File

@ -40,7 +40,9 @@
// //
//M*/ //M*/
#define Dtype float #if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void permute(const int nthreads, __kernel void permute(const int nthreads,
__global Dtype* bottom_data, __global Dtype* bottom_data,

View File

@ -39,17 +39,18 @@
// //
//M*/ //M*/
#define Dtype float #if defined(cl_khr_fp16)
#define Dtype4 float4 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void prior_box(const int nthreads, __kernel void prior_box(const int nthreads,
const Dtype stepX, const float stepX,
const Dtype stepY, const float stepY,
__global const Dtype* _offsetsX, __global const float* _offsetsX,
__global const Dtype* _offsetsY, __global const float* _offsetsY,
const int offsetsX_size, const int offsetsX_size,
__global const Dtype* _widths, __global const float* _widths,
__global const Dtype* _heights, __global const float* _heights,
const int widths_size, const int widths_size,
__global Dtype* dst, __global Dtype* dst,
const int _layerHeight, const int _layerHeight,
@ -65,7 +66,7 @@ __kernel void prior_box(const int nthreads,
outputPtr = dst + index * 4 * offsetsX_size * widths_size; outputPtr = dst + index * 4 * offsetsX_size * widths_size;
Dtype _boxWidth, _boxHeight; float _boxWidth, _boxHeight;
Dtype4 vec; Dtype4 vec;
for (int i = 0; i < widths_size; ++i) for (int i = 0; i < widths_size; ++i)
{ {
@ -73,8 +74,8 @@ __kernel void prior_box(const int nthreads,
_boxHeight = _heights[i]; _boxHeight = _heights[i];
for (int j = 0; j < offsetsX_size; ++j) for (int j = 0; j < offsetsX_size; ++j)
{ {
float center_x = (w + _offsetsX[j]) * stepX; Dtype center_x = (w + _offsetsX[j]) * (Dtype)stepX;
float center_y = (h + _offsetsY[j]) * stepY; Dtype center_y = (h + _offsetsY[j]) * (Dtype)stepY;
vec.x = (center_x - _boxWidth * 0.5f) / imgWidth; // xmin vec.x = (center_x - _boxWidth * 0.5f) / imgWidth; // xmin
vec.y = (center_y - _boxHeight * 0.5f) / imgHeight; // ymin vec.y = (center_y - _boxHeight * 0.5f) / imgHeight; // ymin
@ -91,7 +92,7 @@ __kernel void prior_box(const int nthreads,
__kernel void set_variance(const int nthreads, __kernel void set_variance(const int nthreads,
const int offset, const int offset,
const int variance_size, const int variance_size,
__global const Dtype* variance, __global const float* variance,
__global Dtype* dst) __global Dtype* dst)
{ {
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0)) for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
@ -101,7 +102,7 @@ __kernel void set_variance(const int nthreads,
if (variance_size == 1) if (variance_size == 1)
var_vec = (Dtype4)(variance[0]); var_vec = (Dtype4)(variance[0]);
else else
var_vec = vload4(0, variance); var_vec = convert_T(vload4(0, variance));
vstore4(var_vec, 0, dst + offset + index * 4); vstore4(var_vec, 0, dst + offset + index * 4);
} }

View File

@ -39,6 +39,10 @@
// //
//M*/ //M*/
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void reorg(const int count, __kernel void reorg(const int count,
__global const Dtype* src, __global const Dtype* src,
const int channels, const int channels,

View File

@ -40,9 +40,9 @@
// //
//M*/ //M*/
#define Dtype float #if defined(cl_khr_fp16)
#define Dtype4 float4 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define Dtype8 float8 #endif
__kernel void slice(__global const Dtype* src, __kernel void slice(__global const Dtype* src,
const int src_plane_size, const int src_plane_size,

View File

@ -24,6 +24,10 @@
* POSSIBILITY OF SUCH DAMAGE. * POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/ **************************************************************************************/
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void kernel_channel_max(const int num, const int channels, __kernel void kernel_channel_max(const int num, const int channels,
const int spatial_dim, __global const T* data, __global T* out) { const int spatial_dim, __global const T* data, __global T* out) {
int index = get_global_id(0); int index = get_global_id(0);
@ -40,12 +44,12 @@ __kernel void kernel_channel_max(const int num, const int channels,
__kernel void kernel_channel_subtract(const int count, __kernel void kernel_channel_subtract(const int count,
const int num, const int channels, const int num, const int channels,
const int spatial_dim, __global const T* channel_max, __global T* data) { const int spatial_dim, __global const T* channel_max, __global const T* src, __global T* data) {
int index = get_global_id(0); int index = get_global_id(0);
if(index < count) { if(index < count) {
int n = index / channels / spatial_dim; int n = index / channels / spatial_dim;
int s = index % spatial_dim; int s = index % spatial_dim;
data[index] -= channel_max[n * spatial_dim + s]; data[index] = exp(src[index] - channel_max[n * spatial_dim + s]);
} }
} }

View File

@ -42,12 +42,15 @@
#define CONCAT(A,B) A##_##B #define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type) #define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float
#if defined(cl_intel_subgroups) #if defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable #pragma OPENCL EXTENSION cl_intel_subgroups : enable
#endif #endif
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int channels, __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int channels,
const int spatial_dim, const int spatial_dim,
__global Dtype* scale, __global Dtype* scale,
@ -60,12 +63,12 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
int n = get_global_id(1); int n = get_global_id(1);
for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
get_global_size(0), ++s) { get_global_size(0), ++s) {
float maxval = -FLT_MAX; Dtype maxval = -DTYPE_MAX;
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
Dtype tmp = data[(n * channels + c) * spatial_dim + s]; Dtype tmp = data[(n * channels + c) * spatial_dim + s];
maxval = max((Dtype)tmp, (Dtype)maxval); maxval = max((Dtype)tmp, (Dtype)maxval);
} }
maxval = sub_group_reduce_max(maxval * 100000); maxval = sub_group_reduce_max(maxval);
//if (get_sub_group_local_id() == 0) //if (get_sub_group_local_id() == 0)
group_tmp[get_sub_group_id() * spatial_dim + s] = maxval; group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
} }
@ -77,7 +80,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
int s = index / get_max_sub_group_size(); int s = index / get_max_sub_group_size();
Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]); Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0) //if (get_sub_group_local_id() == 0)
scale_tmp[s] = maxval / 100000; scale_tmp[s] = maxval;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -95,7 +98,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
sum += out_tmp[c * spatial_dim + s]; sum += out_tmp[c * spatial_dim + s];
} }
sum = sub_group_reduce_add(sum * 100000); sum = sub_group_reduce_add(sum);
group_tmp[get_sub_group_id() * spatial_dim + s] = sum; group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -105,7 +108,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
int s = index / get_max_sub_group_size(); int s = index / get_max_sub_group_size();
Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]); Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0) //if (get_sub_group_local_id() == 0)
scale_tmp[s] = sum / 100000; scale_tmp[s] = sum;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -130,12 +133,12 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
__global Dtype *group_tmp = scale + spatial_dim * num + n * get_max_sub_group_size() * spatial_dim; __global Dtype *group_tmp = scale + spatial_dim * num + n * get_max_sub_group_size() * spatial_dim;
for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
get_global_size(0), ++s) { get_global_size(0), ++s) {
float maxval = -FLT_MAX; Dtype maxval = -DTYPE_MAX;
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
Dtype tmp = data[(n * channels + c) * spatial_dim + s]; Dtype tmp = data[(n * channels + c) * spatial_dim + s];
maxval = max((Dtype)tmp, (Dtype)maxval); maxval = max((Dtype)tmp, (Dtype)maxval);
} }
maxval = sub_group_reduce_max(maxval * 100000); maxval = sub_group_reduce_max(maxval);
//if (get_sub_group_local_id() == 0) //if (get_sub_group_local_id() == 0)
group_tmp[get_sub_group_id() * spatial_dim + s] = maxval; group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
} }
@ -146,7 +149,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
int s = index / get_max_sub_group_size(); int s = index / get_max_sub_group_size();
Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]); Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0) //if (get_sub_group_local_id() == 0)
scale[n * spatial_dim + s] = maxval / 100000; scale[n * spatial_dim + s] = maxval;
} }
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
@ -164,7 +167,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
sum += out[n * channels * spatial_dim + c * spatial_dim + s]; sum += out[n * channels * spatial_dim + c * spatial_dim + s];
} }
sum = sub_group_reduce_add(sum * 100000); sum = sub_group_reduce_add(sum);
group_tmp[get_sub_group_id() * spatial_dim + s] = sum; group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
} }
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
@ -174,7 +177,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
int s = index / get_max_sub_group_size(); int s = index / get_max_sub_group_size();
Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]); Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0) //if (get_sub_group_local_id() == 0)
scale[n * spatial_dim + s] = sum / 100000; scale[n * spatial_dim + s] = sum;
} }
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);

View File

@ -64,6 +64,7 @@
namespace cv { namespace dnn { namespace cv { namespace dnn {
CV__DNN_EXPERIMENTAL_NS_BEGIN CV__DNN_EXPERIMENTAL_NS_BEGIN
#define IS_DNN_OPENCL_TARGET(id) (id == DNN_TARGET_OPENCL || id == DNN_TARGET_OPENCL_FP16)
Mutex& getInitializationMutex(); Mutex& getInitializationMutex();
void initializeLayerFactory(); void initializeLayerFactory();
CV__DNN_EXPERIMENTAL_NS_END CV__DNN_EXPERIMENTAL_NS_END

View File

@ -538,6 +538,37 @@ public:
} }
}; };
// In case of resizing by factor.
class ResizeBilinearSubgraph : public Subgraph
{
public:
ResizeBilinearSubgraph()
{
int input = addNodeToMatch("");
int shape = addNodeToMatch("Shape", input);
int stack = addNodeToMatch("Const");
int stack_1 = addNodeToMatch("Const");
int stack_2 = addNodeToMatch("Const");
int strided_slice = addNodeToMatch("StridedSlice", shape, stack, stack_1, stack_2);
int factorY = addNodeToMatch("Const");
int mul = addNodeToMatch("Mul", strided_slice, factorY);
shape = addNodeToMatch("Shape", input);
stack = addNodeToMatch("Const");
stack_1 = addNodeToMatch("Const");
stack_2 = addNodeToMatch("Const");
strided_slice = addNodeToMatch("StridedSlice", shape, stack, stack_1, stack_2);
int factorX = addNodeToMatch("Const");
int mul_1 = addNodeToMatch("Mul", strided_slice, factorX);
int pack = addNodeToMatch("Pack", mul, mul_1);
addNodeToMatch("ResizeBilinear", input, pack);
setFusedNode("ResizeBilinear", input, factorY, factorX);
}
};
void simplifySubgraphs(tensorflow::GraphDef& net) void simplifySubgraphs(tensorflow::GraphDef& net)
{ {
std::vector<Ptr<Subgraph> > subgraphs; std::vector<Ptr<Subgraph> > subgraphs;
@ -551,6 +582,7 @@ void simplifySubgraphs(tensorflow::GraphDef& net)
subgraphs.push_back(Ptr<Subgraph>(new L2NormalizeSubgraph())); subgraphs.push_back(Ptr<Subgraph>(new L2NormalizeSubgraph()));
subgraphs.push_back(Ptr<Subgraph>(new DeconvolutionValidKerasSubgraph())); subgraphs.push_back(Ptr<Subgraph>(new DeconvolutionValidKerasSubgraph()));
subgraphs.push_back(Ptr<Subgraph>(new DeconvolutionSameKerasSubgraph())); subgraphs.push_back(Ptr<Subgraph>(new DeconvolutionSameKerasSubgraph()));
subgraphs.push_back(Ptr<Subgraph>(new ResizeBilinearSubgraph()));
int numNodes = net.node_size(); int numNodes = net.node_size();
std::vector<int> matchedNodesIds; std::vector<int> matchedNodesIds;

View File

@ -767,6 +767,26 @@ void TFImporter::populateNet(Net dstNet)
} }
} }
} }
else if (type == "Sub")
{
bool haveConst = false;
for(int ii = 0; !haveConst && ii < layer.input_size(); ++ii)
{
Pin input = parsePin(layer.input(ii));
haveConst = value_id.find(input.name) != value_id.end();
}
CV_Assert(haveConst);
layerParams.blobs.resize(1);
blobFromTensor(getConstBlob(layer, value_id), layerParams.blobs[0]);
layerParams.blobs[0] *= -1;
int id = dstNet.addLayer(name, "Shift", layerParams);
layer_id[name] = id;
// one input only
connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
}
else if (type == "MatMul") else if (type == "MatMul")
{ {
CV_Assert(layer.input_size() == 2); CV_Assert(layer.input_size() == 2);

View File

@ -147,7 +147,9 @@ TEST_P(DNNTestNetwork, Inception_5h)
TEST_P(DNNTestNetwork, ENet) TEST_P(DNNTestNetwork, ENet)
{ {
if (backend == DNN_BACKEND_INFERENCE_ENGINE) throw SkipTestException(""); if ((backend == DNN_BACKEND_INFERENCE_ENGINE) ||
(backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16))
throw SkipTestException("");
processNet("dnn/Enet-model-best.net", "", Size(512, 512), "l367_Deconvolution", processNet("dnn/Enet-model-best.net", "", Size(512, 512), "l367_Deconvolution",
target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_enet.yml" : target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_enet.yml" :
"dnn/halide_scheduler_enet.yml", "dnn/halide_scheduler_enet.yml",
@ -161,9 +163,11 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe)
throw SkipTestException(""); throw SkipTestException("");
Mat sample = imread(findDataFile("dnn/street.png", false)); Mat sample = imread(findDataFile("dnn/street.png", false));
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false); Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
float l1 = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.0007 : 0.0;
float lInf = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.011 : 0.0;
processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt", processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt",
inp, "detection_out"); inp, "detection_out", "", l1, lInf);
} }
TEST_P(DNNTestNetwork, MobileNet_SSD_TensorFlow) TEST_P(DNNTestNetwork, MobileNet_SSD_TensorFlow)
@ -173,15 +177,17 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_TensorFlow)
throw SkipTestException(""); throw SkipTestException("");
Mat sample = imread(findDataFile("dnn/street.png", false)); Mat sample = imread(findDataFile("dnn/street.png", false));
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false); Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
float l1 = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.008 : 0.0;
float lInf = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.06 : 0.0;
processNet("dnn/ssd_mobilenet_v1_coco.pb", "dnn/ssd_mobilenet_v1_coco.pbtxt", processNet("dnn/ssd_mobilenet_v1_coco.pb", "dnn/ssd_mobilenet_v1_coco.pbtxt",
inp, "detection_out"); inp, "detection_out", "", l1, lInf);
} }
TEST_P(DNNTestNetwork, SSD_VGG16) TEST_P(DNNTestNetwork, SSD_VGG16)
{ {
if (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL || if ((backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ||
backend == DNN_BACKEND_HALIDE && target == DNN_TARGET_CPU || (backend == DNN_BACKEND_HALIDE && target == DNN_TARGET_CPU) ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) (backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU))
throw SkipTestException(""); throw SkipTestException("");
processNet("dnn/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel", processNet("dnn/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel",
"dnn/ssd_vgg16.prototxt", Size(300, 300), "detection_out"); "dnn/ssd_vgg16.prototxt", Size(300, 300), "detection_out");
@ -236,14 +242,17 @@ TEST_P(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
throw SkipTestException(""); throw SkipTestException("");
Mat sample = imread(findDataFile("dnn/street.png", false)); Mat sample = imread(findDataFile("dnn/street.png", false));
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false); Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
float l1 = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.008 : 0.0;
float lInf = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.07 : 0.0;
processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "dnn/ssd_inception_v2_coco_2017_11_17.pbtxt", processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "dnn/ssd_inception_v2_coco_2017_11_17.pbtxt",
inp, "detection_out"); inp, "detection_out", "", l1, lInf);
} }
TEST_P(DNNTestNetwork, DenseNet_121) TEST_P(DNNTestNetwork, DenseNet_121)
{ {
if (backend == DNN_BACKEND_HALIDE || if ((backend == DNN_BACKEND_HALIDE) ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ||
(backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16))
throw SkipTestException(""); throw SkipTestException("");
processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", Size(224, 224), "", "caffe"); processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", Size(224, 224), "", "caffe");
} }
@ -258,7 +267,8 @@ const tuple<DNNBackend, DNNTarget> testCases[] = {
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL), tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16), tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
#endif #endif
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL) tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL_FP16)
}; };
INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, testing::ValuesIn(testCases)); INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, testing::ValuesIn(testCases));

View File

@ -104,7 +104,11 @@ TEST_P(Reproducibility_AlexNet, Accuracy)
ASSERT_FALSE(net.empty()); ASSERT_FALSE(net.empty());
} }
net.setPreferableTarget(get<1>(GetParam())); int targetId = get<1>(GetParam());
const float l1 = 1e-5;
const float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 3e-3 : 1e-4;
net.setPreferableTarget(targetId);
Mat sample = imread(_tf("grace_hopper_227.png")); Mat sample = imread(_tf("grace_hopper_227.png"));
ASSERT_TRUE(!sample.empty()); ASSERT_TRUE(!sample.empty());
@ -112,10 +116,11 @@ TEST_P(Reproducibility_AlexNet, Accuracy)
net.setInput(blobFromImage(sample, 1.0f, Size(227, 227), Scalar(), false), "data"); net.setInput(blobFromImage(sample, 1.0f, Size(227, 227), Scalar(), false), "data");
Mat out = net.forward("prob"); Mat out = net.forward("prob");
Mat ref = blobFromNPY(_tf("caffe_alexnet_prob.npy")); Mat ref = blobFromNPY(_tf("caffe_alexnet_prob.npy"));
normAssert(ref, out); normAssert(ref, out, "", l1, lInf);
} }
INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_AlexNet, Combine(testing::Bool(), availableDnnTargets())); INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_AlexNet, Combine(testing::Bool(),
Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16)));
#if !defined(_WIN32) || defined(_WIN64) #if !defined(_WIN32) || defined(_WIN64)
TEST(Reproducibility_FCN, Accuracy) TEST(Reproducibility_FCN, Accuracy)
@ -176,8 +181,11 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
const string proto = findDataFile("dnn/MobileNetSSD_deploy.prototxt", false); const string proto = findDataFile("dnn/MobileNetSSD_deploy.prototxt", false);
const string model = findDataFile("dnn/MobileNetSSD_deploy.caffemodel", false); const string model = findDataFile("dnn/MobileNetSSD_deploy.caffemodel", false);
Net net = readNetFromCaffe(proto, model); Net net = readNetFromCaffe(proto, model);
int targetId = GetParam();
const float l1 = (targetId == DNN_TARGET_OPENCL_FP16) ? 1.5e-4 : 1e-5;
const float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 4e-4 : 1e-4;
net.setPreferableTarget(GetParam()); net.setPreferableTarget(targetId);
Mat sample = imread(_tf("street.png")); Mat sample = imread(_tf("street.png"));
@ -185,8 +193,10 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
net.setInput(inp); net.setInput(inp);
Mat out = net.forward(); Mat out = net.forward();
const float scores_diff = (targetId == DNN_TARGET_OPENCL_FP16) ? 4e-4 : 1e-5;
const float boxes_iou_diff = (targetId == DNN_TARGET_OPENCL_FP16) ? 5e-3 : 1e-4;
Mat ref = blobFromNPY(_tf("mobilenet_ssd_caffe_out.npy")); Mat ref = blobFromNPY(_tf("mobilenet_ssd_caffe_out.npy"));
normAssertDetections(ref, out); normAssertDetections(ref, out, "", 0.0, scores_diff, boxes_iou_diff);
// Check that detections aren't preserved. // Check that detections aren't preserved.
inp.setTo(0.0f); inp.setTo(0.0f);
@ -212,10 +222,12 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
// a single sample in batch. The first numbers of detection vectors are batch id. // a single sample in batch. The first numbers of detection vectors are batch id.
outBatch = outBatch.reshape(1, outBatch.total() / 7); outBatch = outBatch.reshape(1, outBatch.total() / 7);
EXPECT_EQ(outBatch.rows, 2 * numDetections); EXPECT_EQ(outBatch.rows, 2 * numDetections);
normAssert(outBatch.rowRange(0, numDetections), ref); normAssert(outBatch.rowRange(0, numDetections), ref, "", l1, lInf);
normAssert(outBatch.rowRange(numDetections, 2 * numDetections).colRange(1, 7), ref.colRange(1, 7)); normAssert(outBatch.rowRange(numDetections, 2 * numDetections).colRange(1, 7), ref.colRange(1, 7),
"", l1, lInf);
} }
INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_MobileNet_SSD, availableDnnTargets()); INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_MobileNet_SSD,
Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16));
typedef testing::TestWithParam<DNNTarget> Reproducibility_ResNet50; typedef testing::TestWithParam<DNNTarget> Reproducibility_ResNet50;
TEST_P(Reproducibility_ResNet50, Accuracy) TEST_P(Reproducibility_ResNet50, Accuracy)
@ -226,6 +238,9 @@ TEST_P(Reproducibility_ResNet50, Accuracy)
int targetId = GetParam(); int targetId = GetParam();
net.setPreferableTarget(targetId); net.setPreferableTarget(targetId);
float l1 = (targetId == DNN_TARGET_OPENCL_FP16) ? 3e-5 : 1e-5;
float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 6e-3 : 1e-4;
Mat input = blobFromImage(imread(_tf("googlenet_0.png")), 1.0f, Size(224,224), Scalar(), false); Mat input = blobFromImage(imread(_tf("googlenet_0.png")), 1.0f, Size(224,224), Scalar(), false);
ASSERT_TRUE(!input.empty()); ASSERT_TRUE(!input.empty());
@ -233,20 +248,21 @@ TEST_P(Reproducibility_ResNet50, Accuracy)
Mat out = net.forward(); Mat out = net.forward();
Mat ref = blobFromNPY(_tf("resnet50_prob.npy")); Mat ref = blobFromNPY(_tf("resnet50_prob.npy"));
normAssert(ref, out); normAssert(ref, out, "", l1, lInf);
if (targetId == DNN_TARGET_OPENCL) if (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16)
{ {
UMat out_umat; UMat out_umat;
net.forward(out_umat); net.forward(out_umat);
normAssert(ref, out_umat, "out_umat"); normAssert(ref, out_umat, "out_umat", l1, lInf);
std::vector<UMat> out_umats; std::vector<UMat> out_umats;
net.forward(out_umats); net.forward(out_umats);
normAssert(ref, out_umats[0], "out_umat_vector"); normAssert(ref, out_umats[0], "out_umat_vector", l1, lInf);
} }
} }
INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_ResNet50, availableDnnTargets()); INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_ResNet50,
Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16));
typedef testing::TestWithParam<DNNTarget> Reproducibility_SqueezeNet_v1_1; typedef testing::TestWithParam<DNNTarget> Reproducibility_SqueezeNet_v1_1;
TEST_P(Reproducibility_SqueezeNet_v1_1, Accuracy) TEST_P(Reproducibility_SqueezeNet_v1_1, Accuracy)

View File

@ -295,26 +295,32 @@ TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8)
INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_nets, availableDnnTargets()); INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_nets, availableDnnTargets());
typedef testing::TestWithParam<DNNTarget> Test_TensorFlow_fp16;
TEST_P(Test_TensorFlow_fp16, tests)
{
int targetId = GetParam();
const float l1 = 7e-4;
const float lInf = 1e-2;
runTensorFlowNet("fp16_single_conv", targetId, false, l1, lInf);
runTensorFlowNet("fp16_deconvolution", targetId, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_odd_same", targetId, false, l1, lInf);
runTensorFlowNet("fp16_padding_valid", targetId, false, l1, lInf);
runTensorFlowNet("fp16_eltwise_add_mul", targetId, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_odd_valid", targetId, false, l1, lInf);
runTensorFlowNet("fp16_pad_and_concat", targetId, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_even", targetId, false, l1, lInf);
runTensorFlowNet("fp16_padding_same", targetId, false, l1, lInf);
}
INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_fp16,
Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16));
TEST(Test_TensorFlow, defun) TEST(Test_TensorFlow, defun)
{ {
runTensorFlowNet("defun_dropout"); runTensorFlowNet("defun_dropout");
} }
TEST(Test_TensorFlow, fp16)
{
const float l1 = 1e-3;
const float lInf = 1e-2;
runTensorFlowNet("fp16_single_conv", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_deconvolution", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_odd_same", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_padding_valid", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_eltwise_add_mul", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_odd_valid", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_pad_and_concat", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_even", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_padding_same", DNN_TARGET_CPU, false, l1, lInf);
}
TEST(Test_TensorFlow, quantized) TEST(Test_TensorFlow, quantized)
{ {
runTensorFlowNet("uint8_single_conv"); runTensorFlowNet("uint8_single_conv");
@ -373,10 +379,25 @@ public:
ResizeBilinearLayer(const LayerParams &params) : Layer(params) ResizeBilinearLayer(const LayerParams &params) : Layer(params)
{ {
CV_Assert(!params.get<bool>("align_corners", false)); CV_Assert(!params.get<bool>("align_corners", false));
CV_Assert(blobs.size() == 1, blobs[0].type() == CV_32SC1); CV_Assert(!blobs.empty());
for (size_t i = 0; i < blobs.size(); ++i)
CV_Assert(blobs[i].type() == CV_32SC1);
if (blobs.size() == 1)
{
CV_Assert(blobs[0].total() == 2);
outHeight = blobs[0].at<int>(0, 0); outHeight = blobs[0].at<int>(0, 0);
outWidth = blobs[0].at<int>(0, 1); outWidth = blobs[0].at<int>(0, 1);
} }
else
{
CV_Assert(blobs.size() == 2, blobs[0].total() == 1, blobs[1].total() == 1);
factorHeight = blobs[0].at<int>(0, 0);
factorWidth = blobs[1].at<int>(0, 0);
outHeight = outWidth = 0;
}
}
static Ptr<Layer> create(LayerParams& params) static Ptr<Layer> create(LayerParams& params)
{ {
@ -391,12 +412,21 @@ public:
std::vector<int> outShape(4); std::vector<int> outShape(4);
outShape[0] = inputs[0][0]; // batch size outShape[0] = inputs[0][0]; // batch size
outShape[1] = inputs[0][1]; // number of channels outShape[1] = inputs[0][1]; // number of channels
outShape[2] = outHeight; outShape[2] = outHeight != 0 ? outHeight : (inputs[0][2] * factorHeight);
outShape[3] = outWidth; outShape[3] = outWidth != 0 ? outWidth : (inputs[0][3] * factorWidth);
outputs.assign(1, outShape); outputs.assign(1, outShape);
return false; return false;
} }
virtual void finalize(const std::vector<Mat*>& inputs, std::vector<Mat> &outputs) CV_OVERRIDE
{
if (!outWidth && !outHeight)
{
outHeight = outputs[0].size[2];
outWidth = outputs[0].size[3];
}
}
// This implementation is based on a reference implementation from // This implementation is based on a reference implementation from
// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
virtual void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals) CV_OVERRIDE virtual void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals) CV_OVERRIDE
@ -447,13 +477,51 @@ private:
return x + size[3] * (y + size[2] * (c + size[1] * b)); return x + size[3] * (y + size[2] * (c + size[1] * b));
} }
int outWidth, outHeight; int outWidth, outHeight, factorWidth, factorHeight;
}; };
TEST(Test_TensorFlow, resize_bilinear) TEST(Test_TensorFlow, resize_bilinear)
{ {
CV_DNN_REGISTER_LAYER_CLASS(ResizeBilinear, ResizeBilinearLayer); CV_DNN_REGISTER_LAYER_CLASS(ResizeBilinear, ResizeBilinearLayer);
runTensorFlowNet("resize_bilinear"); runTensorFlowNet("resize_bilinear");
runTensorFlowNet("resize_bilinear_factor");
LayerFactory::unregisterLayer("ResizeBilinear");
}
// inp = cv.imread('opencv_extra/testdata/cv/ximgproc/sources/08.png')
// inp = inp[:,:,[2, 1, 0]].astype(np.float32).reshape(1, 512, 512, 3)
// outs = sess.run([sess.graph.get_tensor_by_name('feature_fusion/Conv_7/Sigmoid:0'),
// sess.graph.get_tensor_by_name('feature_fusion/concat_3:0')],
// feed_dict={'input_images:0': inp})
// scores = np.ascontiguousarray(outs[0].transpose(0, 3, 1, 2))
// geometry = np.ascontiguousarray(outs[1].transpose(0, 3, 1, 2))
// np.save('east_text_detection.scores.npy', scores)
// np.save('east_text_detection.geometry.npy', geometry)
TEST(Test_TensorFlow, EAST_text_detection)
{
CV_DNN_REGISTER_LAYER_CLASS(ResizeBilinear, ResizeBilinearLayer);
std::string netPath = findDataFile("dnn/frozen_east_text_detection.pb", false);
std::string imgPath = findDataFile("cv/ximgproc/sources/08.png", false);
std::string refScoresPath = findDataFile("dnn/east_text_detection.scores.npy", false);
std::string refGeometryPath = findDataFile("dnn/east_text_detection.geometry.npy", false);
Net net = readNet(findDataFile("dnn/frozen_east_text_detection.pb", false));
Mat img = imread(imgPath);
Mat inp = blobFromImage(img, 1.0, Size(), Scalar(123.68, 116.78, 103.94), true, false);
net.setInput(inp);
std::vector<Mat> outs;
std::vector<String> outNames(2);
outNames[0] = "feature_fusion/Conv_7/Sigmoid";
outNames[1] = "feature_fusion/concat_3";
net.forward(outs, outNames);
Mat scores = outs[0];
Mat geometry = outs[1];
normAssert(scores, blobFromNPY(refScoresPath), "scores");
normAssert(geometry, blobFromNPY(refGeometryPath), "geometry", 5e-5, 1e-3);
LayerFactory::unregisterLayer("ResizeBilinear"); LayerFactory::unregisterLayer("ResizeBilinear");
} }

View File

@ -219,13 +219,15 @@ int rotatedRectangleIntersection( const RotatedRect& rect1, const RotatedRect& r
} }
} }
// Get rid of dupes // Get rid of dupes and order points.
for( int i = 0; i < (int)intersection.size()-1; i++ ) for( int i = 0; i < (int)intersection.size()-1; i++ )
{ {
float dx1 = intersection[i + 1].x - intersection[i].x;
float dy1 = intersection[i + 1].y - intersection[i].y;
for( size_t j = i+1; j < intersection.size(); j++ ) for( size_t j = i+1; j < intersection.size(); j++ )
{ {
float dx = intersection[i].x - intersection[j].x; float dx = intersection[j].x - intersection[i].x;
float dy = intersection[i].y - intersection[j].y; float dy = intersection[j].y - intersection[i].y;
double d2 = dx*dx + dy*dy; // can be a really small number, need double here double d2 = dx*dx + dy*dy; // can be a really small number, need double here
if( d2 < samePointEps*samePointEps ) if( d2 < samePointEps*samePointEps )
@ -235,6 +237,12 @@ int rotatedRectangleIntersection( const RotatedRect& rect1, const RotatedRect& r
intersection.pop_back(); intersection.pop_back();
j--; // restart check j--; // restart check
} }
else if (dx1 * dy - dy1 * dx < 0)
{
std::swap(intersection[i + 1], intersection[j]);
dx1 = dx;
dy1 = dy;
}
} }
} }

View File

@ -66,8 +66,27 @@ private:
void test7(); void test7();
void test8(); void test8();
void test9(); void test9();
void test10();
void test11();
void test12();
void test13();
void test14();
}; };
static void compare(const std::vector<Point2f>& test, const std::vector<Point2f>& target)
{
ASSERT_EQ(test.size(), target.size());
ASSERT_TRUE(test.size() < 4 || isContourConvex(test));
ASSERT_TRUE(target.size() < 4 || isContourConvex(target));
for( size_t i = 0; i < test.size(); i++ )
{
double dx = test[i].x - target[i].x;
double dy = test[i].y - target[i].y;
double r = sqrt(dx*dx + dy*dy);
ASSERT_LT(r, ACCURACY);
}
}
void CV_RotatedRectangleIntersectionTest::run(int) void CV_RotatedRectangleIntersectionTest::run(int)
{ {
// See pics/intersection.png for the scenarios we are testing // See pics/intersection.png for the scenarios we are testing
@ -92,28 +111,20 @@ void CV_RotatedRectangleIntersectionTest::run(int)
test7(); test7();
test8(); test8();
test9(); test9();
test10();
test11();
test12();
test13();
test14();
} }
void CV_RotatedRectangleIntersectionTest::test1() void CV_RotatedRectangleIntersectionTest::test1()
{ {
// no intersection // no intersection
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 12.0f);
RotatedRect rect1, rect2; RotatedRect rect2(Point2f(10, 10), Size2f(2, 2), 34.0f);
rect1.center.x = 0;
rect1.center.y = 0;
rect1.size.width = 2;
rect1.size.height = 2;
rect1.angle = 12.0f;
rect2.center.x = 10;
rect2.center.y = 10;
rect2.size.width = 2;
rect2.size.height = 2;
rect2.angle = 34.0f;
vector<Point2f> vertices; vector<Point2f> vertices;
int ret = rotatedRectangleIntersection(rect1, rect2, vertices); int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
CV_Assert(ret == INTERSECT_NONE); CV_Assert(ret == INTERSECT_NONE);
@ -123,375 +134,243 @@ void CV_RotatedRectangleIntersectionTest::test1()
void CV_RotatedRectangleIntersectionTest::test2() void CV_RotatedRectangleIntersectionTest::test2()
{ {
// partial intersection, rectangles translated // partial intersection, rectangles translated
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
RotatedRect rect1, rect2; RotatedRect rect2(Point2f(1, 1), Size2f(2, 2), 0.0f);
rect1.center.x = 0;
rect1.center.y = 0;
rect1.size.width = 2;
rect1.size.height = 2;
rect1.angle = 0;
rect2.center.x = 1;
rect2.center.y = 1;
rect2.size.width = 2;
rect2.size.height = 2;
rect2.angle = 0;
vector<Point2f> vertices; vector<Point2f> vertices;
int ret = rotatedRectangleIntersection(rect1, rect2, vertices); int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
CV_Assert(ret == INTERSECT_PARTIAL); CV_Assert(ret == INTERSECT_PARTIAL);
CV_Assert(vertices.size() == 4);
vector<Point2f> possibleVertices(4); vector<Point2f> targetVertices(4);
targetVertices[0] = Point2f(1.0f, 0.0f);
possibleVertices[0] = Point2f(0.0f, 0.0f); targetVertices[1] = Point2f(1.0f, 1.0f);
possibleVertices[1] = Point2f(1.0f, 1.0f); targetVertices[2] = Point2f(0.0f, 1.0f);
possibleVertices[2] = Point2f(0.0f, 1.0f); targetVertices[3] = Point2f(0.0f, 0.0f);
possibleVertices[3] = Point2f(1.0f, 0.0f); compare(vertices, targetVertices);
for( size_t i = 0; i < vertices.size(); i++ )
{
double bestR = DBL_MAX;
for( size_t j = 0; j < possibleVertices.size(); j++ )
{
double dx = vertices[i].x - possibleVertices[j].x;
double dy = vertices[i].y - possibleVertices[j].y;
double r = sqrt(dx*dx + dy*dy);
bestR = std::min(bestR, r);
}
CV_Assert(bestR < ACCURACY);
}
} }
void CV_RotatedRectangleIntersectionTest::test3() void CV_RotatedRectangleIntersectionTest::test3()
{ {
// partial intersection, rectangles rotated 45 degree on the corner, forms a triangle intersection // partial intersection, rectangles rotated 45 degree on the corner, forms a triangle intersection
RotatedRect rect1, rect2; RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
RotatedRect rect2(Point2f(1, 1), Size2f(sqrt(2.0f), 20), 45.0f);
rect1.center.x = 0;
rect1.center.y = 0;
rect1.size.width = 2;
rect1.size.height = 2;
rect1.angle = 0;
rect2.center.x = 1;
rect2.center.y = 1;
rect2.size.width = sqrt(2.0f);
rect2.size.height = 20;
rect2.angle = 45.0f;
vector<Point2f> vertices; vector<Point2f> vertices;
int ret = rotatedRectangleIntersection(rect1, rect2, vertices); int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
CV_Assert(ret == INTERSECT_PARTIAL); CV_Assert(ret == INTERSECT_PARTIAL);
CV_Assert(vertices.size() == 3);
vector<Point2f> possibleVertices(3); vector<Point2f> targetVertices(3);
targetVertices[0] = Point2f(1.0f, 0.0f);
possibleVertices[0] = Point2f(1.0f, 1.0f); targetVertices[1] = Point2f(1.0f, 1.0f);
possibleVertices[1] = Point2f(0.0f, 1.0f); targetVertices[2] = Point2f(0.0f, 1.0f);
possibleVertices[2] = Point2f(1.0f, 0.0f); compare(vertices, targetVertices);
for( size_t i = 0; i < vertices.size(); i++ )
{
double bestR = DBL_MAX;
for( size_t j = 0; j < possibleVertices.size(); j++ )
{
double dx = vertices[i].x - possibleVertices[j].x;
double dy = vertices[i].y - possibleVertices[j].y;
double r = sqrt(dx*dx + dy*dy);
bestR = std::min(bestR, r);
}
CV_Assert(bestR < ACCURACY);
}
} }
void CV_RotatedRectangleIntersectionTest::test4() void CV_RotatedRectangleIntersectionTest::test4()
{ {
// full intersection, rectangles of same size directly on top of each other // full intersection, rectangles of same size directly on top of each other
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
RotatedRect rect1, rect2; RotatedRect rect2(Point2f(0, 0), Size2f(2, 2), 0.0f);
rect1.center.x = 0;
rect1.center.y = 0;
rect1.size.width = 2;
rect1.size.height = 2;
rect1.angle = 0;
rect2.center.x = 0;
rect2.center.y = 0;
rect2.size.width = 2;
rect2.size.height = 2;
rect2.angle = 0;
vector<Point2f> vertices; vector<Point2f> vertices;
int ret = rotatedRectangleIntersection(rect1, rect2, vertices); int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
CV_Assert(ret == INTERSECT_FULL); CV_Assert(ret == INTERSECT_FULL);
CV_Assert(vertices.size() == 4);
vector<Point2f> possibleVertices(4); vector<Point2f> targetVertices(4);
targetVertices[0] = Point2f(-1.0f, 1.0f);
possibleVertices[0] = Point2f(-1.0f, 1.0f); targetVertices[1] = Point2f(-1.0f, -1.0f);
possibleVertices[1] = Point2f(1.0f, -1.0f); targetVertices[2] = Point2f(1.0f, -1.0f);
possibleVertices[2] = Point2f(-1.0f, -1.0f); targetVertices[3] = Point2f(1.0f, 1.0f);
possibleVertices[3] = Point2f(1.0f, 1.0f); compare(vertices, targetVertices);
for( size_t i = 0; i < vertices.size(); i++ )
{
double bestR = DBL_MAX;
for( size_t j = 0; j < possibleVertices.size(); j++ )
{
double dx = vertices[i].x - possibleVertices[j].x;
double dy = vertices[i].y - possibleVertices[j].y;
double r = sqrt(dx*dx + dy*dy);
bestR = std::min(bestR, r);
}
CV_Assert(bestR < ACCURACY);
}
} }
void CV_RotatedRectangleIntersectionTest::test5() void CV_RotatedRectangleIntersectionTest::test5()
{ {
// partial intersection, rectangle on top rotated 45 degrees // partial intersection, rectangle on top rotated 45 degrees
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
RotatedRect rect1, rect2; RotatedRect rect2(Point2f(0, 0), Size2f(2, 2), 45.0f);
rect1.center.x = 0;
rect1.center.y = 0;
rect1.size.width = 2;
rect1.size.height = 2;
rect1.angle = 0;
rect2.center.x = 0;
rect2.center.y = 0;
rect2.size.width = 2;
rect2.size.height = 2;
rect2.angle = 45.0f;
vector<Point2f> vertices; vector<Point2f> vertices;
int ret = rotatedRectangleIntersection(rect1, rect2, vertices); int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
CV_Assert(ret == INTERSECT_PARTIAL); CV_Assert(ret == INTERSECT_PARTIAL);
CV_Assert(vertices.size() == 8);
vector<Point2f> possibleVertices(8); vector<Point2f> targetVertices(8);
targetVertices[0] = Point2f(-1.0f, -0.414214f);
possibleVertices[0] = Point2f(-1.0f, -0.414214f); targetVertices[1] = Point2f(-0.414214f, -1.0f);
possibleVertices[1] = Point2f(-1.0f, 0.414214f); targetVertices[2] = Point2f(0.414214f, -1.0f);
possibleVertices[2] = Point2f(-0.414214f, -1.0f); targetVertices[3] = Point2f(1.0f, -0.414214f);
possibleVertices[3] = Point2f(0.414214f, -1.0f); targetVertices[4] = Point2f(1.0f, 0.414214f);
possibleVertices[4] = Point2f(1.0f, -0.414214f); targetVertices[5] = Point2f(0.414214f, 1.0f);
possibleVertices[5] = Point2f(1.0f, 0.414214f); targetVertices[6] = Point2f(-0.414214f, 1.0f);
possibleVertices[6] = Point2f(0.414214f, 1.0f); targetVertices[7] = Point2f(-1.0f, 0.414214f);
possibleVertices[7] = Point2f(-0.414214f, 1.0f); compare(vertices, targetVertices);
for( size_t i = 0; i < vertices.size(); i++ )
{
double bestR = DBL_MAX;
for( size_t j = 0; j < possibleVertices.size(); j++ )
{
double dx = vertices[i].x - possibleVertices[j].x;
double dy = vertices[i].y - possibleVertices[j].y;
double r = sqrt(dx*dx + dy*dy);
bestR = std::min(bestR, r);
}
CV_Assert(bestR < ACCURACY);
}
} }
void CV_RotatedRectangleIntersectionTest::test6() void CV_RotatedRectangleIntersectionTest::test6()
{ {
// 6 - partial intersection, rectangle on top of different size // 6 - partial intersection, rectangle on top of different size
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
RotatedRect rect1, rect2; RotatedRect rect2(Point2f(0, 0), Size2f(2, 10), 0.0f);
rect1.center.x = 0;
rect1.center.y = 0;
rect1.size.width = 2;
rect1.size.height = 2;
rect1.angle = 0;
rect2.center.x = 0;
rect2.center.y = 0;
rect2.size.width = 2;
rect2.size.height = 10;
rect2.angle = 0;
vector<Point2f> vertices; vector<Point2f> vertices;
int ret = rotatedRectangleIntersection(rect1, rect2, vertices); int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
CV_Assert(ret == INTERSECT_PARTIAL); CV_Assert(ret == INTERSECT_PARTIAL);
CV_Assert(vertices.size() == 4);
vector<Point2f> possibleVertices(4); vector<Point2f> targetVertices(4);
targetVertices[0] = Point2f(-1.0f, -1.0f);
possibleVertices[0] = Point2f(1.0f, 1.0f); targetVertices[1] = Point2f(1.0f, -1.0f);
possibleVertices[1] = Point2f(1.0f, -1.0f); targetVertices[2] = Point2f(1.0f, 1.0f);
possibleVertices[2] = Point2f(-1.0f, -1.0f); targetVertices[3] = Point2f(-1.0f, 1.0f);
possibleVertices[3] = Point2f(-1.0f, 1.0f); compare(vertices, targetVertices);
for( size_t i = 0; i < vertices.size(); i++ )
{
double bestR = DBL_MAX;
for( size_t j = 0; j < possibleVertices.size(); j++ )
{
double dx = vertices[i].x - possibleVertices[j].x;
double dy = vertices[i].y - possibleVertices[j].y;
double r = sqrt(dx*dx + dy*dy);
bestR = std::min(bestR, r);
}
CV_Assert(bestR < ACCURACY);
}
} }
void CV_RotatedRectangleIntersectionTest::test7() void CV_RotatedRectangleIntersectionTest::test7()
{ {
// full intersection, rectangle fully enclosed in the other // full intersection, rectangle fully enclosed in the other
RotatedRect rect1(Point2f(0, 0), Size2f(12.34f, 56.78f), 0.0f);
RotatedRect rect1, rect2; RotatedRect rect2(Point2f(0, 0), Size2f(2, 2), 0.0f);
rect1.center.x = 0;
rect1.center.y = 0;
rect1.size.width = 12.34f;
rect1.size.height = 56.78f;
rect1.angle = 0;
rect2.center.x = 0;
rect2.center.y = 0;
rect2.size.width = 2;
rect2.size.height = 2;
rect2.angle = 0;
vector<Point2f> vertices; vector<Point2f> vertices;
int ret = rotatedRectangleIntersection(rect1, rect2, vertices); int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
CV_Assert(ret == INTERSECT_FULL); CV_Assert(ret == INTERSECT_FULL);
CV_Assert(vertices.size() == 4);
vector<Point2f> possibleVertices(4); vector<Point2f> targetVertices(4);
targetVertices[0] = Point2f(-1.0f, 1.0f);
possibleVertices[0] = Point2f(1.0f, 1.0f); targetVertices[1] = Point2f(-1.0f, -1.0f);
possibleVertices[1] = Point2f(1.0f, -1.0f); targetVertices[2] = Point2f(1.0f, -1.0f);
possibleVertices[2] = Point2f(-1.0f, -1.0f); targetVertices[3] = Point2f(1.0f, 1.0f);
possibleVertices[3] = Point2f(-1.0f, 1.0f); compare(vertices, targetVertices);
for( size_t i = 0; i < vertices.size(); i++ )
{
double bestR = DBL_MAX;
for( size_t j = 0; j < possibleVertices.size(); j++ )
{
double dx = vertices[i].x - possibleVertices[j].x;
double dy = vertices[i].y - possibleVertices[j].y;
double r = sqrt(dx*dx + dy*dy);
bestR = std::min(bestR, r);
}
CV_Assert(bestR < ACCURACY);
}
} }
void CV_RotatedRectangleIntersectionTest::test8() void CV_RotatedRectangleIntersectionTest::test8()
{ {
// full intersection, rectangle fully enclosed in the other // intersection by a single vertex
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
RotatedRect rect1, rect2; RotatedRect rect2(Point2f(2, 2), Size2f(2, 2), 0.0f);
rect1.center.x = 0;
rect1.center.y = 0;
rect1.size.width = 2;
rect1.size.height = 2;
rect1.angle = 0;
rect2.center.x = 2;
rect2.center.y = 2;
rect2.size.width = 2;
rect2.size.height = 2;
rect2.angle = 0;
vector<Point2f> vertices; vector<Point2f> vertices;
int ret = rotatedRectangleIntersection(rect1, rect2, vertices); int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
CV_Assert(ret == INTERSECT_PARTIAL); CV_Assert(ret == INTERSECT_PARTIAL);
CV_Assert(vertices.size() == 1); compare(vertices, vector<Point2f>(1, Point2f(1.0f, 1.0f)));
double dx = vertices[0].x - 1;
double dy = vertices[0].y - 1;
double r = sqrt(dx*dx + dy*dy);
CV_Assert(r < ACCURACY);
} }
void CV_RotatedRectangleIntersectionTest::test9() void CV_RotatedRectangleIntersectionTest::test9()
{ {
// full intersection, rectangle fully enclosed in the other // full intersection, rectangle fully enclosed in the other
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
RotatedRect rect1, rect2; RotatedRect rect2(Point2f(2, 0), Size2f(2, 123.45f), 0.0f);
rect1.center.x = 0;
rect1.center.y = 0;
rect1.size.width = 2;
rect1.size.height = 2;
rect1.angle = 0;
rect2.center.x = 2;
rect2.center.y = 0;
rect2.size.width = 2;
rect2.size.height = 123.45f;
rect2.angle = 0;
vector<Point2f> vertices; vector<Point2f> vertices;
int ret = rotatedRectangleIntersection(rect1, rect2, vertices); int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
CV_Assert(ret == INTERSECT_PARTIAL); CV_Assert(ret == INTERSECT_PARTIAL);
CV_Assert(vertices.size() == 2);
vector<Point2f> possibleVertices(2); vector<Point2f> targetVertices(2);
targetVertices[0] = Point2f(1.0f, -1.0f);
possibleVertices[0] = Point2f(1.0f, 1.0f); targetVertices[1] = Point2f(1.0f, 1.0f);
possibleVertices[1] = Point2f(1.0f, -1.0f); compare(vertices, targetVertices);
for( size_t i = 0; i < vertices.size(); i++ )
{
double bestR = DBL_MAX;
for( size_t j = 0; j < possibleVertices.size(); j++ )
{
double dx = vertices[i].x - possibleVertices[j].x;
double dy = vertices[i].y - possibleVertices[j].y;
double r = sqrt(dx*dx + dy*dy);
bestR = std::min(bestR, r);
} }
CV_Assert(bestR < ACCURACY); void CV_RotatedRectangleIntersectionTest::test10()
{
// three points of rect2 are inside rect1.
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
RotatedRect rect2(Point2f(0, 0.5), Size2f(1, 1), 45.0f);
vector<Point2f> vertices;
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
CV_Assert(ret == INTERSECT_PARTIAL);
vector<Point2f> targetVertices(5);
targetVertices[0] = Point2f(0.207107f, 1.0f);
targetVertices[1] = Point2f(-0.207107f, 1.0f);
targetVertices[2] = Point2f(-0.707107f, 0.5f);
targetVertices[3] = Point2f(0.0f, -0.207107f);
targetVertices[4] = Point2f(0.707107f, 0.5f);
compare(vertices, targetVertices);
}
void CV_RotatedRectangleIntersectionTest::test11()
{
RotatedRect rect1(Point2f(0, 0), Size2f(4, 2), 0.0f);
RotatedRect rect2(Point2f(0, 0), Size2f(2, 2), -45.0f);
vector<Point2f> vertices;
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
CV_Assert(ret == INTERSECT_PARTIAL);
vector<Point2f> targetVertices(6);
targetVertices[0] = Point2f(-0.414214f, -1.0f);
targetVertices[1] = Point2f(0.414213f, -1.0f);
targetVertices[2] = Point2f(1.41421f, 0.0f);
targetVertices[3] = Point2f(0.414214f, 1.0f);
targetVertices[4] = Point2f(-0.414213f, 1.0f);
targetVertices[5] = Point2f(-1.41421f, 0.0f);
compare(vertices, targetVertices);
}
void CV_RotatedRectangleIntersectionTest::test12()
{
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
RotatedRect rect2(Point2f(0, 1), Size2f(1, 1), 0.0f);
vector<Point2f> vertices;
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
CV_Assert(ret == INTERSECT_PARTIAL);
vector<Point2f> targetVertices(4);
targetVertices[0] = Point2f(-0.5f, 1.0f);
targetVertices[1] = Point2f(-0.5f, 0.5f);
targetVertices[2] = Point2f(0.5f, 0.5f);
targetVertices[3] = Point2f(0.5f, 1.0f);
compare(vertices, targetVertices);
}
void CV_RotatedRectangleIntersectionTest::test13()
{
RotatedRect rect1(Point2f(0, 0), Size2f(1, 3), 0.0f);
RotatedRect rect2(Point2f(0, 1), Size2f(3, 1), 0.0f);
vector<Point2f> vertices;
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
CV_Assert(ret == INTERSECT_PARTIAL);
vector<Point2f> targetVertices(4);
targetVertices[0] = Point2f(-0.5f, 0.5f);
targetVertices[1] = Point2f(0.5f, 0.5f);
targetVertices[2] = Point2f(0.5f, 1.5f);
targetVertices[3] = Point2f(-0.5f, 1.5f);
compare(vertices, targetVertices);
}
void CV_RotatedRectangleIntersectionTest::test14()
{
const int kNumTests = 100;
const int kWidth = 5;
const int kHeight = 5;
RotatedRect rects[2];
std::vector<Point2f> inter;
for (int i = 0; i < kNumTests; ++i)
{
for (int j = 0; j < 2; ++j)
{
rects[j].center = Point2f((float)(rand() % kWidth), (float)(rand() % kHeight));
rects[j].size = Size2f(rand() % kWidth + 1.0f, rand() % kHeight + 1.0f);
rects[j].angle = (float)(rand() % 360);
}
rotatedRectangleIntersection(rects[0], rects[1], inter);
ASSERT_TRUE(inter.size() < 4 || isContourConvex(inter));
} }
} }

View File

@ -420,4 +420,18 @@ void CV_ThreshTest::prepare_to_validation( int /*test_case_idx*/ )
TEST(Imgproc_Threshold, accuracy) { CV_ThreshTest test; test.safe_run(); } TEST(Imgproc_Threshold, accuracy) { CV_ThreshTest test; test.safe_run(); }
BIGDATA_TEST(Imgproc_Threshold, huge)
{
Mat m(65000, 40000, CV_8U);
ASSERT_FALSE(m.isContinuous());
uint64 i, n = (uint64)m.rows*m.cols;
for( i = 0; i < n; i++ )
m.data[i] = (uchar)(i & 255);
cv::threshold(m, m, 127, 255, cv::THRESH_BINARY);
int nz = cv::countNonZero(m); // FIXIT 'int' is not enough here (overflow is possible with other inputs)
ASSERT_EQ((uint64)nz, n / 2);
}
}} // namespace }} // namespace

View File

@ -251,13 +251,15 @@ void Cloning::initVariables(const Mat &destination, const Mat &binaryMask)
//init of the filters used in the dst //init of the filters used in the dst
const int w = destination.cols; const int w = destination.cols;
filter_X.resize(w - 2); filter_X.resize(w - 2);
double scale = CV_PI / (w - 1);
for(int i = 0 ; i < w-2 ; ++i) for(int i = 0 ; i < w-2 ; ++i)
filter_X[i] = 2.0f * std::cos(static_cast<float>(CV_PI) * (i + 1) / (w - 1)); filter_X[i] = 2.0f * (float)std::cos(scale * (i + 1));
const int h = destination.rows; const int h = destination.rows;
filter_Y.resize(h - 2); filter_Y.resize(h - 2);
scale = CV_PI / (h - 1);
for(int j = 0 ; j < h - 2 ; ++j) for(int j = 0 ; j < h - 2 ; ++j)
filter_Y[j] = 2.0f * std::cos(static_cast<float>(CV_PI) * (j + 1) / (h - 1)); filter_Y[j] = 2.0f * (float)std::cos(scale * (j + 1));
} }
void Cloning::computeDerivatives(const Mat& destination, const Mat &patch, const Mat &binaryMask) void Cloning::computeDerivatives(const Mat& destination, const Mat &patch, const Mat &binaryMask)

View File

@ -53,7 +53,7 @@ namespace opencv_test { namespace {
#define SAVE(x) #define SAVE(x)
#endif #endif
static const double numerical_precision = 1000.; static const double numerical_precision = 0.05; // 95% of pixels should have exact values
TEST(Photo_SeamlessClone_normal, regression) TEST(Photo_SeamlessClone_normal, regression)
{ {
@ -82,8 +82,10 @@ TEST(Photo_SeamlessClone_normal, regression)
SAVE(result); SAVE(result);
double error = cvtest::norm(reference, result, NORM_L1); double errorINF = cvtest::norm(reference, result, NORM_INF);
EXPECT_LE(error, numerical_precision); EXPECT_LE(errorINF, 1);
double errorL1 = cvtest::norm(reference, result, NORM_L1);
EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size();
} }
TEST(Photo_SeamlessClone_mixed, regression) TEST(Photo_SeamlessClone_mixed, regression)
@ -113,9 +115,10 @@ TEST(Photo_SeamlessClone_mixed, regression)
Mat reference = imread(reference_path); Mat reference = imread(reference_path);
ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path; ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
double error = cvtest::norm(reference, result, NORM_L1); double errorINF = cvtest::norm(reference, result, NORM_INF);
EXPECT_LE(error, numerical_precision); EXPECT_LE(errorINF, 1);
double errorL1 = cvtest::norm(reference, result, NORM_L1);
EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size();
} }
TEST(Photo_SeamlessClone_featureExchange, regression) TEST(Photo_SeamlessClone_featureExchange, regression)
@ -145,9 +148,10 @@ TEST(Photo_SeamlessClone_featureExchange, regression)
Mat reference = imread(reference_path); Mat reference = imread(reference_path);
ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path; ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
double error = cvtest::norm(reference, result, NORM_L1); double errorINF = cvtest::norm(reference, result, NORM_INF);
EXPECT_LE(error, numerical_precision); EXPECT_LE(errorINF, 1);
double errorL1 = cvtest::norm(reference, result, NORM_L1);
EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size();
} }
TEST(Photo_SeamlessClone_colorChange, regression) TEST(Photo_SeamlessClone_colorChange, regression)
@ -171,9 +175,10 @@ TEST(Photo_SeamlessClone_colorChange, regression)
Mat reference = imread(reference_path); Mat reference = imread(reference_path);
ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path; ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
double error = cvtest::norm(reference, result, NORM_L1); double errorINF = cvtest::norm(reference, result, NORM_INF);
EXPECT_LE(error, numerical_precision); EXPECT_LE(errorINF, 1);
double errorL1 = cvtest::norm(reference, result, NORM_L1);
EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size();
} }
TEST(Photo_SeamlessClone_illuminationChange, regression) TEST(Photo_SeamlessClone_illuminationChange, regression)
@ -195,9 +200,12 @@ TEST(Photo_SeamlessClone_illuminationChange, regression)
SAVE(result); SAVE(result);
Mat reference = imread(reference_path); Mat reference = imread(reference_path);
double error = cvtest::norm(reference, result, NORM_L1); ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
EXPECT_LE(error, numerical_precision);
double errorINF = cvtest::norm(reference, result, NORM_INF);
EXPECT_LE(errorINF, 1);
double errorL1 = cvtest::norm(reference, result, NORM_L1);
EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size();
} }
TEST(Photo_SeamlessClone_textureFlattening, regression) TEST(Photo_SeamlessClone_textureFlattening, regression)
@ -221,9 +229,10 @@ TEST(Photo_SeamlessClone_textureFlattening, regression)
Mat reference = imread(reference_path); Mat reference = imread(reference_path);
ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path; ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
double error = cvtest::norm(reference, result, NORM_L1); double errorINF = cvtest::norm(reference, result, NORM_INF);
EXPECT_LE(error, numerical_precision); EXPECT_LE(errorINF, 1);
double errorL1 = cvtest::norm(reference, result, NORM_L1);
EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size();
} }
}} // namespace }} // namespace

View File

@ -661,7 +661,7 @@ void MultiBandBlender::blend(InputOutputArray dst, InputOutputArray dst_mask)
} }
// Set destination Mats to 0 so new image can be blended // Set destination Mats to 0 so new image can be blended
for (size_t i = 0; i < num_bands_ + 1; ++i) for (size_t i = 0; i < (size_t)(num_bands_ + 1); ++i)
{ {
gpu_dst_band_weights_[i].setTo(0); gpu_dst_band_weights_[i].setTo(0);
gpu_dst_pyr_laplace_[i].setTo(Scalar::all(0)); gpu_dst_pyr_laplace_[i].setTo(Scalar::all(0));

View File

@ -11,6 +11,7 @@
namespace cvtest { namespace cvtest {
void checkIppStatus(); void checkIppStatus();
extern bool skipUnstableTests; extern bool skipUnstableTests;
extern bool runBigDataTests;
extern int testThreads; extern int testThreads;
} }
@ -43,7 +44,7 @@ extern int testThreads;
#undef TEST #undef TEST
#define TEST(test_case_name, test_name) \ #define TEST_(test_case_name, test_name, BODY_IMPL) \
class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public ::testing::Test {\ class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public ::testing::Test {\
public:\ public:\
GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\ GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\
@ -65,9 +66,37 @@ extern int testThreads;
::testing::Test::TearDownTestCase, \ ::testing::Test::TearDownTestCase, \
new ::testing::internal::TestFactoryImpl<\ new ::testing::internal::TestFactoryImpl<\
GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\ GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\
void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() CV__TEST_BODY_IMPL( #test_case_name "_" #test_name ) \ void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() BODY_IMPL( #test_case_name "_" #test_name ) \
void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::Body() void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::Body()
#define TEST(test_case_name, test_name) TEST_(test_case_name, test_name, CV__TEST_BODY_IMPL)
#define CV__TEST_BIGDATA_BODY_IMPL(name) \
{ \
if (!cvtest::runBigDataTests) \
{ \
printf("[ SKIP ] BigData tests are disabled\n"); \
return; \
} \
CV__TRACE_APP_FUNCTION_NAME(name); \
try { \
CV__TEST_INIT \
Body(); \
CV__TEST_CLEANUP \
} \
catch (cvtest::SkipTestException& e) \
{ \
printf("[ SKIP ] %s\n", e.what()); \
} \
} \
// Special type of tests which require / use or validate processing of huge amount of data (>= 2Gb)
#if defined(_M_X64) || defined(__x86_64__) || defined(__aarch64__)
#define BIGDATA_TEST(test_case_name, test_name) TEST_(BigData_ ## test_case_name, test_name, CV__TEST_BIGDATA_BODY_IMPL)
#else
#define BIGDATA_TEST(test_case_name, test_name) TEST_(BigData_ ## test_case_name, DISABLED_ ## test_name, CV__TEST_BIGDATA_BODY_IMPL)
#endif
#undef TEST_F #undef TEST_F
#define TEST_F(test_fixture, test_name)\ #define TEST_F(test_fixture, test_name)\
class GTEST_TEST_CLASS_NAME_(test_fixture, test_name) : public test_fixture {\ class GTEST_TEST_CLASS_NAME_(test_fixture, test_name) : public test_fixture {\

View File

@ -699,6 +699,7 @@ void checkIppStatus()
} }
bool skipUnstableTests = false; bool skipUnstableTests = false;
bool runBigDataTests = false;
int testThreads = 0; int testThreads = 0;
void parseCustomOptions(int argc, char **argv) void parseCustomOptions(int argc, char **argv)
@ -708,6 +709,7 @@ void parseCustomOptions(int argc, char **argv)
"{ test_seed |809564 |seed for random numbers generator }" "{ test_seed |809564 |seed for random numbers generator }"
"{ test_threads |-1 |the number of worker threads, if parallel execution is enabled}" "{ test_threads |-1 |the number of worker threads, if parallel execution is enabled}"
"{ skip_unstable |false |skip unstable tests }" "{ skip_unstable |false |skip unstable tests }"
"{ test_bigdata |false |run BigData tests (>=2Gb) }"
"{ h help |false |print help info }"; "{ h help |false |print help info }";
cv::CommandLineParser parser(argc, argv, command_line_keys); cv::CommandLineParser parser(argc, argv, command_line_keys);
@ -730,6 +732,7 @@ void parseCustomOptions(int argc, char **argv)
testThreads = parser.get<int>("test_threads"); testThreads = parser.get<int>("test_threads");
skipUnstableTests = parser.get<bool>("skip_unstable"); skipUnstableTests = parser.get<bool>("skip_unstable");
runBigDataTests = parser.get<bool>("test_bigdata");
} }

View File

@ -297,12 +297,6 @@ CV_IMPL CvCapture * cvCreateFileCaptureWithPreference (const char * filename, in
// bail out to let the user know that it is not available // bail out to let the user know that it is not available
if (apiPreference) break; if (apiPreference) break;
#ifdef HAVE_FFMPEG
case CAP_FFMPEG:
TRY_OPEN(result, cvCreateFileCapture_FFMPEG_proxy (filename))
if (apiPreference) break;
#endif
#if defined HAVE_LIBV4L || defined HAVE_CAMV4L || defined HAVE_CAMV4L2 || defined HAVE_VIDEOIO #if defined HAVE_LIBV4L || defined HAVE_CAMV4L || defined HAVE_CAMV4L2 || defined HAVE_VIDEOIO
case CAP_V4L: case CAP_V4L:
TRY_OPEN(result, cvCreateCameraCapture_V4L(filename)) TRY_OPEN(result, cvCreateCameraCapture_V4L(filename))
@ -383,11 +377,6 @@ static CvVideoWriter* cvCreateVideoWriterWithPreference(const char* filename, in
default: default:
//exit if the specified API is unavaliable //exit if the specified API is unavaliable
if (apiPreference != CAP_ANY) break; if (apiPreference != CAP_ANY) break;
#ifdef HAVE_FFMPEG
case CAP_FFMPEG:
TRY_OPEN(result, cvCreateVideoWriter_FFMPEG_proxy (filename, fourcc, fps, frameSize, is_color))
if (apiPreference != CAP_ANY) break;
#endif
#ifdef HAVE_MSMF #ifdef HAVE_MSMF
case CAP_MSMF: case CAP_MSMF:
TRY_OPEN(result, cvCreateVideoWriter_MSMF(filename, fourcc, fps, frameSize, is_color)) TRY_OPEN(result, cvCreateVideoWriter_MSMF(filename, fourcc, fps, frameSize, is_color))
@ -530,6 +519,14 @@ static Ptr<IVideoCapture> IVideoCapture_create(const String& filename, int apiPr
{ {
bool useAny = (apiPreference == CAP_ANY); bool useAny = (apiPreference == CAP_ANY);
Ptr<IVideoCapture> capture; Ptr<IVideoCapture> capture;
#ifdef HAVE_FFMPEG
if (useAny || apiPreference == CAP_FFMPEG)
{
capture = cvCreateFileCapture_FFMPEG_proxy(filename);
if (capture && capture->isOpened())
return capture;
}
#endif
#ifdef HAVE_GSTREAMER #ifdef HAVE_GSTREAMER
if (useAny || apiPreference == CAP_GSTREAMER) if (useAny || apiPreference == CAP_GSTREAMER)
{ {
@ -576,6 +573,14 @@ static Ptr<IVideoCapture> IVideoCapture_create(const String& filename, int apiPr
static Ptr<IVideoWriter> IVideoWriter_create(const String& filename, int apiPreference, int _fourcc, double fps, Size frameSize, bool isColor) static Ptr<IVideoWriter> IVideoWriter_create(const String& filename, int apiPreference, int _fourcc, double fps, Size frameSize, bool isColor)
{ {
Ptr<IVideoWriter> iwriter; Ptr<IVideoWriter> iwriter;
#ifdef HAVE_FFMPEG
if (apiPreference == CAP_FFMPEG || apiPreference == CAP_ANY)
{
iwriter = cvCreateVideoWriter_FFMPEG_proxy(filename, _fourcc, fps, frameSize, isColor);
if (!iwriter.empty())
return iwriter;
}
#endif
#ifdef HAVE_MFX #ifdef HAVE_MFX
if (apiPreference == CAP_INTEL_MFX || apiPreference == CAP_ANY) if (apiPreference == CAP_INTEL_MFX || apiPreference == CAP_ANY)
{ {

View File

@ -196,11 +196,11 @@ private:
}; };
class CvCapture_FFMPEG_proxy CV_FINAL : class CvCapture_FFMPEG_proxy CV_FINAL : public cv::IVideoCapture
public CvCapture
{ {
public: public:
CvCapture_FFMPEG_proxy() { ffmpegCapture = 0; } CvCapture_FFMPEG_proxy() { ffmpegCapture = 0; }
CvCapture_FFMPEG_proxy(const cv::String& filename) { ffmpegCapture = 0; open(filename); }
virtual ~CvCapture_FFMPEG_proxy() { close(); } virtual ~CvCapture_FFMPEG_proxy() { close(); }
virtual double getProperty(int propId) const CV_OVERRIDE virtual double getProperty(int propId) const CV_OVERRIDE
@ -215,26 +215,25 @@ public:
{ {
return ffmpegCapture ? icvGrabFrame_FFMPEG_p(ffmpegCapture)!=0 : false; return ffmpegCapture ? icvGrabFrame_FFMPEG_p(ffmpegCapture)!=0 : false;
} }
virtual IplImage* retrieveFrame(int) CV_OVERRIDE virtual bool retrieveFrame(int, cv::OutputArray frame) CV_OVERRIDE
{ {
unsigned char* data = 0; unsigned char* data = 0;
int step=0, width=0, height=0, cn=0; int step=0, width=0, height=0, cn=0;
if (!ffmpegCapture || if (!ffmpegCapture ||
!icvRetrieveFrame_FFMPEG_p(ffmpegCapture, &data, &step, &width, &height, &cn)) !icvRetrieveFrame_FFMPEG_p(ffmpegCapture, &data, &step, &width, &height, &cn))
return 0; return false;
cvInitImageHeader(&frame, cvSize(width, height), 8, cn); cv::Mat(height, width, CV_MAKETYPE(CV_8U, cn), data, step).copyTo(frame);
cvSetData(&frame, data, step); return true;
return &frame;
} }
virtual bool open( const char* filename ) virtual bool open( const cv::String& filename )
{ {
icvInitFFMPEG::Init(); icvInitFFMPEG::Init();
close(); close();
if( !icvCreateFileCapture_FFMPEG_p ) if( !icvCreateFileCapture_FFMPEG_p )
return false; return false;
ffmpegCapture = icvCreateFileCapture_FFMPEG_p( filename ); ffmpegCapture = icvCreateFileCapture_FFMPEG_p( filename.c_str() );
return ffmpegCapture != 0; return ffmpegCapture != 0;
} }
virtual void close() virtual void close()
@ -245,44 +244,45 @@ public:
ffmpegCapture = 0; ffmpegCapture = 0;
} }
virtual bool isOpened() const CV_OVERRIDE { return ffmpegCapture != 0; }
virtual int getCaptureDomain() CV_OVERRIDE { return CV_CAP_FFMPEG; }
protected: protected:
void* ffmpegCapture; void* ffmpegCapture;
IplImage frame;
}; };
CvCapture* cvCreateFileCapture_FFMPEG_proxy(const char * filename) cv::Ptr<cv::IVideoCapture> cv::cvCreateFileCapture_FFMPEG_proxy(const cv::String& filename)
{ {
CvCapture_FFMPEG_proxy* result = new CvCapture_FFMPEG_proxy; cv::Ptr<CvCapture_FFMPEG_proxy> capture = cv::makePtr<CvCapture_FFMPEG_proxy>(filename);
if( result->open( filename )) if (capture && capture->isOpened())
return result; return capture;
delete result; return cv::Ptr<cv::IVideoCapture>();
return 0;
} }
class CvVideoWriter_FFMPEG_proxy CV_FINAL : class CvVideoWriter_FFMPEG_proxy CV_FINAL :
public CvVideoWriter public cv::IVideoWriter
{ {
public: public:
CvVideoWriter_FFMPEG_proxy() { ffmpegWriter = 0; } CvVideoWriter_FFMPEG_proxy() { ffmpegWriter = 0; }
CvVideoWriter_FFMPEG_proxy(const cv::String& filename, int fourcc, double fps, cv::Size frameSize, bool isColor) { ffmpegWriter = 0; open(filename, fourcc, fps, frameSize, isColor); }
virtual ~CvVideoWriter_FFMPEG_proxy() { close(); } virtual ~CvVideoWriter_FFMPEG_proxy() { close(); }
virtual bool writeFrame( const IplImage* image ) CV_OVERRIDE virtual void write(cv::InputArray image ) CV_OVERRIDE
{ {
if(!ffmpegWriter) if(!ffmpegWriter)
return false; return;
CV_Assert(image->depth == 8); CV_Assert(image.depth() == CV_8U);
return icvWriteFrame_FFMPEG_p(ffmpegWriter, (const uchar*)image->imageData, icvWriteFrame_FFMPEG_p(ffmpegWriter, (const uchar*)image.getMat().ptr(), (int)image.step(), image.cols(), image.rows(), image.channels(), 0);
image->widthStep, image->width, image->height, image->nChannels, image->origin) !=0;
} }
virtual bool open( const char* filename, int fourcc, double fps, CvSize frameSize, bool isColor ) virtual bool open( const cv::String& filename, int fourcc, double fps, cv::Size frameSize, bool isColor )
{ {
icvInitFFMPEG::Init(); icvInitFFMPEG::Init();
close(); close();
if( !icvCreateVideoWriter_FFMPEG_p ) if( !icvCreateVideoWriter_FFMPEG_p )
return false; return false;
ffmpegWriter = icvCreateVideoWriter_FFMPEG_p( filename, fourcc, fps, frameSize.width, frameSize.height, isColor ); ffmpegWriter = icvCreateVideoWriter_FFMPEG_p( filename.c_str(), fourcc, fps, frameSize.width, frameSize.height, isColor );
return ffmpegWriter != 0; return ffmpegWriter != 0;
} }
@ -294,18 +294,20 @@ public:
ffmpegWriter = 0; ffmpegWriter = 0;
} }
virtual double getProperty(int) const CV_OVERRIDE { return 0; }
virtual bool setProperty(int, double) CV_OVERRIDE { return false; }
virtual bool isOpened() const CV_OVERRIDE { return ffmpegWriter != 0; }
protected: protected:
void* ffmpegWriter; void* ffmpegWriter;
}; };
CvVideoWriter* cvCreateVideoWriter_FFMPEG_proxy( const char* filename, int fourcc, cv::Ptr<cv::IVideoWriter> cv::cvCreateVideoWriter_FFMPEG_proxy(const cv::String& filename, int fourcc,
double fps, CvSize frameSize, int isColor ) double fps, cv::Size frameSize, int isColor)
{ {
CvVideoWriter_FFMPEG_proxy* result = new CvVideoWriter_FFMPEG_proxy; cv::Ptr<CvVideoWriter_FFMPEG_proxy> writer = cv::makePtr<CvVideoWriter_FFMPEG_proxy>(filename, fourcc, fps, frameSize, isColor != 0);
if (writer && writer->isOpened())
if( result->open( filename, fourcc, fps, frameSize, isColor != 0 )) return writer;
return result; return cv::Ptr<cv::IVideoWriter>();
delete result;
return 0;
} }

Some files were not shown because too many files have changed in this diff Show More