Merge remote-tracking branch 'upstream/3.4' into merge-3.4
3rdparty/ippicv/ippicv.cmake
@@ -2,37 +2,37 @@ function(download_ippicv root_var)
   set(${root_var} "" PARENT_SCOPE)

   # Commit SHA in the opencv_3rdparty repo
-  set(IPPICV_COMMIT "dfe3162c237af211e98b8960018b564bc209261d")
+  set(IPPICV_COMMIT "bdb7bb85f34a8cb0d35e40a81f58da431aa1557a")
   # Define actual ICV versions
   if(APPLE)
     set(OPENCV_ICV_PLATFORM "macosx")
     set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_mac")
     if(X86_64)
-      set(OPENCV_ICV_NAME "ippicv_2017u3_mac_intel64_general_20170822.tgz")
-      set(OPENCV_ICV_HASH "c1ebb5dfa5b7f54b0c44e1917805a463")
+      set(OPENCV_ICV_NAME "ippicv_2017u3_mac_intel64_general_20180518.tgz")
+      set(OPENCV_ICV_HASH "3ae52b9be0fe73dd45bc5e9429cd3732")
     else()
-      set(OPENCV_ICV_NAME "ippicv_2017u3_mac_ia32_general_20170822.tgz")
-      set(OPENCV_ICV_HASH "49b05a669042753ae75895a445ebd612")
+      set(OPENCV_ICV_NAME "ippicv_2017u3_mac_ia32_general_20180518.tgz")
+      set(OPENCV_ICV_HASH "698660b975b62bee3ef6c5af51e97544")
     endif()
   elseif((UNIX AND NOT ANDROID) OR (UNIX AND ANDROID_ABI MATCHES "x86"))
     set(OPENCV_ICV_PLATFORM "linux")
     set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_lnx")
     if(X86_64)
-      set(OPENCV_ICV_NAME "ippicv_2017u3_lnx_intel64_general_20170822.tgz")
-      set(OPENCV_ICV_HASH "4e0352ce96473837b1d671ce87f17359")
+      set(OPENCV_ICV_NAME "ippicv_2017u3_lnx_intel64_general_20180518.tgz")
+      set(OPENCV_ICV_HASH "b7cc351267db2d34b9efa1cd22ff0572")
     else()
-      set(OPENCV_ICV_NAME "ippicv_2017u3_lnx_ia32_general_20170822.tgz")
-      set(OPENCV_ICV_HASH "dcdb0ba4b123f240596db1840cd59a76")
+      set(OPENCV_ICV_NAME "ippicv_2017u3_lnx_ia32_general_20180518.tgz")
+      set(OPENCV_ICV_HASH "ea72de74dae3c604eb6348395366e78e")
     endif()
   elseif(WIN32 AND NOT ARM)
     set(OPENCV_ICV_PLATFORM "windows")
     set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_win")
     if(X86_64)
-      set(OPENCV_ICV_NAME "ippicv_2017u3_win_intel64_general_20170822.zip")
-      set(OPENCV_ICV_HASH "0421e642bc7ad741a2236d3ec4190bdd")
+      set(OPENCV_ICV_NAME "ippicv_2017u3_win_intel64_general_20180518.zip")
+      set(OPENCV_ICV_HASH "915ff92958089ede8ea532d3c4fe7187")
     else()
-      set(OPENCV_ICV_NAME "ippicv_2017u3_win_ia32_general_20170822.zip")
-      set(OPENCV_ICV_HASH "8a7680ae352c192de2e2e34936164bd0")
+      set(OPENCV_ICV_NAME "ippicv_2017u3_win_ia32_general_20180518.zip")
+      set(OPENCV_ICV_HASH "928168c2d99ab284047dfcfb7a821d91")
     endif()
   else()
     return()
3rdparty/libtiff/CMakeLists.txt
@@ -417,7 +417,7 @@ set(lib_srcs
     tif_write.c
     tif_zip.c
     tif_stream.cxx
-    snprintf.c
+    snprintf.c
     t4.h
     tif_dir.h
     tif_fax3.h
@@ -32,11 +32,11 @@ Unspecified error: Can't create layer "layer_name" of type "MyType" in function
 To import the model correctly you have to derive a class from cv::dnn::Layer with
 the following methods:
 
-@snippet dnn/custom_layers.cpp A custom layer interface
+@snippet dnn/custom_layers.hpp A custom layer interface
 
 And register it before the import:
 
-@snippet dnn/custom_layers.cpp Register a custom layer
+@snippet dnn/custom_layers.hpp Register a custom layer
 
 @note `MyType` is a type of unimplemented layer from the thrown exception.
 
@@ -44,27 +44,27 @@ Let's see what all the methods do:
 
 - Constructor
 
-  @snippet dnn/custom_layers.cpp MyLayer::MyLayer
+  @snippet dnn/custom_layers.hpp MyLayer::MyLayer
 
   Retrieves hyper-parameters from cv::dnn::LayerParams. If your layer has trainable
   weights they will be already stored in the Layer's member cv::dnn::Layer::blobs.
 
- - A static method `create`
 
-  @snippet dnn/custom_layers.cpp MyLayer::create
+  @snippet dnn/custom_layers.hpp MyLayer::create
 
   This method should create an instance of you layer and return cv::Ptr with it.
 
- - Output blobs' shape computation
 
-  @snippet dnn/custom_layers.cpp MyLayer::getMemoryShapes
+  @snippet dnn/custom_layers.hpp MyLayer::getMemoryShapes
 
   Returns layer's output shapes depends on input shapes. You may request an extra
   memory using `internals`.
 
- - Run a layer
 
-  @snippet dnn/custom_layers.cpp MyLayer::forward
+  @snippet dnn/custom_layers.hpp MyLayer::forward
 
   Implement a layer's logic here. Compute outputs for given inputs.
 
@@ -74,7 +74,7 @@ the second invocation of `forward` will has the same data at `outputs` and `inte
 
 - Optional `finalize` method
 
-  @snippet dnn/custom_layers.cpp MyLayer::finalize
+  @snippet dnn/custom_layers.hpp MyLayer::finalize
 
   The chain of methods are the following: OpenCV deep learning engine calls `create`
   method once then it calls `getMemoryShapes` for an every created layer then you
@@ -108,11 +108,11 @@ layer {
 
 This way our implementation can look like:
 
-@snippet dnn/custom_layers.cpp InterpLayer
+@snippet dnn/custom_layers.hpp InterpLayer
 
 Next we need to register a new layer type and try to import the model.
 
-@snippet dnn/custom_layers.cpp Register InterpLayer
+@snippet dnn/custom_layers.hpp Register InterpLayer
 
 ## Example: custom layer from TensorFlow
 This is an example of how to import a network with [tf.image.resize_bilinear](https://www.tensorflow.org/versions/master/api_docs/python/tf/image/resize_bilinear)
@@ -185,11 +185,11 @@ Custom layers import from TensorFlow is designed to put all layer's `attr` into
 cv::dnn::LayerParams but input `Const` blobs into cv::dnn::Layer::blobs.
 In our case resize's output shape will be stored in layer's `blobs[0]`.
 
-@snippet dnn/custom_layers.cpp ResizeBilinearLayer
+@snippet dnn/custom_layers.hpp ResizeBilinearLayer
 
 Next we register a layer and try to import the model.
 
-@snippet dnn/custom_layers.cpp Register ResizeBilinearLayer
+@snippet dnn/custom_layers.hpp Register ResizeBilinearLayer
 
 ## Define a custom layer in Python
 The following example shows how to customize OpenCV's layers in Python.
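The methods the tutorial lists map one-to-one onto the cv::dnn::Layer interface. For quick reference, a minimal self-contained C++ sketch of such a layer — an illustration only, not the tutorial's custom_layers file; it assumes the OpenCV 3.4 Layer API and uses an identity layer for brevity:

#include <opencv2/dnn.hpp>
#include <opencv2/dnn/layer.details.hpp>  // CV_DNN_REGISTER_LAYER_CLASS

// Illustrative identity layer implementing the interface described above.
class MyLayer : public cv::dnn::Layer
{
public:
    MyLayer(const cv::dnn::LayerParams& params) : Layer(params)
    {
        // Hyper-parameters arrive in `params`; trainable weights, if any,
        // are already stored in this->blobs by the importer.
    }

    static cv::Ptr<cv::dnn::Layer> create(cv::dnn::LayerParams& params)
    {
        return cv::Ptr<cv::dnn::Layer>(new MyLayer(params));
    }

    // Output shapes as a function of input shapes (MatShape = std::vector<int>).
    virtual bool getMemoryShapes(const std::vector<cv::dnn::MatShape>& inputs,
                                 const int /*requiredOutputs*/,
                                 std::vector<cv::dnn::MatShape>& outputs,
                                 std::vector<cv::dnn::MatShape>& /*internals*/) const
    {
        outputs.assign(1, inputs[0]);  // identity: one output, same shape as input
        return false;
    }

    // The layer's actual computation.
    virtual void forward(std::vector<cv::Mat*>& inputs, std::vector<cv::Mat>& outputs,
                         std::vector<cv::Mat>& /*internals*/)
    {
        inputs[0]->copyTo(outputs[0]);
    }
};

// Register it before the import so "MyType" resolves to this class:
// CV_DNN_REGISTER_LAYER_CLASS(MyType, MyLayer);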
@@ -5,6 +5,8 @@ This section contains tutorials about how to use the built-in graphical user int
 
 -   @subpage tutorial_trackbar
 
+    *Languages:* C++, Java, Python
+
     *Compatibility:* \> OpenCV 2.0
 
     *Author:* Ana Huamán
@@ -1,11 +1,11 @@
 Adding a Trackbar to our applications! {#tutorial_trackbar}
 ======================================
 
--   In the previous tutorials (about *linear blending* and the *brightness and contrast
-    adjustments*) you might have noted that we needed to give some **input** to our programs, such
-    as \f$\alpha\f$ and \f$beta\f$. We accomplished that by entering this data using the Terminal
--   Well, it is time to use some fancy GUI tools. OpenCV provides some GUI utilities (*highgui.hpp*)
-    for you. An example of this is a **Trackbar**
+-   In the previous tutorials (about @ref tutorial_adding_images and the @ref tutorial_basic_linear_transform)
+    you might have noted that we needed to give some **input** to our programs, such
+    as \f$\alpha\f$ and \f$beta\f$. We accomplished that by entering this data using the Terminal.
+-   Well, it is time to use some fancy GUI tools. OpenCV provides some GUI utilities (**highgui** module)
+    for you. An example of this is a **Trackbar**.
 
 ![](images/Adding_Trackbars_Tutorial_Trackbar.png)
 
@@ -24,26 +24,73 @@ Code
 
 Let's modify the program made in the tutorial @ref tutorial_adding_images. We will let the user enter the
 \f$\alpha\f$ value by using the Trackbar.
 
+@add_toggle_cpp
 This tutorial code's is shown lines below. You can also download it from
 [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp)
 @include cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp
+@end_toggle
+
+@add_toggle_java
+This tutorial code's is shown lines below. You can also download it from
+[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java)
+@include java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java
+@end_toggle
+
+@add_toggle_python
+This tutorial code's is shown lines below. You can also download it from
+[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py)
+@include python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py
+@end_toggle
 
 Explanation
 -----------
 
 We only analyze the code that is related to Trackbar:
 
--#  First, we load two images, which are going to be blended.
-    @snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp load
+-   First, we load two images, which are going to be blended.
 
--#  To create a trackbar, first we have to create the window in which it is going to be located. So:
-    @snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp window
+@add_toggle_cpp
+@snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp load
+@end_toggle
 
--#  Now we can create the Trackbar:
-    @snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp create_trackbar
+@add_toggle_java
+@snippet java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java load
+@end_toggle
 
-Note the following:
+@add_toggle_python
+@snippet python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py load
+@end_toggle
+
+-   To create a trackbar, first we have to create the window in which it is going to be located. So:
+
+@add_toggle_cpp
+@snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp window
+@end_toggle
+
+@add_toggle_java
+@snippet java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java window
+@end_toggle
+
+@add_toggle_python
+@snippet python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py window
+@end_toggle
+
+-   Now we can create the Trackbar:
+
+@add_toggle_cpp
+@snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp create_trackbar
+@end_toggle
+
+@add_toggle_java
+@snippet java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java create_trackbar
+@end_toggle
+
+@add_toggle_python
+@snippet python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py create_trackbar
+@end_toggle
+
+Note the following (C++ code):
 -   Our Trackbar has a label **TrackbarName**
 -   The Trackbar is located in the window named **Linear Blend**
 -   The Trackbar values will be in the range from \f$0\f$ to **alpha_slider_max** (the minimum
@@ -51,10 +98,21 @@ We only analyze the code that is related to Trackbar:
 -   The numerical value of Trackbar is stored in **alpha_slider**
 -   Whenever the user moves the Trackbar, the callback function **on_trackbar** is called
 
--#  Finally, we have to define the callback function **on_trackbar**
-    @snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp on_trackbar
+-   Finally, we have to define the callback function **on_trackbar** for C++ and Python code, using an anonymous inner class listener in Java
 
-Note that:
+@add_toggle_cpp
+@snippet cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp on_trackbar
+@end_toggle
+
+@add_toggle_java
+@snippet java/tutorial_code/highgui/trackbar/AddingImagesTrackbar.java on_trackbar
+@end_toggle
+
+@add_toggle_python
+@snippet python/tutorial_code/highgui/trackbar/AddingImagesTrackbar.py on_trackbar
+@end_toggle
+
+Note that (C++ code):
 -   We use the value of **alpha_slider** (integer) to get a double value for **alpha**.
 -   **alpha_slider** is updated each time the trackbar is displaced by the user.
 -   We define *src1*, *src2*, *dist*, *alpha*, *alpha_slider* and *beta* as global variables,
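For readers skimming the diff without the sample files, a compact C++ sketch of the pattern the snippets above implement — a hedged illustration using the standard OpenCV 3.4 highgui API; the image paths are placeholders:

#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui.hpp>

const int alpha_slider_max = 100;
int alpha_slider = 0;
cv::Mat src1, src2, dst;

static void on_trackbar(int, void*)
{
    double alpha = (double)alpha_slider / alpha_slider_max;  // integer slider -> [0, 1]
    cv::addWeighted(src1, alpha, src2, 1.0 - alpha, 0.0, dst);
    cv::imshow("Linear Blend", dst);
}

int main()
{
    src1 = cv::imread("LinuxLogo.jpg");    // placeholder input images
    src2 = cv::imread("WindowsLogo.jpg");
    if (src1.empty() || src2.empty())
        return 1;

    cv::namedWindow("Linear Blend", cv::WINDOW_AUTOSIZE);  // window must exist first
    cv::createTrackbar("Alpha x 100", "Linear Blend", &alpha_slider,
                       alpha_slider_max, on_trackbar);
    on_trackbar(alpha_slider, 0);  // render the initial blend once
    cv::waitKey(0);
    return 0;
}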
@@ -11,9 +11,6 @@ In this tutorial you will learn how to:
 -   @ref cv::erode
 -   @ref cv::dilate
 
-Interesting fact
------------
-
 @note The explanation below belongs to the book **Learning OpenCV** by Bradski and Kaehler.
 
 Morphological Operations
@@ -38,19 +35,14 @@ Morphological Operations
 -   As the kernel \f$B\f$ is scanned over the image, we compute the maximal pixel value overlapped by
     \f$B\f$ and replace the image pixel in the anchor point position with that maximal value. As you can
     deduce, this maximizing operation causes bright regions within an image to "grow" (therefore the
-    name *dilation*). Take the above image as an example. Applying dilation we can get:
+    name *dilation*).
+-   The dilatation operation is: \f$\texttt{dst} (x,y) = \max _{(x',y'): \, \texttt{element} (x',y') \ne0 } \texttt{src} (x+x',y+y')\f$
+
+-   Take the above image as an example. Applying dilation we can get:
 
     ![](images/Morphology_1_Tutorial_Theory_Dilation.png)
 
-The background (bright) dilates around the black regions of the letter.
-
-To better grasp the idea and avoid possible confusion, in this other example we have inverted the original
-image such as the object in white is now the letter. We have performed two dilatations with a rectangular
-structuring element of size `3x3`.
-
-![](images/Morphology_1_Tutorial_Theory_Dilation.png)
-
-The dilatation makes the object in white bigger.
+-   The bright area of the letter dilates around the black regions of the background.
 
 ### Erosion
 
@@ -58,31 +50,39 @@ The dilatation makes the object in white bigger.
     area of given kernel.
 -   As the kernel \f$B\f$ is scanned over the image, we compute the minimal pixel value overlapped by
     \f$B\f$ and replace the image pixel under the anchor point with that minimal value.
+-   The erosion operation is: \f$\texttt{dst} (x,y) = \min _{(x',y'): \, \texttt{element} (x',y') \ne0 } \texttt{src} (x+x',y+y')\f$
 -   Analagously to the example for dilation, we can apply the erosion operator to the original image
-    (shown above). You can see in the result below that the bright areas of the image (the
-    background, apparently), get thinner, whereas the dark zones (the "writing") gets bigger.
+    (shown above). You can see in the result below that the bright areas of the image get thinner,
+    whereas the dark zones gets bigger.
 
     ![](images/Morphology_1_Tutorial_Theory_Erosion.png)
 
+In similar manner, the corresponding image results by applying erosion operation on the inverted original image (two erosions
+with a rectangular structuring element of size `3x3`):
+
+![](images/Morphology_1_Tutorial_Theory_Erosion.png)
+
+The erosion makes the object in white smaller.
+
 Code
 ----
 
+@add_toggle_cpp
 This tutorial's code is shown below. You can also download it
 [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ImgProc/Morphology_1.cpp)
 @include samples/cpp/tutorial_code/ImgProc/Morphology_1.cpp
+@end_toggle
+
+@add_toggle_java
+This tutorial's code is shown below. You can also download it
+[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ImgProc/erosion_dilatation/MorphologyDemo1.java)
+@include samples/java/tutorial_code/ImgProc/erosion_dilatation/MorphologyDemo1.java
+@end_toggle
+
+@add_toggle_python
+This tutorial's code is shown below. You can also download it
+[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/imgProc/erosion_dilatation/morphology_1.py)
+@include samples/python/tutorial_code/imgProc/erosion_dilatation/morphology_1.py
+@end_toggle
 
 Explanation
 -----------
 
 -#  Most of the material shown here is trivial (if you have any doubt, please refer to the tutorials in
-    previous sections). Let's check the general structure of the program:
+    previous sections). Let's check the general structure of the C++ program:
 
     -   Load an image (can be BGR or grayscale)
     -   Create two windows (one for dilation output, the other for erosion)
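A short C++ sketch of the two operations defined by the min/max formulas above — a hedged illustration using the standard imgproc API; the input file name is a placeholder:

#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

int main()
{
    cv::Mat src = cv::imread("letter.png");  // placeholder input
    if (src.empty()) return 1;

    // 3x3 rectangular structuring element, as in the tutorial's examples
    cv::Mat element = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));

    cv::Mat dilated, eroded;
    cv::dilate(src, dilated, element);  // max over the neighborhood: bright regions grow
    cv::erode(src, eroded, element);    // min over the neighborhood: bright regions shrink

    cv::imshow("Dilation", dilated);
    cv::imshow("Erosion", eroded);
    cv::waitKey(0);
    return 0;
}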
@@ -36,15 +36,10 @@ discuss briefly 5 operations offered by OpenCV:
     foreground)
 -   For instance, check out the example below. The image at the left is the original and the image
     at the right is the result after applying the opening transformation. We can observe that the
-    small spaces in the corners of the letter tend to disappear.
+    small dots have disappeared.
 
     ![](images/Morphology_2_Tutorial_Theory_Opening.png)
 
-For the sake of clarity, we have performed the opening operation (`7x7` rectangular structuring element)
-on the same original image but inverted such as the object in white is now the letter.
-
-![](images/Morphology_2_Tutorial_Theory_Opening.png)
-
 ### Closing
 
 -   It is obtained by the dilation of an image followed by an erosion.
@@ -55,10 +50,6 @@ on the same original image but inverted such as the object in white is now the l
 
     ![](images/Morphology_2_Tutorial_Theory_Closing.png)
 
-On the inverted image, we have performed the closing operation (`7x7` rectangular structuring element):
-
-![](images/Morphology_2_Tutorial_Theory_Closing.png)
-
 ### Morphological Gradient
 
 -   It is the difference between the dilation and the erosion of an image.
@@ -88,14 +79,28 @@ On the inverted image, we have performed the closing operation (`7x7` rectangula
 Code
 ----
 
-This tutorial code's is shown lines below. You can also download it from
+@add_toggle_cpp
+This tutorial's code is shown below. You can also download it
 [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/ImgProc/Morphology_2.cpp)
 @include cpp/tutorial_code/ImgProc/Morphology_2.cpp
+@end_toggle
+
+@add_toggle_java
+This tutorial's code is shown below. You can also download it
+[here](https://github.com/opencv/opencv/tree/master/samples/java/tutorial_code/ImgProc/opening_closing_hats/MorphologyDemo2.java)
+@include java/tutorial_code/ImgProc/opening_closing_hats/MorphologyDemo2.java
+@end_toggle
+
+@add_toggle_python
+This tutorial's code is shown below. You can also download it
+[here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/imgProc/opening_closing_hats/morphology_2.py)
+@include python/tutorial_code/imgProc/opening_closing_hats/morphology_2.py
+@end_toggle
 
 Explanation
 -----------
 
--#  Let's check the general structure of the program:
+-#  Let's check the general structure of the C++ program:
     -   Load an image
     -   Create a window to display results of the Morphological operations
     -   Create three Trackbars for the user to enter parameters:
@@ -139,8 +144,8 @@ Explanation
 Results
 -------
 
--   After compiling the code above we can execute it giving an image path as an argument. For this
-    tutorial we use as input the image: **baboon.png**:
+-   After compiling the code above we can execute it giving an image path as an argument. Results using
+    the image: **baboon.png**:
 
     ![](images/Morphology_2_Tutorial_Cover.jpg)
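The five operations discussed in this tutorial are all reachable through a single imgproc entry point. A hedged C++ sketch (placeholder input file, standard OpenCV 3.4 API):

#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>

int main()
{
    cv::Mat src = cv::imread("baboon.png");  // placeholder input
    if (src.empty()) return 1;

    cv::Mat element = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(7, 7));
    cv::Mat opened, closed, gradient, tophat, blackhat;

    cv::morphologyEx(src, opened,   cv::MORPH_OPEN,     element);  // erode then dilate
    cv::morphologyEx(src, closed,   cv::MORPH_CLOSE,    element);  // dilate then erode
    cv::morphologyEx(src, gradient, cv::MORPH_GRADIENT, element);  // dilation - erosion
    cv::morphologyEx(src, tophat,   cv::MORPH_TOPHAT,   element);  // src - opening
    cv::morphologyEx(src, blackhat, cv::MORPH_BLACKHAT, element);  // closing - src
    return 0;
}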
@@ -305,6 +305,9 @@ public:
     //! returns true if GpuMat data is NULL
     bool empty() const;
 
+    //! internal use method: updates the continuity flag
+    void updateContinuityFlag();
+
     /*! includes several bit-fields:
          - the magic signature
          - continuity flag
@@ -2084,6 +2084,9 @@ public:
     static MatAllocator* getDefaultAllocator();
     static void setDefaultAllocator(MatAllocator* allocator);
 
+    //! internal use method: updates the continuity flag
+    void updateContinuityFlag();
+
     //! interaction with UMat
     UMatData* u;
 
@@ -2551,6 +2554,9 @@ public:
     //! and the standard allocator
     static MatAllocator* getStdAllocator();
 
+    //! internal use method: updates the continuity flag
+    void updateContinuityFlag();
+
     // black-box container of UMat data
     UMatData* u;
@@ -495,24 +495,20 @@ Mat::Mat(int _rows, int _cols, int _type, void* _data, size_t _step)
     if( _step == AUTO_STEP )
     {
         _step = minstep;
-        flags |= CONTINUOUS_FLAG;
     }
     else
     {
         CV_DbgAssert( _step >= minstep );
 
         if (_step % esz1 != 0)
         {
             CV_Error(Error::BadStep, "Step must be a multiple of esz1");
         }
-
-        if (_step == minstep || rows == 1)
-            flags |= CONTINUOUS_FLAG;
     }
     step[0] = _step;
     step[1] = esz;
     datalimit = datastart + _step * rows;
     dataend = datalimit - _step + minstep;
+    updateContinuityFlag();
 }
 
 inline
@@ -528,7 +524,6 @@ Mat::Mat(Size _sz, int _type, void* _data, size_t _step)
     if( _step == AUTO_STEP )
     {
         _step = minstep;
-        flags |= CONTINUOUS_FLAG;
     }
     else
     {
@@ -538,14 +533,12 @@ Mat::Mat(Size _sz, int _type, void* _data, size_t _step)
         {
             CV_Error(Error::BadStep, "Step must be a multiple of esz1");
         }
-
-        if (_step == minstep || rows == 1)
-            flags |= CONTINUOUS_FLAG;
     }
     step[0] = _step;
     step[1] = esz;
     datalimit = datastart + _step*rows;
     dataend = datalimit - _step + minstep;
+    updateContinuityFlag();
 }
 
 template<typename _Tp> inline
@@ -152,7 +152,7 @@ namespace cv { namespace cuda
 
     inline ~NppStreamHandler()
     {
-        nppSetStream(oldStream);
+        cudaStreamSynchronize(oldStream);
     }
 
 private:
@@ -489,7 +489,7 @@ public class MatTest extends OpenCVTestCase {
     public void testIsContinuous() {
         assertTrue(gray0.isContinuous());
 
-        Mat subMat = gray0.submat(0, 0, gray0.rows() / 2, gray0.cols() / 2);
+        Mat subMat = gray0.submat(0, gray0.rows() / 2, 0, gray0.cols() / 2);
         assertFalse(subMat.isContinuous());
     }
 
@@ -937,7 +937,7 @@ public class MatTest extends OpenCVTestCase {
     }
 
     public void testSubmatRect() {
-        Mat submat = gray255.submat(new Rect(5, gray255.rows() / 2, 5, gray255.cols() / 2));
+        Mat submat = gray255.submat(new Rect(5, 5, gray255.cols() / 2, gray255.rows() / 2));
        assertTrue(submat.isSubmatrix());
        assertFalse(submat.isContinuous());
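These test fixes matter because Java's submat(rowStart, rowEnd, colStart, colEnd) takes range pairs while Rect takes (x, y, width, height) — the old arguments built degenerate or swapped ROIs. The corrected tests now exercise exactly the continuity invariant this merge refactors; a C++ sketch of the same invariant, assuming only the standard cv::Mat API:

#include <opencv2/core.hpp>
#include <cassert>

int main()
{
    cv::Mat gray0 = cv::Mat::zeros(10, 10, CV_8UC1);
    assert(gray0.isContinuous());           // freshly allocated matrix: continuous

    // Range pairs are [start, end), mirroring Java's submat(rowStart, rowEnd, colStart, colEnd)
    cv::Mat sub = gray0(cv::Range(0, 5), cv::Range(0, 5));
    assert(!sub.isContinuous());            // narrower ROI: its rows are not adjacent in memory
    return 0;
}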
@@ -46,6 +46,13 @@
 using namespace cv;
 using namespace cv::cuda;
 
+void cv::cuda::GpuMat::updateContinuityFlag()
+{
+    int sz[] = { rows, cols };
+    size_t steps[] = { step, elemSize() };
+    flags = cv::updateContinuityFlag(flags, 2, sz, steps);
+}
+
 cv::cuda::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t step_) :
     flags(Mat::MAGIC_VAL + (type_ & Mat::TYPE_MASK)), rows(rows_), cols(cols_),
     step(step_), data((uchar*)data_), refcount(0),
@@ -57,7 +64,6 @@ cv::cuda::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t st
     if (step == Mat::AUTO_STEP)
     {
         step = minstep;
-        flags |= Mat::CONTINUOUS_FLAG;
     }
     else
     {
@@ -65,11 +71,10 @@ cv::cuda::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t st
             step = minstep;
 
         CV_DbgAssert( step >= minstep );
-
-        flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
     }
 
     dataend += step * (rows - 1) + minstep;
+    updateContinuityFlag();
 }
 
 cv::cuda::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) :
@@ -83,7 +88,6 @@ cv::cuda::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) :
     if (step == Mat::AUTO_STEP)
     {
         step = minstep;
-        flags |= Mat::CONTINUOUS_FLAG;
     }
     else
     {
@@ -91,11 +95,10 @@ cv::cuda::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) :
             step = minstep;
 
         CV_DbgAssert( step >= minstep );
-
-        flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
     }
 
     dataend += step * (rows - 1) + minstep;
+    updateContinuityFlag();
 }
 
 cv::cuda::GpuMat::GpuMat(const GpuMat& m, Range rowRange_, Range colRange_)
@@ -127,17 +130,15 @@ cv::cuda::GpuMat::GpuMat(const GpuMat& m, Range rowRange_, Range colRange_)
 
         cols = colRange_.size();
         data += colRange_.start*elemSize();
-        flags &= cols < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;
     }
 
-    if (rows == 1)
-        flags |= Mat::CONTINUOUS_FLAG;
-
     if (refcount)
         CV_XADD(refcount, 1);
 
     if (rows <= 0 || cols <= 0)
         rows = cols = 0;
+
+    updateContinuityFlag();
 }
 
 cv::cuda::GpuMat::GpuMat(const GpuMat& m, Rect roi) :
@@ -146,16 +147,19 @@ cv::cuda::GpuMat::GpuMat(const GpuMat& m, Rect roi) :
     datastart(m.datastart), dataend(m.dataend),
     allocator(m.allocator)
 {
-    flags &= roi.width < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;
     data += roi.x * elemSize();
 
-    CV_Assert( 0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= m.cols && 0 <= roi.y && 0 <= roi.height && roi.y + roi.height <= m.rows );
+    CV_Assert( 0 <= roi.x && 0 <= roi.width &&
+               roi.x + roi.width <= m.cols &&
+               0 <= roi.y && 0 <= roi.height &&
+               roi.y + roi.height <= m.rows );
 
     if (refcount)
         CV_XADD(refcount, 1);
 
     if (rows <= 0 || cols <= 0)
         rows = cols = 0;
+
+    updateContinuityFlag();
 }
 
 GpuMat cv::cuda::GpuMat::reshape(int new_cn, int new_rows) const
@@ -245,11 +249,7 @@ GpuMat& cv::cuda::GpuMat::adjustROI(int dtop, int dbottom, int dleft, int dright
     rows = row2 - row1;
     cols = col2 - col1;
 
-    if (esz * cols == step || rows == 1)
-        flags |= Mat::CONTINUOUS_FLAG;
-    else
-        flags &= ~Mat::CONTINUOUS_FLAG;
-
+    updateContinuityFlag();
     return *this;
 }
|
@ -201,10 +201,13 @@ void cv::cuda::HostMem::create(int rows_, int cols_, int type_)
|
||||
|
||||
if (rows_ > 0 && cols_ > 0)
|
||||
{
|
||||
flags = Mat::MAGIC_VAL + Mat::CONTINUOUS_FLAG + type_;
|
||||
flags = Mat::MAGIC_VAL + type_;
|
||||
rows = rows_;
|
||||
cols = cols_;
|
||||
step = elemSize() * cols;
|
||||
int sz[] = { rows, cols };
|
||||
size_t steps[] = { step, CV_ELEM_SIZE(type_) };
|
||||
flags = updateContinuityFlag(flags, 2, sz, steps);
|
||||
|
||||
if (alloc_type == SHARED)
|
||||
{
|
||||
|
@@ -594,10 +594,11 @@ namespace
 
     StackAllocator::~StackAllocator()
     {
+        cudaStreamSynchronize(stream_);
+
         if (memStack_ != 0)
         {
-            cudaStreamSynchronize(stream_);
             memStack_->pool->returnMemStack(memStack_);
         }
     }
 
     size_t alignUp(size_t what, size_t alignment)
@@ -262,31 +262,36 @@ void setSize( Mat& m, int _dims, const int* _sz, const size_t* _steps, bool auto
     }
 }
 
-static void updateContinuityFlag(Mat& m)
+int updateContinuityFlag(int flags, int dims, const int* size, const size_t* step)
 {
     int i, j;
-    for( i = 0; i < m.dims; i++ )
+    for( i = 0; i < dims; i++ )
     {
-        if( m.size[i] > 1 )
+        if( size[i] > 1 )
             break;
     }
 
-    for( j = m.dims-1; j > i; j-- )
+    uint64 t = (uint64)size[std::min(i, dims-1)]*CV_MAT_CN(flags);
+    for( j = dims-1; j > i; j-- )
     {
-        if( m.step[j]*m.size[j] < m.step[j-1] )
+        t *= size[j];
+        if( step[j]*size[j] < step[j-1] )
             break;
     }
 
-    uint64 t = (uint64)m.step[0]*m.size[0];
-    if( j <= i && t == (size_t)t )
-        m.flags |= Mat::CONTINUOUS_FLAG;
-    else
-        m.flags &= ~Mat::CONTINUOUS_FLAG;
+    if( j <= i && t == (uint64)(int)t )
+        return flags | Mat::CONTINUOUS_FLAG;
+    return flags & ~Mat::CONTINUOUS_FLAG;
+}
+
+void Mat::updateContinuityFlag()
+{
+    flags = cv::updateContinuityFlag(flags, dims, size.p, step.p);
 }
 
 void finalizeHdr(Mat& m)
 {
-    updateContinuityFlag(m);
+    m.updateContinuityFlag();
     int d = m.dims;
     if( d > 2 )
         m.rows = m.cols = -1;
@@ -427,7 +432,6 @@ Mat::Mat(const Mat& m, const Range& _rowRange, const Range& _colRange)
                        && _colRange.end <= m.cols );
             cols = _colRange.size();
             data += _colRange.start*elemSize();
-            flags &= cols < m.cols ? ~CONTINUOUS_FLAG : -1;
             flags |= SUBMATRIX_FLAG;
         }
     }
@@ -437,8 +441,7 @@ Mat::Mat(const Mat& m, const Range& _rowRange, const Range& _colRange)
         CV_RETHROW();
     }
 
-    if( rows == 1 )
-        flags |= CONTINUOUS_FLAG;
+    updateContinuityFlag();
 
     if( rows <= 0 || cols <= 0 )
     {
@@ -455,8 +458,6 @@ Mat::Mat(const Mat& m, const Rect& roi)
       allocator(m.allocator), u(m.u), size(&rows)
 {
     CV_Assert( m.dims <= 2 );
-    flags &= roi.width < m.cols ? ~CONTINUOUS_FLAG : -1;
-    flags |= roi.height == 1 ? CONTINUOUS_FLAG : 0;
 
     size_t esz = CV_ELEM_SIZE(flags);
     data += roi.x*esz;
@@ -468,6 +469,7 @@ Mat::Mat(const Mat& m, const Rect& roi)
         flags |= SUBMATRIX_FLAG;
 
     step[0] = m.step[0]; step[1] = esz;
+    updateContinuityFlag();
 
     if( rows <= 0 || cols <= 0 )
     {
@@ -522,7 +524,7 @@ Mat::Mat(const Mat& m, const Range* ranges)
             flags |= SUBMATRIX_FLAG;
         }
     }
-    updateContinuityFlag(*this);
+    updateContinuityFlag();
 }
 
 Mat::Mat(const Mat& m, const std::vector<Range>& ranges)
@@ -548,7 +550,7 @@ Mat::Mat(const Mat& m, const std::vector<Range>& ranges)
             flags |= SUBMATRIX_FLAG;
         }
     }
-    updateContinuityFlag(*this);
+    updateContinuityFlag();
 }
 
 
@@ -575,10 +577,7 @@ Mat Mat::diag(int d) const
     m.size[1] = m.cols = 1;
     m.step[0] += (len > 1 ? esz : 0);
 
-    if( m.rows > 1 )
-        m.flags &= ~CONTINUOUS_FLAG;
-    else
-        m.flags |= CONTINUOUS_FLAG;
+    m.updateContinuityFlag();
 
     if( size() != Size(1,1) )
         m.flags |= SUBMATRIX_FLAG;
@@ -597,13 +596,6 @@ void Mat::pop_back(size_t nelems)
     {
         size.p[0] -= (int)nelems;
         dataend -= nelems*step.p[0];
-        /*if( size.p[0] <= 1 )
-        {
-            if( dims <= 2 )
-                flags |= CONTINUOUS_FLAG;
-            else
-                updateContinuityFlag(*this);
-        }*/
     }
 }
 
@@ -618,7 +610,10 @@ void Mat::push_back_(const void* elem)
     memcpy(data + r*step.p[0], elem, esz);
     size.p[0] = r + 1;
     dataend += step.p[0];
-    if( esz < step.p[0] )
+    uint64 tsz = size.p[0];
+    for( int i = 1; i < dims; i++ )
+        tsz *= size.p[i];
+    if( esz < step.p[0] || tsz != (uint64)(int)tsz )
         flags &= ~CONTINUOUS_FLAG;
 }
 
@@ -792,10 +787,7 @@ Mat& Mat::adjustROI( int dtop, int dbottom, int dleft, int dright )
     data += (row1 - ofs.y)*step + (col1 - ofs.x)*esz;
     rows = row2 - row1; cols = col2 - col1;
     size.p[0] = rows; size.p[1] = cols;
-    if( esz*cols == step[0] || rows == 1 )
-        flags |= CONTINUOUS_FLAG;
-    else
-        flags &= ~CONTINUOUS_FLAG;
+    updateContinuityFlag();
     return *this;
 }
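The refactor above turns the continuity check into a pure function of (flags, dims, sizes, steps), shared by Mat, UMat, and GpuMat, and the overflow guard t == (uint64)(int)t is what the push_back regression test later in this diff exercises. A standalone sketch of that logic — illustrative only, with a stand-in flag constant and an explicit channels parameter instead of CV_MAT_CN, both assumptions:

#include <algorithm>
#include <cstdint>
#include <cstdio>

const int CONTINUOUS_FLAG = 1 << 14;  // assumption: stand-in for Mat::CONTINUOUS_FLAG

// A matrix is continuous iff trailing dimensions pack tightly in memory
// AND the total element count still fits the int-based bookkeeping.
int updateContinuityFlag(int flags, int channels, int dims, const int* size, const size_t* step)
{
    int i, j;
    for (i = 0; i < dims; i++)
        if (size[i] > 1)
            break;

    uint64_t t = (uint64_t)size[std::min(i, dims - 1)] * channels;
    for (j = dims - 1; j > i; j--)
    {
        t *= size[j];
        if (step[j] * size[j] < step[j - 1])  // gap between rows/planes: not continuous
            break;
    }

    if (j <= i && t == (uint64_t)(int)t)
        return flags | CONTINUOUS_FLAG;
    return flags & ~CONTINUOUS_FLAG;
}

int main()
{
    int sz[] = { 4, 4 };
    size_t tight[] = { 4, 1 }, padded[] = { 8, 1 };  // row/element strides in bytes (1-channel, 1-byte)
    std::printf("tight:  %d\n", updateContinuityFlag(0, 1, 2, sz, tight)  != 0);  // prints 1
    std::printf("padded: %d\n", updateContinuityFlag(0, 1, 2, sz, padded) != 0);  // prints 0
    return 0;
}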
@@ -120,8 +120,8 @@ static Mat iplImageToMat(const IplImage* img, bool copyData)
     }
     m.datalimit = m.datastart + m.step.p[0]*m.rows;
     m.dataend = m.datastart + m.step.p[0]*(m.rows-1) + esz*m.cols;
-    m.flags |= (m.cols*esz == m.step.p[0] || m.rows == 1 ? Mat::CONTINUOUS_FLAG : 0);
     m.step[1] = esz;
+    m.updateContinuityFlag();
 
     if( copyData )
     {
@@ -5681,8 +5681,6 @@ namespace cv {
 // three funcs below are implemented in umatrix.cpp
 void setSize( UMat& m, int _dims, const int* _sz, const size_t* _steps,
               bool autoSteps = false );
-
-void updateContinuityFlag(UMat& m);
 void finalizeHdr(UMat& m);
 
 } // namespace cv
@@ -193,6 +193,7 @@ inline Size getContinuousSize( const Mat& m1, const Mat& m2,
 
 void setSize( Mat& m, int _dims, const int* _sz, const size_t* _steps, bool autoSteps=false );
 void finalizeHdr(Mat& m);
+int updateContinuityFlag(int flags, int dims, const int* size, const size_t* step);
 
 struct NoVec
 {
@@ -318,32 +318,15 @@ void setSize( UMat& m, int _dims, const int* _sz,
 }
 
-void updateContinuityFlag(UMat& m)
+void UMat::updateContinuityFlag()
 {
-    int i, j;
-    for( i = 0; i < m.dims; i++ )
-    {
-        if( m.size[i] > 1 )
-            break;
-    }
-
-    for( j = m.dims-1; j > i; j-- )
-    {
-        if( m.step[j]*m.size[j] < m.step[j-1] )
-            break;
-    }
-
-    uint64 total = (uint64)m.step[0]*m.size[0];
-    if( j <= i && total == (size_t)total )
-        m.flags |= UMat::CONTINUOUS_FLAG;
-    else
-        m.flags &= ~UMat::CONTINUOUS_FLAG;
+    flags = cv::updateContinuityFlag(flags, dims, size.p, step.p);
 }
 
 
 void finalizeHdr(UMat& m)
 {
-    updateContinuityFlag(m);
+    m.updateContinuityFlag();
     int d = m.dims;
     if( d > 2 )
         m.rows = m.cols = -1;
@@ -537,12 +520,10 @@ UMat::UMat(const UMat& m, const Range& _rowRange, const Range& _colRange)
         CV_Assert( 0 <= _colRange.start && _colRange.start <= _colRange.end && _colRange.end <= m.cols );
         cols = _colRange.size();
         offset += _colRange.start*elemSize();
-        flags &= cols < m.cols ? ~CONTINUOUS_FLAG : -1;
         flags |= SUBMATRIX_FLAG;
     }
 
-    if( rows == 1 )
-        flags |= CONTINUOUS_FLAG;
+    updateContinuityFlag();
 
     if( rows <= 0 || cols <= 0 )
     {
@@ -557,8 +538,6 @@ UMat::UMat(const UMat& m, const Rect& roi)
     allocator(m.allocator), usageFlags(m.usageFlags), u(m.u), offset(m.offset + roi.y*m.step[0]), size(&rows)
 {
     CV_Assert( m.dims <= 2 );
-    flags &= roi.width < m.cols ? ~CONTINUOUS_FLAG : -1;
-    flags |= roi.height == 1 ? CONTINUOUS_FLAG : 0;
 
     size_t esz = CV_ELEM_SIZE(flags);
     offset += roi.x*esz;
@@ -570,6 +549,7 @@ UMat::UMat(const UMat& m, const Rect& roi)
         flags |= SUBMATRIX_FLAG;
 
     step[0] = m.step[0]; step[1] = esz;
+    updateContinuityFlag();
 
     if( rows <= 0 || cols <= 0 )
     {
@@ -601,7 +581,7 @@ UMat::UMat(const UMat& m, const Range* ranges)
             flags |= SUBMATRIX_FLAG;
         }
     }
-    updateContinuityFlag(*this);
+    updateContinuityFlag();
 }
 
 UMat::UMat(const UMat& m, const std::vector<Range>& ranges)
@@ -626,7 +606,7 @@ UMat::UMat(const UMat& m, const std::vector<Range>& ranges)
             flags |= SUBMATRIX_FLAG;
         }
     }
-    updateContinuityFlag(*this);
+    updateContinuityFlag();
 }
 
 UMat UMat::diag(int d) const
@@ -652,10 +632,7 @@ UMat UMat::diag(int d) const
     m.size[1] = m.cols = 1;
     m.step[0] += (len > 1 ? esz : 0);
 
-    if( m.rows > 1 )
-        m.flags &= ~CONTINUOUS_FLAG;
-    else
-        m.flags |= CONTINUOUS_FLAG;
+    m.updateContinuityFlag();
 
     if( size() != Size(1,1) )
         m.flags |= SUBMATRIX_FLAG;
@@ -701,10 +678,7 @@ UMat& UMat::adjustROI( int dtop, int dbottom, int dleft, int dright )
     offset += (row1 - ofs.y)*step + (col1 - ofs.x)*esz;
     rows = row2 - row1; cols = col2 - col1;
     size.p[0] = rows; size.p[1] = cols;
-    if( esz*cols == step[0] || rows == 1 )
-        flags |= CONTINUOUS_FLAG;
-    else
-        flags &= ~CONTINUOUS_FLAG;
+    updateContinuityFlag();
     return *this;
 }
@@ -522,33 +522,23 @@ protected:
 
 TEST(Core_InputOutput, misc) { CV_MiscIOTest test; test.safe_run(); }
 
-/*class CV_BigMatrixIOTest : public cvtest::BaseTest
-{
-public:
-    CV_BigMatrixIOTest() {}
-    ~CV_BigMatrixIOTest() {}
-protected:
-    void run(int)
-    {
-        try
-        {
-            RNG& rng = theRNG();
-            int N = 1000, M = 1200000;
-            Mat mat(M, N, CV_32F);
-            rng.fill(mat, RNG::UNIFORM, 0, 1);
-            FileStorage fs(cv::tempfile(".xml"), FileStorage::WRITE);
-            fs << "mat" << mat;
-            fs.release();
-        }
-        catch(...)
-        {
-            ts->set_failed_test_info(cvtest::TS::FAIL_MISMATCH);
-        }
-    }
-};
-
-TEST(Core_InputOutput, huge) { CV_BigMatrixIOTest test; test.safe_run(); }
-*/
+#if 0 // 4+ GB of data, 40+ GB of estimated result size, it is very slow
+BIGDATA_TEST(Core_InputOutput, huge)
+{
+    RNG& rng = theRNG();
+    int N = 1000, M = 1200000;
+
+    std::cout << "Allocating..." << std::endl;
+    Mat mat(M, N, CV_32F);
+
+    std::cout << "Initializing..." << std::endl;
+    rng.fill(mat, RNG::UNIFORM, 0, 1);
+
+    std::cout << "Writing..." << std::endl;
+    {
+        FileStorage fs(cv::tempfile(".xml"), FileStorage::WRITE);
+        fs << "mat" << mat;
+        fs.release();
+    }
+}
+#endif
 
 TEST(Core_globbing, accuracy)
 {
@@ -1766,4 +1766,26 @@ TEST(Mat_, template_based_ptr)
     ASSERT_FLOAT_EQ(66.0f, *(mat.ptr<float>(idx)));
 }
 
+
+BIGDATA_TEST(Mat, push_back_regression_4158)  // memory usage: ~10.6 Gb
+{
+    Mat result;
+
+    Mat tail(100, 500000, CV_32FC2, Scalar(1, 2));
+
+    tail.copyTo(result);
+    for (int i = 1; i < 15; i++)
+    {
+        result.push_back(tail);
+        std::cout << "i = " << i << "  result = " << result.size() << "  used = " << (uint64)result.total()*result.elemSize()*(1.0 / (1 << 20)) << " Mb"
+                  << "  allocated=" << (uint64)(result.datalimit - result.datastart)*(1.0 / (1 << 20)) << " Mb" << std::endl;
+    }
+    for (int i = 0; i < 15; i++)
+    {
+        Rect roi(0, tail.rows * i, tail.cols, tail.rows);
+        int nz = countNonZero(result(roi).reshape(1) == 2);
+        EXPECT_EQ(tail.total(), (size_t)nz) << "i=" << i;
+    }
+}
+
 }} // namespace
@@ -137,12 +137,11 @@ void cv::cuda::meanStdDev(InputArray _src, OutputArray _dst, Stream& stream)
     if (!deviceSupports(FEATURE_SET_COMPUTE_13))
         CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capebility");
 
-    GpuMat src = getInputMat(_src, stream);
+    const GpuMat src = getInputMat(_src, stream);
 
     CV_Assert( src.type() == CV_8UC1 );
 
-    _dst.create(1, 2, CV_64FC1);
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, 1, 2, CV_64FC1, stream);
 
     NppiSize sz;
     sz.width = src.cols;
|
||||
CV_OUT std::vector<int>& indices,
|
||||
const float eta = 1.f, const int top_k = 0);
|
||||
|
||||
CV_EXPORTS void NMSBoxes(const std::vector<RotatedRect>& bboxes, const std::vector<float>& scores,
|
||||
const float score_threshold, const float nms_threshold,
|
||||
CV_OUT std::vector<int>& indices,
|
||||
const float eta = 1.f, const int top_k = 0);
|
||||
|
||||
//! @}
|
||||
CV__DNN_EXPERIMENTAL_NS_END
|
||||
|
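A hedged usage sketch for the RotatedRect overload declared above (the boxes, scores, and thresholds are made-up values; the Rect overload a few lines earlier works the same way):

#include <opencv2/dnn.hpp>
#include <vector>

int main()
{
    std::vector<cv::RotatedRect> boxes;
    std::vector<float> scores;
    boxes.push_back(cv::RotatedRect(cv::Point2f(50, 50), cv::Size2f(40, 20), 30.f));
    boxes.push_back(cv::RotatedRect(cv::Point2f(52, 51), cv::Size2f(40, 20), 32.f));  // near-duplicate
    scores.push_back(0.9f);
    scores.push_back(0.6f);

    std::vector<int> keep;
    // score_threshold filters weak detections; nms_threshold is the overlap cutoff
    cv::dnn::NMSBoxes(boxes, scores, 0.5f, 0.4f, keep);
    // keep now holds the indices of the surviving boxes (here, just box 0)
    return 0;
}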
@@ -121,7 +121,9 @@ PERF_TEST_P_(DNNTestNetwork, Inception_5h)
 
 PERF_TEST_P_(DNNTestNetwork, ENet)
 {
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE) throw SkipTestException("");
+    if ((backend == DNN_BACKEND_INFERENCE_ENGINE) ||
+        (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16))
+        throw SkipTestException("");
     processNet("dnn/Enet-model-best.net", "", "enet.yml",
             Mat(cv::Size(512, 256), CV_32FC3));
 }
@@ -232,7 +234,8 @@ const tuple<DNNBackend, DNNTarget> testCases[] = {
     tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
 #endif
     tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_CPU),
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL)
+    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL),
+    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL_FP16)
 };
 
 INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, testing::ValuesIn(testCases));
|
||||
// this option is useful to run valgrind memory errors detection
|
||||
static bool DNN_DISABLE_MEMORY_OPTIMIZATIONS = utils::getConfigurationParameterBool("OPENCV_DNN_DISABLE_MEMORY_OPTIMIZATIONS", false);
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
static bool DNN_OPENCL_ALLOW_ALL_DEVICES = utils::getConfigurationParameterBool("OPENCV_DNN_OPENCL_ALLOW_ALL_DEVICES", false);
|
||||
#endif
|
||||
|
||||
using std::vector;
|
||||
using std::map;
|
||||
using std::make_pair;
|
||||
@ -497,7 +501,7 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool forceCreate)
|
||||
void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool forceCreate, bool use_half)
|
||||
{
|
||||
if (!DNN_DISABLE_MEMORY_OPTIMIZATIONS && !forceCreate)
|
||||
{
|
||||
@ -538,14 +542,14 @@ public:
|
||||
{
|
||||
// if dst already has been allocated with total(shape) elements,
|
||||
// it won't be recrreated and pointer of dst.data remains the same.
|
||||
dst.create(shape, CV_32F);
|
||||
dst.create(shape, use_half ? CV_16S : CV_32F);
|
||||
addHost(lp, dst);
|
||||
}
|
||||
}
|
||||
|
||||
void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes,
|
||||
std::vector<LayerPin>& pinsForInternalBlobs,
|
||||
bool forceCreate = false)
|
||||
bool forceCreate = false, bool use_half = false)
|
||||
{
|
||||
CV_TRACE_FUNCTION();
|
||||
|
||||
@ -616,7 +620,7 @@ public:
|
||||
reuse(ld.inputBlobsId[0], blobPin);
|
||||
}
|
||||
else
|
||||
reuseOrCreate(shapes[index], blobPin, *blobs[index], forceCreate);
|
||||
reuseOrCreate(shapes[index], blobPin, *blobs[index], forceCreate, use_half);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -654,7 +658,7 @@ static Ptr<BackendWrapper> wrapMat(int backendId, int targetId, cv::Mat& m)
|
||||
{
|
||||
if (targetId == DNN_TARGET_CPU)
|
||||
return Ptr<BackendWrapper>();
|
||||
else if (targetId == DNN_TARGET_OPENCL)
|
||||
else if (IS_DNN_OPENCL_TARGET(targetId))
|
||||
return OpenCLBackendWrapper::create(m);
|
||||
else
|
||||
CV_Error(Error::StsNotImplemented, "Unknown target identifier");
|
||||
@ -719,6 +723,7 @@ struct Net::Impl
|
||||
bool netWasAllocated;
|
||||
bool fusion;
|
||||
std::vector<int64> layersTimings;
|
||||
Mat output_blob;
|
||||
|
||||
Ptr<BackendWrapper> wrap(Mat& host)
|
||||
{
|
||||
@ -735,7 +740,7 @@ struct Net::Impl
|
||||
Ptr<BackendWrapper> baseBuffer = backendWrappers[data];
|
||||
if (preferableBackend == DNN_BACKEND_DEFAULT)
|
||||
{
|
||||
CV_Assert(preferableTarget == DNN_TARGET_OPENCL);
|
||||
CV_Assert(IS_DNN_OPENCL_TARGET(preferableTarget));
|
||||
return OpenCLBackendWrapper::create(baseBuffer, host);
|
||||
}
|
||||
else if (preferableBackend == DNN_BACKEND_HALIDE)
|
||||
@ -847,12 +852,22 @@ struct Net::Impl
|
||||
|
||||
if (!netWasAllocated || this->blobsToKeep != blobsToKeep_)
|
||||
{
|
||||
if (preferableBackend == DNN_BACKEND_DEFAULT && IS_DNN_OPENCL_TARGET(preferableTarget))
|
||||
#ifndef HAVE_OPENCL
|
||||
if (preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_OPENCL)
|
||||
{
|
||||
CV_LOG_WARNING(NULL, "DNN: OpenCL target is not available in this OpenCV build, switching to CPU.")
|
||||
CV_LOG_WARNING(NULL, "DNN: OpenCL target is not available in this OpenCV build, switching to CPU.");
|
||||
preferableTarget = DNN_TARGET_CPU;
|
||||
}
|
||||
#else
|
||||
{
|
||||
if (!DNN_OPENCL_ALLOW_ALL_DEVICES
|
||||
&& !(ocl::Device::getDefault().isIntel() && ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU) // Current implementation is only valid for Intel GPU (#11494)
|
||||
)
|
||||
{
|
||||
CV_LOG_WARNING(NULL, "DNN: OpenCL target is not supported with current OpenCL device (tested with Intel GPUs only), switching to CPU.");
|
||||
preferableTarget = DNN_TARGET_CPU;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
clear();
|
||||
|
||||
@ -1022,7 +1037,7 @@ struct Net::Impl
|
||||
{
|
||||
CV_TRACE_FUNCTION();
|
||||
if (preferableBackend == DNN_BACKEND_DEFAULT)
|
||||
CV_Assert(preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_OPENCL);
|
||||
CV_Assert(preferableTarget == DNN_TARGET_CPU || IS_DNN_OPENCL_TARGET(preferableTarget));
|
||||
else if (preferableBackend == DNN_BACKEND_HALIDE)
|
||||
initHalideBackend();
|
||||
else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
|
||||
@ -1357,7 +1372,9 @@ struct Net::Impl
|
||||
|
||||
std::vector<LayerPin> pinsForInternalBlobs;
|
||||
blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs,
|
||||
preferableBackend == DNN_BACKEND_INFERENCE_ENGINE);
|
||||
preferableBackend == DNN_BACKEND_INFERENCE_ENGINE,
|
||||
preferableBackend == DNN_BACKEND_DEFAULT &&
|
||||
preferableTarget == DNN_TARGET_OPENCL_FP16);
|
||||
ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
|
||||
for (int i = 0; i < ld.outputBlobs.size(); ++i)
|
||||
{
|
||||
@ -1427,7 +1444,7 @@ struct Net::Impl
|
||||
// some other layers.
|
||||
|
||||
// TODO: OpenCL target support more fusion styles.
|
||||
if ( preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_OPENCL &&
|
||||
if ( preferableBackend == DNN_BACKEND_DEFAULT && IS_DNN_OPENCL_TARGET(preferableTarget) &&
|
||||
(!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
|
||||
ld.layerInstance->type != "MVN")) )
|
||||
continue;
|
||||
@ -1466,8 +1483,8 @@ struct Net::Impl
|
||||
continue; // Go to the next layer.
|
||||
|
||||
// For now, OpenCL target support fusion with activation of ReLU/ChannelsPReLU/Power/Tanh
|
||||
if ( preferableTarget != DNN_TARGET_OPENCL ||
|
||||
(preferableTarget == DNN_TARGET_OPENCL &&
|
||||
if ( !IS_DNN_OPENCL_TARGET(preferableTarget) ||
|
||||
(IS_DNN_OPENCL_TARGET(preferableTarget) &&
|
||||
nextData &&
|
||||
((nextData->type == "ReLU") ||
|
||||
(nextData->type == "ChannelsPReLU") ||
|
||||
@ -1490,7 +1507,7 @@ struct Net::Impl
|
||||
ld.outputBlobs = layers[lpNext.lid].outputBlobs;
|
||||
ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;
|
||||
|
||||
if ( preferableTarget == DNN_TARGET_OPENCL )
|
||||
if ( IS_DNN_OPENCL_TARGET(preferableTarget) )
|
||||
{
|
||||
if ( !activData->consumers.empty() )
|
||||
{
|
||||
@ -1502,7 +1519,7 @@ struct Net::Impl
|
||||
}
|
||||
|
||||
// fuse convlution layer followed by eltwise + relu
|
||||
if ( preferableTarget == DNN_TARGET_OPENCL )
|
||||
if ( IS_DNN_OPENCL_TARGET(preferableTarget) )
|
||||
{
|
||||
Ptr<EltwiseLayer> nextEltwiseLayer;
|
||||
if( nextData )
|
||||
@ -1715,6 +1732,13 @@ struct Net::Impl
|
||||
for(int i = 0; i < layers[0].outputBlobs.size(); i++)
|
||||
{
|
||||
CV_Assert(layers[0].outputBlobs[i].total());
|
||||
if (layers[0].outputBlobs[i].depth() == CV_32F &&
|
||||
preferableBackend == DNN_BACKEND_DEFAULT &&
|
||||
preferableTarget == DNN_TARGET_OPENCL_FP16)
|
||||
{
|
||||
Mat mat = layers[0].outputBlobs[i].clone();
|
||||
convertFp16(mat, layers[0].outputBlobs[i]);
|
||||
}
|
||||
inputShapes.push_back(shape(layers[0].outputBlobs[i]));
|
||||
}
|
||||
LayersShapesMap layersShapes;
|
||||
@ -1760,7 +1784,7 @@ struct Net::Impl
|
||||
{
|
||||
if( !ld.skip )
|
||||
{
|
||||
if (preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_OPENCL)
|
||||
if (preferableBackend == DNN_BACKEND_DEFAULT && IS_DNN_OPENCL_TARGET(preferableTarget))
|
||||
{
|
||||
std::vector<UMat> umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
|
||||
layer->forward(OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers),
|
||||
@ -1925,7 +1949,14 @@ struct Net::Impl
|
||||
// Transfer data to CPU if it's require.
|
||||
ld.outputBlobsWrappers[pin.oid]->copyToHost();
|
||||
}
|
||||
return ld.outputBlobs[pin.oid];
|
||||
|
||||
if (ld.outputBlobs[pin.oid].depth() == CV_16S)
|
||||
{
|
||||
convertFp16(ld.outputBlobs[pin.oid], output_blob);
|
||||
return output_blob;
|
||||
}
|
||||
else
|
||||
return ld.outputBlobs[pin.oid];
|
||||
}
|
||||
|
||||
Mat getBlob(String outputName)
|
||||
@ -2068,7 +2099,7 @@ void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
|
||||
|
||||
if (outputBlobs.isUMat())
|
||||
{
|
||||
outputBlobs.assign(ld.outputBlobs[pin.oid].getUMat(ACCESS_RW));
|
||||
outputBlobs.assign(impl->getBlob(layerName).getUMat(ACCESS_RW));
|
||||
}
|
||||
else if (outputBlobs.isMat())
|
||||
{
|
||||
@ -2084,17 +2115,33 @@ void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
|
||||
ld.outputBlobsWrappers[i]->copyToHost();
|
||||
}
|
||||
}
|
||||
std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
|
||||
outputvec = ld.outputBlobs;
|
||||
if (ld.outputBlobs[0].depth() == CV_32F)
|
||||
{
|
||||
std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
|
||||
outputvec = ld.outputBlobs;
|
||||
} else {
|
||||
std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
|
||||
outputvec.resize(ld.outputBlobs.size());
|
||||
for (int i = 0; i < outputvec.size(); i++)
|
||||
convertFp16(ld.outputBlobs[i], outputvec[i]);
|
||||
}
|
||||
}
|
||||
else if (outputBlobs.isUMatVector())
|
||||
{
|
||||
std::vector<UMat> & outputvec = *(std::vector<UMat> *)outputBlobs.getObj();
|
||||
|
||||
if (impl->preferableBackend == DNN_BACKEND_DEFAULT &&
|
||||
impl->preferableTarget == DNN_TARGET_OPENCL)
|
||||
IS_DNN_OPENCL_TARGET(impl->preferableTarget))
|
||||
{
|
||||
outputvec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
|
||||
if (impl->preferableTarget == DNN_TARGET_OPENCL)
|
||||
outputvec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
|
||||
else if (impl->preferableTarget == DNN_TARGET_OPENCL_FP16)
|
||||
{
|
||||
std::vector<UMat> out_vec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
|
||||
outputvec.resize(out_vec.size());
|
||||
for (int i = 0; i < out_vec.size(); i++)
|
||||
convertFp16(out_vec[i], outputvec[i]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -2182,6 +2229,16 @@ void Net::setPreferableTarget(int targetId)
|
||||
if( impl->preferableTarget != targetId )
|
||||
{
|
||||
impl->preferableTarget = targetId;
|
||||
if (IS_DNN_OPENCL_TARGET(targetId))
|
||||
{
|
||||
#ifndef HAVE_OPENCL
|
||||
impl->preferableTarget = DNN_TARGET_CPU;
|
||||
#else
|
||||
bool fp16 = ocl::Device::getDefault().isExtensionSupported("cl_khr_fp16");
|
||||
if (!fp16 && targetId == DNN_TARGET_OPENCL_FP16)
|
||||
impl->preferableTarget = DNN_TARGET_OPENCL;
|
||||
#endif
|
||||
}
|
||||
impl->netWasAllocated = false;
|
||||
impl->clear();
|
||||
}
|
||||
@ -2210,7 +2267,17 @@ void Net::setInput(InputArray blob, const String& name)
|
||||
ld.outputBlobs.resize( std::max(pin.oid+1, (int)ld.requiredOutputs.size()) );
|
||||
ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
|
||||
MatShape prevShape = shape(ld.outputBlobs[pin.oid]);
|
||||
Mat blob_ = blob.getMat();
|
||||
Mat blob_;
|
||||
if (impl->preferableBackend == DNN_BACKEND_DEFAULT &&
|
||||
impl->preferableTarget == DNN_TARGET_OPENCL_FP16)
|
||||
{
|
||||
Mat blob_mat = blob.getMat();
|
||||
convertFp16(blob_mat, blob_);
|
||||
}
|
||||
else
|
||||
{
|
||||
blob_ = blob.getMat();
|
||||
}
|
||||
bool oldShape = prevShape == shape(blob_);
|
||||
if (oldShape)
|
||||
{
|
||||
@ -2735,6 +2802,43 @@ void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays
|
||||
CV_TRACE_FUNCTION();
|
||||
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
|
||||
|
||||
if (preferableTarget == DNN_TARGET_OPENCL_FP16 && inputs_arr.depth() == CV_16S)
|
||||
{
|
||||
std::vector<UMat> inputs;
|
||||
std::vector<UMat> outputs;
|
||||
std::vector<UMat> internals;
|
||||
|
||||
std::vector<UMat> orig_inputs;
|
||||
std::vector<UMat> orig_outputs;
|
||||
std::vector<UMat> orig_internals;
|
||||
|
||||
inputs_arr.getUMatVector(orig_inputs);
|
||||
outputs_arr.getUMatVector(orig_outputs);
|
||||
internals_arr.getUMatVector(orig_internals);
|
||||
|
||||
inputs.resize(orig_inputs.size());
|
||||
for (size_t i = 0; i < orig_inputs.size(); i++)
|
||||
convertFp16(orig_inputs[i], inputs[i]);
|
||||
|
||||
outputs.resize(orig_outputs.size());
|
||||
for (size_t i = 0; i < orig_outputs.size(); i++)
|
||||
outputs[i].create(shape(orig_outputs[i]), CV_32F);
|
||||
|
||||
internals.resize(orig_internals.size());
|
||||
for (size_t i = 0; i < orig_internals.size(); i++)
|
||||
internals[i].create(shape(orig_internals[i]), CV_32F);
|
||||
|
||||
forward(inputs, outputs, internals);
|
||||
|
||||
for (size_t i = 0; i < outputs.size(); i++)
|
||||
convertFp16(outputs[i], orig_outputs[i]);
|
||||
|
||||
// sync results back
|
||||
outputs_arr.assign(orig_outputs);
|
||||
internals_arr.assign(orig_internals);
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<Mat> inpvec;
|
||||
std::vector<Mat> outputs;
|
||||
std::vector<Mat> internals;
|
||||
|
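From the user's side, all of the FP16 plumbing above is reached through one call. A hedged C++ sketch (model file names and input are placeholders; per the setPreferableTarget change above, the target silently falls back to DNN_TARGET_OPENCL on devices without cl_khr_fp16):

#include <opencv2/dnn.hpp>

int main()
{
    cv::dnn::Net net = cv::dnn::readNetFromCaffe("deploy.prototxt", "weights.caffemodel");
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL_FP16);  // new target from this merge

    cv::Mat image(cv::Size(224, 224), CV_32FC3, cv::Scalar::all(0));  // placeholder input
    net.setInput(cv::dnn::blobFromImage(image));  // converted to CV_16S internally
    cv::Mat out = net.forward();                  // converted back to CV_32F for the caller
    return 0;
}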
@@ -120,12 +120,16 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inputs_.depth() == CV_16S);
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);

CV_Assert(blobs.size() >= 2);
CV_Assert(inputs.size() == 1);

if (use_half && inputs[0].dims == 2)
return false;

if (umat_weight.empty())
{
umat_weight = weights_.getUMat(ACCESS_READ);

@@ -139,6 +143,7 @@ public:
int rows = inpBlob.dims > 2 ? inpBlob.size[2] : 1;
int cols = inpBlob.dims > 2 ? inpBlob.size[3] : 1;

String opts = (use_half) ? " -DDtype=half" : " -DDtype=float";
for (size_t ii = 0; ii < outputs.size(); ii++)
{
if (inpBlob.dims == 2)

@@ -154,8 +159,12 @@ public:
UMat src = inputs[ii].reshape(1, s.size(), &s[0]);
UMat dst = outputs[ii].reshape(1, s.size(), &s[0]);
int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
String buildopt = format("-DNUM=%d", number);
String buildopt = format("-DNUM=%d", number) + opts;
String kname = format("batch_norm%d", number);
if (number == 1)
buildopt += format(" -Dconvert_T=convert_%s", use_half ? "half" : "float");
else
buildopt += format(" -Dconvert_T=convert_%s%d", use_half ? "half" : "float", number);
ocl::Kernel kernel(kname.c_str(), ocl::dnn::batchnorm_oclsrc, buildopt);
if (kernel.empty())
return false;

@@ -181,7 +190,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

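The batch-norm hunk selects a vectorization width from the channel count and bakes the element type into the OpenCL build options. A worked example of the string it assembles, with an invented channel count of 64 and a half-precision input:

int number = (64 % 8 == 0) ? 8 : ((64 % 4 == 0) ? 4 : 1);              // -> 8
cv::String buildopt = cv::format("-DNUM=%d", number) + " -DDtype=half";
buildopt += cv::format(" -Dconvert_T=convert_%s%d", "half", number);
// buildopt == "-DNUM=8 -DDtype=half -Dconvert_T=convert_half8"
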
@@ -95,7 +95,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -128,14 +128,14 @@ public:
for( i = 0; i < ninputs; i++ )
{
Mat& inp = *inputs[i];
CV_Assert( inp.isContinuous() && inp.type() == CV_32F &&
CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S) &&
inp.dims == 4 && inp.size[0] == output.size[0] &&
inp.size[2] == output.size[2] &&
inp.size[3] == output.size[3] );
nchannels += inp.size[1];
}
CV_Assert( nchannels == output.size[1] );
CV_Assert( output.isContinuous() && output.type() == CV_32F );
CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S) );

cc.chptrs.resize(nchannels*batchsz);

@@ -186,6 +186,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);

@@ -199,11 +200,12 @@ public:
int num_concats = total(shape(inputs[0]), 0, cAxis);
int offset_concat_axis = 0;
UMat& outMat = outputs[0];
String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0].type()) + String(" ");
String buildopt = format(" -DDtype=%s", (use_half) ? "half" : "float");
String kname = format("concat_%s", use_half ? "half" : "float");

for (size_t i = 0; i < inputs.size(); i++)
{
ocl::Kernel kernel("concat", ocl::dnn::concat_oclsrc, buildopt);
ocl::Kernel kernel(kname.c_str(), ocl::dnn::concat_oclsrc, buildopt);
if (kernel.empty())
return false;

@@ -235,7 +237,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -94,7 +94,7 @@ public:
CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height);

const Mat &input = *inputs[0];
CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F));
CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F || input.type() == CV_16S));
for (size_t i = 0; i < inputs.size(); i++)
{
CV_Assert(inputs[i]->type() == input.type());

@@ -288,7 +288,7 @@ public:
newActiv = true;
activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;

if (preferableTarget == DNN_TARGET_OPENCL)
if (IS_DNN_OPENCL_TARGET(preferableTarget))
{
Ptr<PowerLayer> activ_power = activ.dynamicCast<PowerLayer>();
if (!activ_power.empty())

@@ -842,6 +842,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);

@@ -860,6 +861,7 @@ public:
config.dilation = dilation;
config.group = inputs[0].size[1] / umat_blobs[0].size[1];
config.bias_term = (hasBias()) ? true : false;
config.use_half = use_half;

convolutionOp = Ptr<OCL4DNNConvSpatial<float> >(new OCL4DNNConvSpatial<float>(config));
}

@@ -964,7 +966,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -1360,6 +1362,9 @@ public:
std::vector<UMat> outputs;
std::vector<UMat> internals;

if (inputs_.depth() == CV_16S)
return false;

inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals);

@@ -1450,7 +1455,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -307,8 +307,24 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
bool use_half = (inps.depth() == CV_16S);
if (use_half)
{
std::vector<UMat> orig_inputs;
std::vector<UMat> orig_outputs;

inps.getUMatVector(orig_inputs);
outs.getUMatVector(orig_outputs);

inputs.resize(orig_inputs.size());
for (size_t i = 0; i < orig_inputs.size(); i++)
convertFp16(orig_inputs[i], inputs[i]);
}
else
{
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
}

std::vector<LabelBBox> allDecodedBBoxes;
std::vector<Mat> allConfidenceScores;

@@ -342,7 +358,13 @@ public:
{
// Set confidences to zeros.
Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)};
outputs[0](ranges).setTo(0);
if (use_half)
{
std::vector<UMat> orig_outputs;
outs.getUMatVector(orig_outputs);
orig_outputs[0](ranges).setTo(0);
} else
outputs[0](ranges).setTo(0);
return true;
}
int outputShape[] = {1, 1, (int)numKept, 7};

@@ -360,9 +382,23 @@ public:
}
CV_Assert(count == numKept);
}
outputs.clear();
outputs.push_back(umat);
outs.assign(outputs);

if (use_half)
{
UMat half_umat;
convertFp16(umat, half_umat);

std::vector<UMat> orig_outputs;
outs.getUMatVector(orig_outputs);
orig_outputs.clear();
orig_outputs.push_back(half_umat);
outs.assign(orig_outputs);
} else {
outputs.clear();
outputs.push_back(umat);
outs.assign(outputs);
}

return true;
}
#endif

@@ -372,7 +408,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -176,7 +176,7 @@ public:
{
CV_TRACE_FUNCTION();

CV_OCL_RUN((this->preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(this->preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
func.applyOCL(inputs_arr, outputs_arr, internals_arr))

@@ -223,7 +223,12 @@ public:
#ifdef HAVE_OPENCL
static String oclGetTMacro(const UMat &m)
{
return String("-DT=") + ocl::typeToStr(m.type()) + String(" ");
String str_name = ocl::typeToStr(m.type());

if (str_name == "short")
str_name = "half";

return format("-DT=%s -Dconvert_T=convert_%s ", str_name.c_str(), str_name.c_str());
}
#endif

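FP16 blobs travel through the DNN module in CV_16S storage, so ocl::typeToStr reports them as "short"; the remap above is what lets the activation kernels compile against OpenCL's native half type. A sketch of the resulting macro strings, under that assumption:

// For a CV_16S UMat carrying FP16 data:
//   oclGetTMacro(m) -> "-DT=half -Dconvert_T=convert_half "
// For a CV_32F UMat:
//   oclGetTMacro(m) -> "-DT=float -Dconvert_T=convert_float "
cv::UMat m(1, 16, CV_16S);
CV_Assert(cv::String(cv::ocl::typeToStr(m.type())) == "short");
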
@@ -516,8 +521,28 @@ struct SigmoidFunctor
#ifdef HAVE_OPENCL
bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{
// TODO: implement OCL version
return false;
std::vector<UMat> inputs;
std::vector<UMat> outputs;

inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
String buildopt = oclGetTMacro(inputs[0]);

for (size_t i = 0; i < inputs.size(); i++)
{
UMat& src = inputs[i];
UMat& dst = outputs[i];

ocl::Kernel kernel("SigmoidForward", ocl::dnn::activations_oclsrc, buildopt);
kernel.set(0, (int)src.total());
kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));

size_t gSize = src.total();
CV_Assert(kernel.run(1, &gSize, NULL, false));
}

return true;
}
#endif

@@ -561,8 +586,28 @@ struct ELUFunctor
#ifdef HAVE_OPENCL
bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{
// TODO: implement OCL version
return false;
std::vector<UMat> inputs;
std::vector<UMat> outputs;

inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
String buildopt = oclGetTMacro(inputs[0]);

for (size_t i = 0; i < inputs.size(); i++)
{
UMat& src = inputs[i];
UMat& dst = outputs[i];

ocl::Kernel kernel("ELUForward", ocl::dnn::activations_oclsrc, buildopt);
kernel.set(0, (int)src.total());
kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));

size_t gSize = src.total();
CV_Assert(kernel.run(1, &gSize, NULL, false));
}

return true;
}
#endif

@@ -604,8 +649,28 @@ struct AbsValFunctor
#ifdef HAVE_OPENCL
bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{
// TODO: implement OCL version
return false;
std::vector<UMat> inputs;
std::vector<UMat> outputs;

inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
String buildopt = oclGetTMacro(inputs[0]);

for (size_t i = 0; i < inputs.size(); i++)
{
UMat& src = inputs[i];
UMat& dst = outputs[i];

ocl::Kernel kernel("AbsValForward", ocl::dnn::activations_oclsrc, buildopt);
kernel.set(0, (int)src.total());
kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));

size_t gSize = src.total();
CV_Assert(kernel.run(1, &gSize, NULL, false));
}

return true;
}
#endif

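The three activation bodies above are identical except for the kernel entry point; each launches one work-item per element with the type macro from oclGetTMacro. A shared helper expressing the common pattern might look like this (a hypothetical refactoring, not part of the patch):

static bool runUnaryOCL(const char* kname, cv::InputArrayOfArrays inps,
                        cv::OutputArrayOfArrays outs)
{
    std::vector<cv::UMat> inputs, outputs;
    inps.getUMatVector(inputs);
    outs.getUMatVector(outputs);
    cv::String buildopt = oclGetTMacro(inputs[0]);   // "-DT=half ..." or "-DT=float ..."
    for (size_t i = 0; i < inputs.size(); i++)
    {
        cv::ocl::Kernel kernel(kname, cv::ocl::dnn::activations_oclsrc, buildopt);
        if (kernel.empty())
            return false;
        kernel.set(0, (int)inputs[i].total());
        kernel.set(1, cv::ocl::KernelArg::PtrReadOnly(inputs[i]));
        kernel.set(2, cv::ocl::KernelArg::PtrWriteOnly(outputs[i]));
        size_t gSize = inputs[i].total();            // one work-item per element
        if (!kernel.run(1, &gSize, NULL, false))
            return false;
    }
    return true;
}

With such a helper the three bodies would reduce to runUnaryOCL("SigmoidForward", ...), runUnaryOCL("ELUForward", ...) and runUnaryOCL("AbsValForward", ...).
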
@@ -271,6 +271,9 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

if (inputs_.depth() == CV_16S && op != SUM)
return false;

inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);

@@ -284,10 +287,15 @@ public:
{
size_t localsize[] = { 128 };
size_t globalsize[] = { (size_t)channels / 4 * localsize[0] };
String opts;
if (inputs_.depth() == CV_16S)
opts = " -DDtype=half -DDtype4=half4 -DDtype8=half8";
else
opts = " -DDtype=float -DDtype4=float4 -DDtype8=float8";

for (int i = 0; i < (inputs.size() - 1); ++i)
{
String buildopt = format("-DLOOP=%d", i);
String buildopt = format("-DLOOP=%d", i) + opts;
ocl::Kernel kernel("op_sum4", ocl::dnn::eltwise_oclsrc, buildopt);
int idx = 0;
UMat inpMat = (i == 0) ? inputs[0] : UMat();

@@ -306,6 +314,9 @@ public:
}
else
{
if (inputs_.depth() == CV_16S)
return false;

float coeff1 = coeffs.empty() ? 1.f : coeffs[0];
float coeff2 = coeffs.empty() ? 1.f : coeffs[1];
UMat mul0, mul1;

@@ -343,7 +354,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -140,7 +140,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
outputs_arr.isUMatVector() &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -64,6 +64,7 @@ public:
#ifdef HAVE_OPENCL
Ptr<OCL4DNNInnerProduct<float> > innerProductOp;
std::vector<UMat> umat_blobs;
std::vector<UMat> half_blobs;
#endif

FullyConnectedLayerImpl(const LayerParams& params)

@@ -277,6 +278,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);

@@ -293,6 +295,17 @@ public:
config.bias_term = bias;
config.M = outerSize;
config.K = innerSize;
config.use_half = use_half;

if (use_half)
{
half_blobs.resize(umat_blobs.size());
for (int i = 0; i < umat_blobs.size(); i++)
{
if (!umat_blobs[i].empty())
convertFp16(umat_blobs[i], half_blobs[i]);
}
}

innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config));
}

@@ -309,13 +322,15 @@ public:
dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);
dstMat.setTo(0.0f);

if (!innerProductOp->Forward(srcMat, umat_blobs[0], (bias) ? umat_blobs[1] : UMat(), dstMat))
if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0],
(bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(),
dstMat))
{
ret = false;
break;
}

if (bias && (outerSize > 1))
if (!use_half && bias && (outerSize > 1))
{
UMat& biases = umat_blobs[1];
cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);

@@ -353,7 +368,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

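The inner-product hunks convert the FP32 weight and bias blobs to FP16 once, when the OCL4DNNInnerProduct object is configured, rather than on every forward pass; each call then just selects the matching-precision blob. Condensed for illustration (names taken from the surrounding class):

// one-time setup
if (use_half)
{
    half_blobs.resize(umat_blobs.size());
    for (size_t i = 0; i < umat_blobs.size(); i++)
        if (!umat_blobs[i].empty())
            convertFp16(umat_blobs[i], half_blobs[i]);   // cache the FP16 copy
}
// per forward pass
const UMat& w = use_half ? half_blobs[0] : umat_blobs[0];
UMat b = bias ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat();
if (!innerProductOp->Forward(srcMat, w, b, dstMat))
    return false;

Note the separate gemm-based bias broadcast stays FP32-only (the new `!use_half && bias` guard), since cv::gemm has no half-precision path.
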
@@ -106,6 +106,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);

@@ -128,6 +129,7 @@ public:
config.height = inputs[0].size[2];
config.width = inputs[0].size[3];
config.norm_by_size = normBySize;
config.use_half = use_half;

lrnOp = Ptr<OCL4DNNLRN<float> >(new OCL4DNNLRN<float>(config));
}

@@ -146,7 +148,7 @@ public:

CV_Assert(inputs_arr.total() == outputs_arr.total());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -102,6 +102,9 @@ public:
{
UMat bnorm_weight = scale.empty() ? UMat() : scale.getUMat(ACCESS_READ);
UMat bnorm_bias = shift.empty() ? UMat() : shift.getUMat(ACCESS_READ);
bool use_half = (inputs[0].depth() == CV_16S);
String opts = format(" -DT=%s -DT4=%s -Dconvert_T=%s", use_half ? "half" : "float",
use_half ? "half4" : "float4", use_half ? "convert_half4" : "convert_float4");

int splitDim = (acrossChannels) ? 1 : 2;
for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)

@@ -111,12 +114,11 @@ public:
int newRows = total(shape(inpMat), 0, splitDim);

MatShape s = shape(newRows, inpMat.total() / newRows);
UMat oneMat = UMat::ones(s[1], 1, CV_32F);
UMat meanMat = UMat(s[0], 1, CV_32F);
UMat meanMat = UMat(s[0], 1, (use_half) ? CV_16S : CV_32F);
UMat tmpMat = UMat(s[0], s[1], CV_32F);
float alpha = 1.0f / s[1];

String buildopt = "-DNUM=4";
String buildopt = "-DNUM=4" + opts;
ocl::Kernel k("mean_fuse4", ocl::dnn::mvn_oclsrc, buildopt);
size_t localsize[] = { 128 };
size_t globalsize[] = { (size_t)s[0] / 4 * localsize[0] };

@@ -167,13 +169,14 @@ public:
int row_size = total(shape(inputs[0]), 0, splitDim);
int plane_size = total(shape(inputs[0]), splitDim);
if (normVariance && (row_size % 4 == 0) && (plane_size % 4 == 0))
{
bool ret = fast_forward_ocl(inputs, outputs);
return ret;
}
return fast_forward_ocl(inputs, outputs);

if (inputs[0].depth() == CV_16S)
return false;

UMat bnorm_weight = scale.empty() ? UMat() : scale.getUMat(ACCESS_READ);
UMat bnorm_bias = shift.empty() ? UMat() : shift.getUMat(ACCESS_READ);
String opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4");

for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
{

@@ -195,7 +198,7 @@ public:

int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
size_t global[] = { (size_t)s[0], (size_t)(s[1] / number) };
String buildopt = format("-DNUM=%d", number);
String buildopt = format("-DNUM=%d", number) + opts;
if (normVariance)
{
String kname = format("calc_mean%d", number);

@@ -249,7 +252,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -87,6 +87,9 @@ public:
std::vector<UMat> outputs;
std::vector<UMat> internals;

if (inputs_.depth() == CV_16S)
return false;

inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals);

@@ -162,7 +165,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -288,9 +288,11 @@ public:
if (!_needsPermute)
return false;

bool use_half = (inps.depth() == CV_16S);
String opts = format("-DDtype=%s", use_half ? "half" : "float");
for (size_t i = 0; i < inputs.size(); i++)
{
ocl::Kernel kernel("permute", ocl::dnn::permute_oclsrc);
ocl::Kernel kernel("permute", ocl::dnn::permute_oclsrc, opts);

kernel.set(0, (int)_count);
kernel.set(1, ocl::KernelArg::PtrReadOnly(inputs[i]));

@@ -313,7 +315,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -147,6 +147,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);

@@ -164,6 +165,7 @@ public:
(type == AVE ? LIBDNN_POOLING_METHOD_AVE :
LIBDNN_POOLING_METHOD_STO);
config.avePoolPaddedArea = avePoolPaddedArea;
config.use_half = use_half;
poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
}

@@ -189,7 +191,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -316,6 +316,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);

@@ -340,9 +341,15 @@ public:
heights.copyTo(umat_heights);
}

size_t nthreads = _layerHeight * _layerWidth;
String opts;
if (use_half)
opts = "-DDtype=half -DDtype4=half4 -Dconvert_T=convert_half4";
else
opts = "-DDtype=float -DDtype4=float4 -Dconvert_T=convert_float4";

size_t nthreads = _layerHeight * _layerWidth;
ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc, opts);

ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc);
kernel.set(0, (int)nthreads);
kernel.set(1, (float)_stepX);
kernel.set(2, (float)_stepY);

@@ -375,7 +382,7 @@ public:

// set the variance.
{
ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc);
ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc, opts);
int offset = total(shape(outputs[0]), 2);
size_t nthreads = _layerHeight * _layerWidth * _numPriors;
kernel.set(0, (int)nthreads);

@@ -395,7 +402,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -158,6 +158,9 @@ public:
std::vector<UMat> outputs;
std::vector<UMat> internals;

if (inputs_.depth() == CV_16S)
return false;

inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals);

@@ -237,7 +240,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -127,7 +127,7 @@ public:
std::vector<UMat> outputs;

// TODO: implement a logistic activation to classification scores.
if (useLogistic)
if (useLogistic || inps.depth() == CV_16S)
return false;

inps.getUMatVector(inputs);

@@ -191,7 +191,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -96,9 +96,10 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0].type()) + String(" ");
String buildopt= format("-DDtype=%s ", use_half ? "half" : "float");

for (size_t i = 0; i < inputs.size(); i++)
{

@@ -134,7 +135,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -219,7 +219,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -181,6 +181,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inputs_.depth() == CV_16S);
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);

@@ -188,6 +189,11 @@ public:
(total(shape(outputs[0]), 2) % 4 != 0))
return false;

String opts;
if (use_half)
opts = "-DDtype=half -DDtype4=half4 -DDtype8=half8";
else
opts = "-DDtype=float -DDtype4=float4 -DDtype8=float8";
const UMat& inpMat = inputs[0];
for (size_t i = 0; i < outputs.size(); i++)
{

@@ -196,7 +202,7 @@ public:
int rows = outputs[i].size[2];
int cols = outputs[i].size[3];

ocl::Kernel kernel("slice", ocl::dnn::slice_oclsrc);
ocl::Kernel kernel("slice", ocl::dnn::slice_oclsrc, opts);
size_t local[] = { 128 };
size_t global[] = { (size_t)groups * channels / 4 * local[0] };
int idx = 0;

@@ -222,7 +228,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

@@ -99,15 +99,16 @@ public:
softmaxOp.release();
}

bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays itns)
bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
std::vector<UMat> internals;

inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
itns.getUMatVector(internals);
bool use_half = (inputs_.depth() == CV_16S);
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals);

if (softmaxOp.empty())
{

@@ -117,6 +118,7 @@ public:
config.axis = axisRaw;
config.channels = inputs[0].size[axisRaw];
config.logsoftmax = logSoftMax;
config.use_half = use_half;

softmaxOp = Ptr<OCL4DNNSoftmax<float> >(new OCL4DNNSoftmax<float>(config));
}

@@ -128,15 +130,13 @@ public:
return true;

UMat& bufMat = internals[0];
src.copyTo(dstMat);

int axis = clamp(axisRaw, src.dims);
MatShape s = shape(src);
size_t outerSize = total(s, 0, axis);
size_t channels = src.size[axis];
size_t innerSize = total(s, axis + 1);

String buildOpts = String("-DT=") + ocl::typeToStr(src.type());
String buildOpts = format("-DT=%s", use_half ? "half" : "float");
ocl::Kernel kmax, ksub, ksum, kdiv;

if (!kmax.create("kernel_channel_max", ocl::dnn::softmax_oclsrc, buildOpts))

@@ -152,38 +152,31 @@ public:
if (!kdiv.create("kernel_channel_div", ocl::dnn::softmax_oclsrc, buildOpts))
return false;

size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize();
size_t bufSize = internals[0].total();
size_t totalSize = src.total();

// adjust local/global size
size_t internal_localSize[1] = { (bufSize == 1) ? 1 : wgSize };
size_t internal_globalSize[1] = { divUp(bufSize, (unsigned int)internal_localSize[0]) * internal_localSize[0] };

// adjust local/global size (total)
size_t total_localSize[1] = { (totalSize == 1) ? 1 : wgSize };
size_t total_globalSize[1] = { divUp(totalSize, (unsigned int)total_localSize[0]) * total_localSize[0] };
size_t internal_globalSize[1] = { bufSize };
size_t total_globalSize[1] = { totalSize };

kmax.args((int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat));
if (!kmax.run(1, internal_globalSize, internal_localSize, false))
ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrReadWrite(bufMat));
if (!kmax.run(1, internal_globalSize, NULL, false))
return false;

ksub.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat));
if (!ksub.run(1, total_globalSize, total_localSize, false))
ocl::KernelArg::PtrReadOnly(bufMat),
ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(dstMat));
if (!ksub.run(1, total_globalSize, NULL, false))
return false;

cv::exp(dstMat, dstMat);

ksum.args((int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat));
if (!ksum.run(1, internal_globalSize, internal_localSize, false))
if (!ksum.run(1, internal_globalSize, NULL, false))
return false;

kdiv.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat));
if (!kdiv.run(1, total_globalSize, total_localSize, false))
if (!kdiv.run(1, total_globalSize, NULL, false))
return false;

return true;

@@ -195,7 +188,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))

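The four kernels above implement the numerically stable softmax decomposition: kernel_channel_max finds the per-slice maximum, kernel_channel_sub subtracts it, cv::exp exponentiates, and kernel_channel_sum / kernel_channel_div normalize. In formula form:

softmax(x)_i = exp(x_i - m) / \sum_j exp(x_j - m),  where m = \max_j x_j

Subtracting m matters doubly for FP16, whose exp() overflows already near x = 11. Passing NULL as the local size now lets the OpenCL runtime choose the workgroup shape, which is why the old maxWorkGroupSize-based rounding of the global sizes could be dropped.
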
@@ -8,6 +8,8 @@
#include "precomp.hpp"
#include "nms.inl.hpp"

#include <opencv2/imgproc.hpp>

namespace cv
{
namespace dnn

@@ -28,6 +30,27 @@ void NMSBoxes(const std::vector<Rect>& bboxes, const std::vector<float>& scores,
NMSFast_(bboxes, scores, score_threshold, nms_threshold, eta, top_k, indices, rectOverlap);
}

static inline float rotatedRectIOU(const RotatedRect& a, const RotatedRect& b)
{
std::vector<Point2f> inter;
int res = rotatedRectangleIntersection(a, b, inter);
if (inter.empty() || res == INTERSECT_NONE)
return 0.0f;
if (res == INTERSECT_FULL)
return 1.0f;
float interArea = contourArea(inter);
return interArea / (a.size.area() + b.size.area() - interArea);
}

void NMSBoxes(const std::vector<RotatedRect>& bboxes, const std::vector<float>& scores,
const float score_threshold, const float nms_threshold,
std::vector<int>& indices, const float eta, const int top_k)
{
CV_Assert(bboxes.size() == scores.size(), score_threshold >= 0,
nms_threshold >= 0, eta > 0);
NMSFast_(bboxes, scores, score_threshold, nms_threshold, eta, top_k, indices, rotatedRectIOU);
}

CV__DNN_EXPERIMENTAL_NS_END
}// dnn
}// cv

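The new overload plugs rotatedRectIOU into the same NMSFast_ template the axis-aligned path uses, computing intersection-over-union from rotatedRectangleIntersection and contourArea (hence the new imgproc include). A usage sketch with invented values:

std::vector<cv::RotatedRect> boxes;
boxes.push_back(cv::RotatedRect(cv::Point2f(50, 50), cv::Size2f(40, 20), 30.f));
boxes.push_back(cv::RotatedRect(cv::Point2f(52, 51), cv::Size2f(40, 20), 32.f));  // near-duplicate
std::vector<float> scores;
scores.push_back(0.9f);
scores.push_back(0.8f);
std::vector<int> keep;
cv::dnn::NMSBoxes(boxes, scores, /*score_threshold=*/0.5f, /*nms_threshold=*/0.4f, keep);
// expected: keep == {0}; the heavily overlapping, lower-scored box is suppressed
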
@@ -59,7 +59,8 @@ struct OCL4DNNConvConfig
stride(1, 1),
dilation(1, 1),
group(1),
bias_term(false)
bias_term(false),
use_half(false)
{}
MatShape in_shape;
MatShape out_shape;

@@ -69,6 +70,7 @@ struct OCL4DNNConvConfig
Size dilation;
int group; // = 1;
bool bias_term; // = false;
bool use_half; // = false;
};

typedef enum {

@@ -272,6 +274,8 @@ class OCL4DNNConvSpatial
int32_t group_;
bool bias_term_;
UMat swizzled_weights_umat;
UMat weights_half;
UMat bias_half;
UMat bottom_data2_;

int32_t bottom_index_;

@@ -327,6 +331,7 @@ class OCL4DNNConvSpatial
ocl4dnnFusedActiv_t fused_activ_;
float power_;
bool fused_eltwise_;
bool use_half_;
};

typedef enum {

@@ -345,7 +350,8 @@ struct OCL4DNNPoolConfig
channels(0),
pool_method(LIBDNN_POOLING_METHOD_MAX),
global_pooling(false),
avePoolPaddedArea(false)
avePoolPaddedArea(true),
use_half(false)
{}
MatShape in_shape;
MatShape out_shape;

@@ -358,6 +364,7 @@ struct OCL4DNNPoolConfig
ocl4dnnPoolingMethod_t pool_method; // = LIBDNN_POOLING_METHOD_MAX;
bool global_pooling; // = false;
bool avePoolPaddedArea;
bool use_half;
};

template<typename Dtype>

@@ -391,13 +398,14 @@ class OCL4DNNPool
int32_t pooled_height_;
int32_t pooled_width_;
bool avePoolPaddedArea;
bool use_half;
};

struct OCL4DNNInnerProductConfig
{
OCL4DNNInnerProductConfig() :
num_output(0), M(0), K(0),
bias_term(false), transpose(false), phase_test(true)
bias_term(false), transpose(false), phase_test(true), use_half(false)
{}
int num_output;
int M;

@@ -405,6 +413,7 @@ struct OCL4DNNInnerProductConfig
bool bias_term;
bool transpose; // = false;
bool phase_test; // = true;
bool use_half; // = false;
};

template<typename Dtype>

@@ -428,6 +437,7 @@ class OCL4DNNInnerProduct
bool transpose_;
bool image_copied_;
bool phase_test_;
bool use_half_;
};

typedef enum {

@@ -441,7 +451,7 @@ struct OCL4DNNLRNConfig
lrn_type(LRNParameter_NormRegion_ACROSS_CHANNELS),
phase_test(true),
local_size(0), alpha(0.f), beta(0.f), k(0.f), norm_by_size(false),
batch_size(0), channels(0), height(0), width(0)
batch_size(0), channels(0), height(0), width(0), use_half(false)
{}
MatShape in_shape;
LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type;

@@ -455,6 +465,7 @@ struct OCL4DNNLRNConfig
int32_t channels;
int32_t height;
int32_t width;
bool use_half;
};

template<typename Dtype>

@@ -477,16 +488,18 @@ class OCL4DNNLRN
int32_t height_;
int32_t width_;
bool norm_by_size_;
bool use_half_;
};

struct OCL4DNNSoftmaxConfig
{
OCL4DNNSoftmaxConfig() : axis(0), channels(0), logsoftmax(false)
OCL4DNNSoftmaxConfig() : axis(0), channels(0), logsoftmax(false), use_half(false)
{}
MatShape in_shape;
int axis;
int channels;
bool logsoftmax;
bool use_half;
};

template<typename Dtype>

@@ -506,6 +519,7 @@ class OCL4DNNSoftmax
bool use_slm_;
bool log_softmax_;
UMat scale_data_;
bool use_half_;
};

}}} // namespace cv::dnn::ocl4dnn

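Every ocl4dnn config struct gains a use_half flag that defaults to false, so existing float callers compile and behave unchanged; a layer opts in explicitly when it builds its op. An illustrative opt-in, restricted to fields visible in the hunks above (shape values invented):

OCL4DNNConvConfig config;
config.in_shape  = cv::dnn::shape(1, 32, 56, 56);   // NCHW input
config.out_shape = cv::dnn::shape(1, 64, 56, 56);
config.group     = 1;
config.bias_term = true;
config.use_half  = true;   // request FP16 kernels; the default stays false
cv::Ptr<OCL4DNNConvSpatial<float> > op(new OCL4DNNConvSpatial<float>(config));
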
@@ -48,6 +48,12 @@

namespace cv { namespace dnn { namespace ocl4dnn {

enum gemm_data_type_t
{
TYPE_FLOAT = 1,
TYPE_HALF = 2
};

// Create and copy buffer to image for GEMM's matrix A and B.
// Will return image to caller if the input image is NULL. Otherwise,
// will use the image directly. It's caller's responsibility to

@@ -60,6 +66,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
int width, int ld)
{
ocl::Image2D image;
String opts = format("-DTYPE=%d", TYPE_FLOAT);

if (!is_matrix_a && transpose)
{

@@ -73,7 +80,8 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
UMat mat(height, width, CV_32FC1);
image = ocl::Image2D(mat);

ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_transpose_float", ocl::dnn::gemm_image_oclsrc);
ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_transpose_float",
ocl::dnn::gemm_image_oclsrc, opts);

size_t global_copy[2];
global_copy[0] = width;

@@ -96,7 +104,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
image = ocl::Image2D(mat);

ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_no_transpose_float",
ocl::dnn::gemm_image_oclsrc);
ocl::dnn::gemm_image_oclsrc, opts);

size_t global_copy[2];
global_copy[0] = padded_width;

@@ -129,7 +137,7 @@ enum gemm_type_t
GEMM_TYPE_FAST_IMAGE_32_1,
GEMM_TYPE_FAST_IMAGE_32_2,
GEMM_TYPE_FAST_IMAGE_B_IMAGE,
GEMM_TYPE_MAX
GEMM_TYPE_FAST_BUFFER
};

template<typename Dtype>

@@ -145,6 +153,8 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
CHECK_EQ(gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_32_2 ||
gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE, true) << "Invalid fast image gemm type." << std::endl;

bool halfPrecisionMode = (A.depth() == CV_16S);

if (is_image_a)
{
CHECK_EQ(offA, 0) << "Invalid input image offset." << std::endl;

@@ -157,6 +167,7 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
return false;
}

String opts = format("-DTYPE=%d", halfPrecisionMode ? TYPE_HALF : TYPE_FLOAT);
int widthA = (TransA == CblasNoTrans) ? K : M;
int heightA = (TransA == CblasNoTrans) ? M : K;
int widthB = (TransB == CblasNoTrans) ? N : K;

@@ -178,7 +189,7 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
int blockC_width = blocksize;
int blockC_height = blocksize;

int use_buffer_indicator = 8;
int use_buffer_indicator = (halfPrecisionMode) ? 16 : 8;
// To fix the edge problem caused by the sub group block read.
// we have to pad the image if it's not multiple of tile.
// just padding one line is enough as the sub group block read

@@ -221,9 +232,13 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
else
kernel_name += "1";

kernel_name += "_float";
if (halfPrecisionMode) {
kernel_name += "_half";
} else {
kernel_name += "_float";
}

ocl::Kernel oclk_gemm_float(kernel_name.c_str(), ocl::dnn::gemm_image_oclsrc);
ocl::Kernel oclk_gemm_float(kernel_name.c_str(), ocl::dnn::gemm_image_oclsrc, opts);
if (oclk_gemm_float.empty())
return false;

@@ -255,6 +270,10 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
bool padding_A = false;
bool padding_B = false;

if (halfPrecisionMode && is_image_b) {
padding_A = true;
}

if (!is_image_a && !is_image_b)
{
if (M * K < N * K)

@@ -265,17 +284,19 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,

if (!is_image_a)
{
ImA = ocl4dnnGEMMCopyBufferToImage<Dtype>(A, blockA_offset,
true, TransA != CblasNoTrans,
padding_A, imageA_h, imageA_w,
blockA_height, blockA_width, ldA);
if (!halfPrecisionMode)
ImA = ocl4dnnGEMMCopyBufferToImage<Dtype>(A, blockA_offset,
true, TransA != CblasNoTrans,
padding_A, imageA_h, imageA_w,
blockA_height, blockA_width, ldA);
}
if (!is_image_b)
{
ImB = ocl4dnnGEMMCopyBufferToImage<Dtype>(B, blockB_offset,
false, false,
padding_B, imageB_h, imageB_w,
blockB_height, blockB_width, ldB);
if (!halfPrecisionMode)
ImB = ocl4dnnGEMMCopyBufferToImage<Dtype>(B, blockB_offset,
false, false,
padding_B, imageB_h, imageB_w,
blockB_height, blockB_width, ldB);
}
} else {
// We will use normal read_imagef to read image B when B has transpose.

@@ -283,32 +304,48 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
if (!is_image_a)
{
bool padding;
padding = !is_image_b;
ImA = ocl4dnnGEMMCopyBufferToImage<Dtype>(A, blockA_offset,
true, TransA != CblasNoTrans,
padding, imageA_h, imageA_w,
blockA_height, blockA_width, ldA);
padding = !is_image_b || halfPrecisionMode;
if (!halfPrecisionMode)
ImA = ocl4dnnGEMMCopyBufferToImage<Dtype>(A, blockA_offset,
true, TransA != CblasNoTrans,
padding, imageA_h, imageA_w,
blockA_height, blockA_width, ldA);
}

if (!is_image_b && (K % use_buffer_indicator != 0))
{
ImB = ocl4dnnGEMMCopyBufferToImage<Dtype>(B, blockB_offset,
false, true, false, imageB_h, imageB_w,
blockB_height, blockB_width, ldB);
if (!halfPrecisionMode)
ImB = ocl4dnnGEMMCopyBufferToImage<Dtype>(B, blockB_offset,
false, true, false,
imageB_h, imageB_w,
blockB_height, blockB_width, ldB);
}
}

size_t global[2];
if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE)
{
global[0] = (size_t)( blockC_width + 7 ) & ~7;
if (halfPrecisionMode) {
global[0] = (size_t)( blockC_width + 15 ) & ~15;
} else {
global[0] = (size_t)( blockC_width + 7 ) & ~7;
}
} else {
global[0] = (size_t)( (blockC_width / 2 ) + 7 ) ^ ~7;
if (halfPrecisionMode) {
global[0] = (size_t)( (blockC_width / 2 ) + 15 ) ^ ~15;
} else {
global[0] = (size_t)( (blockC_width / 2 ) + 7 ) ^ ~7;
}
}
global[1] = (size_t)(blockC_height + 31) / 32;

size_t local[2];
local[0] = 8;
if (halfPrecisionMode)
{
local[0] = 16;
} else {
local[0] = 8;
}
local[1] = 1;

cl_uint arg_idx = 0;

@@ -385,6 +422,101 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
return true;
}

template<typename Dtype>
static bool ocl4dnnFastBufferGEMM(const CBLAS_TRANSPOSE TransA,
const CBLAS_TRANSPOSE TransB, const int32_t M,
const int32_t N, const int32_t K, const Dtype alpha,
const UMat A, const int32_t offA, const UMat B,
const int32_t offB, const Dtype beta, UMat C,
const int32_t offC, enum gemm_type_t gemm_type)
{
CHECK_EQ(gemm_type == GEMM_TYPE_FAST_BUFFER, true)
<< "Invalid fast buffer gemm type." << std::endl;

bool halfPrecisionMode = (A.depth() == CV_16S);

size_t sub_group_size = 8;
bool is_small_batch = (M == 2 || M == 4 || M == 8);
String kernel_name("gemm_buffer_");
if (TransA == CblasNoTrans && TransB == CblasNoTrans) {
kernel_name += "NN";
if (halfPrecisionMode) {
sub_group_size = 16;
}
} else if (TransA == CblasNoTrans && TransB != CblasNoTrans) {
if (M == 2)
kernel_name +="NT_M_2";
else if (M == 4)
kernel_name +="NT_M_4";
else if (M == 8)
kernel_name +="NT_M_8";
else
kernel_name += "NT";
}

if (halfPrecisionMode) {
kernel_name += "_half";
} else {
kernel_name += "_float";
}

String opts = format("-DTYPE=%d", halfPrecisionMode ? TYPE_HALF : TYPE_FLOAT);
ocl::Kernel oclk_gemm_float(kernel_name.c_str(), ocl::dnn::gemm_buffer_oclsrc, opts);
size_t local[2] = {};
size_t global[2] = {};
if (TransA == CblasNoTrans && TransB != CblasNoTrans && is_small_batch) {
if (M == 8)
local[0] = 16;
else if (M == 4)
local[0] = 32;
else
local[0] = 64;
local[1] = 1;

if (M == 8)
global[0] = N * local[0];
else
global[0] = (N + 3) / 4 * local[0];
global[1] = 1;
} else {
size_t lx = sub_group_size;
size_t ly = (TransB != CblasNoTrans && TransA == CblasNoTrans && halfPrecisionMode) ? 2 : 4;
int dx = (TransB != CblasNoTrans && TransA == CblasNoTrans) ? 1 : 4;
int dy = 8;
size_t gx = (size_t)(N + dx - 1) / dx;
size_t gy = (size_t)(M + dy - 1) / dy;
global[0] = (gx + lx - 1) / lx * lx;
global[1] = (gy + ly - 1) / ly * ly;
local[0] = lx;
local[1] = ly;
}

int arg_idx = 0;
oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrReadOnly(A));
oclk_gemm_float.set(arg_idx++, offA);
oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrReadOnly(B));
oclk_gemm_float.set(arg_idx++, offB);
oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrWriteOnly(C));
oclk_gemm_float.set(arg_idx++, offC);
oclk_gemm_float.set(arg_idx++, M);
oclk_gemm_float.set(arg_idx++, N);
oclk_gemm_float.set(arg_idx++, K);
oclk_gemm_float.set(arg_idx++, (float)alpha);
oclk_gemm_float.set(arg_idx++, (float)beta);

bool ret;
if (TransB == CblasNoTrans || TransA != CblasNoTrans) {
int stride = 256;
for (int start_index = 0; start_index < K; start_index += stride) {
oclk_gemm_float.set(arg_idx, start_index);
ret = oclk_gemm_float.run(2, global, local, false);
}
} else {
ret = oclk_gemm_float.run(2, global, local, false);
}
return ret;
}

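Two details of the buffer GEMM worth noting: global sizes are rounded up to whole subgroups (16-wide for half, 8-wide for float), and the NN path walks K in fixed 256-column chunks, re-launching the kernel with a new start_index so each pass accumulates into C. The rounding idiom, restated standalone:

// Round a thread-grid dimension up to a whole number of workgroups.
static inline size_t roundUpTo(size_t value, size_t multiple)
{
    return (value + multiple - 1) / multiple * multiple;
}
// e.g. N = 1000, dx = 4 -> gx = 250; with lx = 16 (half) global[0] becomes 256
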
template<typename Dtype>
bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB,
const int32_t M, const int32_t N, const int32_t K,

@@ -392,7 +524,8 @@ bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB,
const UMat B_image, UMat C,
const size_t max_image_size)
{
gemm_type_t gemm_type = GEMM_TYPE_FAST_IMAGE_32_1;
bool halfPrecisionMode = (A.depth() == CV_16S);
gemm_type_t gemm_type = halfPrecisionMode ? GEMM_TYPE_FAST_BUFFER : GEMM_TYPE_FAST_IMAGE_32_1;

if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 ||
gemm_type == GEMM_TYPE_FAST_IMAGE_32_2)

@@ -409,6 +542,11 @@ bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB,
GEMM_TYPE_FAST_IMAGE_B_IMAGE,
max_image_size);
}
else if (gemm_type == GEMM_TYPE_FAST_BUFFER)
{
return ocl4dnnFastBufferGEMM<Dtype>(CblasNoTrans, TransB, M, N, K,
1.f, A, 0, B, 0, 0.f, C, 0, gemm_type);
}
return false;
}

@@ -436,10 +574,17 @@ bool ocl4dnnGEMV<float>(const CBLAS_TRANSPOSE TransA,
const int32_t offy)
{
bool ret = false;
bool use_half = (A.depth() == CV_16S);
String opts;
if (use_half)
opts = format("-DDtype=%s -DDtype4=%s -Dconvert_Dtype=convert_%s", "half", "half4", "half");
else
opts = format("-DDtype=%s -DDtype4=%s -Dconvert_Dtype=convert_%s", "float", "float4", "float");

if (TransA == CblasNoTrans)
{
ocl::Kernel k(CL_KERNEL_SELECT("matvec_mul4"), cv::ocl::dnn::matvec_mul_oclsrc);
String kname = format("matvec_mul4_%s", use_half ? "half" : "float");
ocl::Kernel k(kname.c_str(), cv::ocl::dnn::matvec_mul_oclsrc, opts);
if (k.empty())
return false;

@@ -469,7 +614,8 @@ bool ocl4dnnGEMV<float>(const CBLAS_TRANSPOSE TransA,

if ((row_size % 4) != 0 && ret)
{
ocl::Kernel k_1(CL_KERNEL_SELECT("matvec_mul1"), cv::ocl::dnn::matvec_mul_oclsrc);
String kname = format("matvec_mul1_%s", use_half ? "half" : "float");
ocl::Kernel k_1(kname.c_str(), cv::ocl::dnn::matvec_mul_oclsrc, opts);
size_t localsize[] = { 128 };
size_t globalsize[] = { row_size % 4 * localsize[0] };
uint row_offset = row_size - (row_size % 4);

@@ -499,7 +645,15 @@ bool ocl4dnnAXPY(const int32_t N, const Dtype alpha,
const UMat X, const int32_t offX, UMat Y,
const int32_t offY)
{
ocl::Kernel oclk_axpy(CL_KERNEL_SELECT("axpy"), cv::ocl::dnn::math_oclsrc);
bool use_half = (X.depth() == CV_16S);
String opts;
if (use_half)
opts = "-DDtype=half -DDtype4=half4 -Dconvert_Dtype=convert_half";
else
opts = "-DDtype=float -DDtype4=float4 -Dconvert_Dtype=convert_float";

String kname = format("axpy_%s", use_half ? "half" : "float");
ocl::Kernel oclk_axpy(kname.c_str(), cv::ocl::dnn::math_oclsrc, opts);
if (oclk_axpy.empty())
return false;

@@ -54,6 +54,7 @@
#include "opencl_kernels_dnn.hpp"
#include "../include/math_functions.hpp"
#include "../include/default_kernel_config.hpp"
#include "opencv2/dnn/shape_utils.hpp"

#if defined WIN32 || defined _WIN32
#include <windows.h>

@@ -85,6 +86,7 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
max_value_ = 0;
prev_kernel_type_ = -1;
tuned_ = false;
use_half_ = config.use_half;

// assumption: spatial dimension is 2.
kernel_h_ = config.kernel.height;

@@ -204,18 +206,40 @@ void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bo
return;
}

typedef enum {
TYPE_FLOAT = 1,
TYPE_HALF = 2
} ocl4dnnConvSpatialType_t;

template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
{
addDef("Dtype", "float");
addDef("Dtype2", "float2");
addDef("Dtype4", "float4");
addDef("Dtype8", "float8");
addDef("Dtype16", "float16");
addDef("as_Dtype", "as_float");
addDef("as_Dtype2", "as_float2");
addDef("as_Dtype4", "as_float4");
addDef("as_Dtype8", "as_float8");
if (use_half_)
{
addDef("TYPE", TYPE_HALF);
addDef("Dtype", "half");
addDef("Dtype2", "half2");
addDef("Dtype4", "half4");
addDef("Dtype8", "half8");
addDef("Dtype16", "half16");
addDef("as_Dtype", "as_half");
addDef("as_Dtype2", "as_half2");
addDef("as_Dtype4", "as_half4");
addDef("as_Dtype8", "as_half8");
}
else
{
addDef("TYPE", TYPE_FLOAT);
addDef("Dtype", "float");
addDef("Dtype2", "float2");
addDef("Dtype4", "float4");
addDef("Dtype8", "float8");
addDef("Dtype16", "float16");
addDef("as_Dtype", "as_float");
addDef("as_Dtype2", "as_float2");
addDef("as_Dtype4", "as_float4");
addDef("as_Dtype8", "as_float8");
}
}

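collectCommonInformation() now emits the type family as preprocessor defines, so a single .cl source serves both precisions: with use_half_ set, Dtype4 expands to half4 and as_Dtype4 to as_half4, otherwise the float forms are produced. A sketch of the substitution (hypothetical harness around the addDef calls above):

const char* base = use_half ? "half" : "float";
std::vector<std::pair<cv::String, cv::String> > defs;   // what addDef collects
defs.push_back(std::make_pair("Dtype", base));
defs.push_back(std::make_pair("Dtype4", cv::format("%s4", base)));
defs.push_back(std::make_pair("as_Dtype4", cv::format("as_%s4", base)));
// later joined into build options such as "-DDtype=half -DDtype4=half4 ..."
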
typedef enum {
|
||||
@ -477,10 +501,16 @@ bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
|
||||
fused_eltwise_ = false;
|
||||
}
|
||||
|
||||
prepareKernel(bottom, top, weight, bias, numImages);
|
||||
if (use_half_ && bias_half.empty() && !bias.empty())
|
||||
convertFp16((UMat&)bias, bias_half);
|
||||
|
||||
if (use_half_ && weights_half.empty())
|
||||
convertFp16((UMat&)weight, weights_half);
|
||||
|
||||
prepareKernel(bottom, top, weight, (use_half_) ? bias_half : bias, numImages);
|
||||
if (bestKernelConfig.empty())
|
||||
return false;
|
||||
return convolve(bottom, top, weight, bias, numImages, bestKernelConfig);
|
||||
return convolve(bottom, top, weight, (use_half_) ? bias_half : bias, numImages, bestKernelConfig);
|
||||
}
|
||||
|
||||
template<typename Dtype>
|
||||
@ -556,6 +586,12 @@ std::string OCL4DNNConvSpatial<Dtype>::generateSpecificKey(int32_t type, int32_t
|
||||
<< "_" << blockWidth
|
||||
<< "_" << blockHeight
|
||||
<< "_" << blockDepth;
|
||||
|
||||
if (!use_half_)
|
||||
keyBuilder << "_float";
|
||||
else
|
||||
keyBuilder << "_half";
|
||||
|
||||
return keyBuilder.str();
|
||||
}
|
||||
|
||||
@ -637,9 +673,13 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
|
||||
|
||||
if (swizzled_weights_umat.empty())
|
||||
swizzled_weights_umat.create(1, (int)alignSize(num_output_, 16) * channels_ *
|
||||
kernel_h_ * (int)alignSize(kernel_w_, 2), CV_32FC1);
|
||||
kernel_h_ * (int)alignSize(kernel_w_, 2),
|
||||
(use_half_) ? CV_16SC1 : CV_32FC1);
|
||||
|
||||
UMat swizzled_weights_tmp;
|
||||
if (use_half_)
|
||||
swizzled_weights_tmp.create(shape(swizzled_weights_umat), CV_32F);
|
||||
|
||||
ocl::Queue queue = ocl::Queue::getDefault();
|
||||
if (!interleave) {
|
||||
cl_uint argIdx = 0;
|
||||
int32_t channels = channels_ / group_;
|
||||
@ -650,7 +690,10 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
|
||||
return false;
|
||||
|
||||
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
|
||||
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat));
|
||||
if (use_half_)
|
||||
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_tmp));
|
||||
else
|
||||
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat));
|
||||
oclk_copy_weight.set(argIdx++, kernel_w_);
|
||||
oclk_copy_weight.set(argIdx++, kernel_h_);
|
||||
oclk_copy_weight.set(argIdx++, channels);
|
||||
@ -669,7 +712,11 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
|
||||
// assumption: kernel dimesion is 2
|
||||
Mat weightMat = weight.getMat(ACCESS_READ);
|
||||
Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>();
|
||||
Mat swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE);
|
||||
Mat swizzledWeightMat;
|
||||
if (use_half_)
|
||||
swizzledWeightMat = swizzled_weights_tmp.getMat(ACCESS_WRITE);
|
||||
else
|
||||
swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE);
|
||||
Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr<float>();
|
||||
|
||||
int interleavedRows = (kernel_w_ / 2) * 2;
|
||||
@ -694,6 +741,10 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
|
||||
rowAlignment);
|
||||
free(tmpSwizzledWeight);
|
||||
}
|
||||
|
||||
if (use_half_)
|
||||
convertFp16(swizzled_weights_tmp, swizzled_weights_umat);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -727,9 +778,10 @@ void OCL4DNNConvSpatial<float>::CreateSubBuffer(const UMat& buffer, UMat& sub_bu
|
||||
cl_mem sub_mem;
|
||||
cl_buffer_region region;
|
||||
cl_int err;
|
||||
size_t element_size = (use_half_) ? sizeof(short) : sizeof(float);
|
||||
|
||||
region.origin = offset * sizeof(float);
|
||||
region.size = size * sizeof(float);
|
||||
region.origin = offset * element_size;
|
||||
region.size = size * element_size;
|
||||
sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ),
|
||||
write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
|
||||
@ -739,8 +791,9 @@ void OCL4DNNConvSpatial<float>::CreateSubBuffer(const UMat& buffer, UMat& sub_bu
|
||||
return;
|
||||
}
|
||||
|
||||
int step = sizeof(float), rows = size, cols = 1;
|
||||
ocl::convertFromBuffer(sub_mem, step, rows, cols, CV_32FC1, sub_buffer);
|
||||
int step = element_size, rows = size, cols = 1;
|
||||
ocl::convertFromBuffer(sub_mem, step, rows, cols,
|
||||
(use_half_) ? CV_16SC1 : CV_32FC1, sub_buffer);
|
||||
|
||||
//decrease ocl mem refcount
|
||||
clReleaseMemObject(sub_mem);
@ -978,7 +1031,10 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
if (use_half_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
else
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
if (bias_term_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
@ -1018,7 +1074,10 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, image_offset);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
if (use_half_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
else
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
kernel.set(argIdx++, kernel_offset);
if (bias_term_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
@ -1132,14 +1191,27 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
return false;

int32_t sz[4] = {numImages, num_output_, output_h_, output_w_};
top.zeros(4, sz, CV_32FC1);
top.zeros(4, sz, (use_half_) ? CV_16SC1 : CV_32FC1);
bool saved_tuned = tuned_;
tuned_ = false;
convolve(bottom, top, weight, bias, numImages, config);
tuned_ = saved_tuned;

float *data = (float *)top.getMat(ACCESS_READ).ptr<float>();
float *verify_data = (float *)verifyTop.getMat(ACCESS_READ).ptr<float>();
UMat new_top, new_verify_top;
float *data, *verify_data;
if (use_half_)
{
convertFp16(top, new_top);
convertFp16(verifyTop, new_verify_top);

data = (float *)new_top.getMat(ACCESS_READ).ptr<float>();
verify_data = (float *)new_verify_top.getMat(ACCESS_READ).ptr<float>();
}
else
{
data = (float *)top.getMat(ACCESS_READ).ptr<float>();
verify_data = (float *)verifyTop.getMat(ACCESS_READ).ptr<float>();
}

for (int32_t n = 0; n < num_; ++n) {
for (int32_t g = 0; g < group_; ++g) {
@ -1148,9 +1220,19 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
for (int h = 0; h < output_h_ && !verificationFail; h++)
for (int w = 0; w < output_w_; w++) {
size_t offset = output_image_offset + out_ch * output_w_ * output_h_ + h * output_w_ + w;
if (fabs(data[offset] - verify_data[offset]) > 0.1 * fabs(verify_data[offset]) &&
!(fabs(verify_data[offset]) < 1.e-3 &&
fabs(data[offset] - verify_data[offset]) < 1.e-4))

float error_factor = fabs(data[offset] - verify_data[offset]);
if (use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) &&
error_factor > 0.04 && !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4))
{
dbgPrint(printf("test verification failed @ image %d group %d"
"out_ch %d h %d w %d got %G expected %G\n",
n, g, out_ch, h, w, data[offset], verify_data[offset]));
verificationFail = 1;
goto out;
}
else if (!use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) &&
!(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4))
{
dbgPrint(printf("test verification failed @ image %d group %d"
"out_ch %d h %d w %d got %G expected %G\n",
@ -1719,15 +1801,16 @@ void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
if (loadTunedConfig()) // check external storage
return;

UMat benchData(1, numImages * top_dim_, CV_32FC1);
UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1);

calculateBenchmark(bottom, benchData, (use_half_) ? weights_half : weight, bias, numImages);

if (force_auto_tuning_)
{
calculateBenchmark(bottom, benchData, weight, bias, numImages);
setupConvolution(bottom, top, weight, bias, numImages, benchData);
}
else
{
calculateBenchmark(bottom, benchData, weight, bias, numImages);
useFirstAvailable(bottom, top, weight, bias, numImages, benchData);
}
cacheTunedConfig();
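
The verifyResult change is easier to read as a predicate: a value passes if it is within 10% of the reference, if both are effectively zero, or, for fp16 only, if the absolute error stays under 0.04. A sketch of that rule (withinTolerance is an illustrative name, not from the patch):

    #include <cmath>

    static bool withinTolerance(float got, float expected, bool useHalf)
    {
        const float err = std::fabs(got - expected);
        if (std::fabs(expected) < 1e-3f && err < 1e-4f)
            return true;                         // both effectively zero
        if (err <= 0.1f * std::fabs(expected))
            return true;                         // 10% relative tolerance
        return useHalf && err <= 0.04f;          // extra absolute slack for fp16
    }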

@ -56,6 +56,7 @@ OCL4DNNInnerProduct<Dtype>::OCL4DNNInnerProduct(OCL4DNNInnerProductConfig config
K_ = config.K;
phase_test_ = config.phase_test;
image_copied_ = false;
use_half_ = config.use_half;
}

template<typename Dtype>
@ -89,13 +90,24 @@ bool OCL4DNNInnerProduct<Dtype>::Forward(const UMat& bottom,
if (M_ <= max_image_size &&
N_ <= max_image_size &&
K_ <= max_image_size &&
cv::traits::Depth<Dtype>::value == CV_32F &&
ocl::Device::getDefault().intelSubgroupsSupport())
{
ret = ocl4dnnGEMMCommon<Dtype>(transpose_ ? CblasNoTrans : CblasTrans,
M_, N_, K_, bottom, weight, UMat(), top,
max_image_size);
}

if (use_half_ && bias_term_)
{
UMat biasOneMat = UMat::ones(M_, 1, CV_32F);
UMat newbias, tmpTop;

convertFp16(bias, newbias);
convertFp16(top, tmpTop);
cv::gemm(biasOneMat, newbias, 1, tmpTop, 1, tmpTop, 0);
convertFp16(tmpTop, top);
}

return ret;
}
}
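
Since cv::gemm has no fp16 path, the bias here is applied in fp32: both buffers are widened with convertFp16, the 1xN bias row is broadcast over all M rows as a rank-1 GEMM update, and the result is narrowed back. A sketch of the broadcast step alone (variable names are illustrative):

    #include <opencv2/core.hpp>

    // topF32 is M x N, biasF32 is 1 x N; computes topF32 += ones(M,1) * biasF32.
    static void addBiasRows(cv::UMat& topF32, const cv::UMat& biasF32, int M)
    {
        cv::UMat ones = cv::UMat::ones(M, 1, CV_32F);
        cv::gemm(ones, biasF32, 1.0, topF32, 1.0, topF32, 0);
    }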

@ -61,6 +61,7 @@ OCL4DNNLRN<Dtype>::OCL4DNNLRN(OCL4DNNLRNConfig config)
channels_ = config.channels;
height_ = config.height;
width_ = config.width;
use_half_ = config.use_half;
}

template<typename Dtype>
@ -97,8 +98,10 @@ bool OCL4DNNLRN<Dtype>::crossChannelForward(const UMat& bottom, UMat& top)
int32_t n_threads = num_ * height_ * width_;
size_t global_work_size_[1] = {(size_t)n_threads};
String opts = clOptionSupport("-cl-no-subgroup-ifp") ? " -cl-no-subgroup-ifp " : "";
opts += format("-D Dtype=%s", (use_half_) ? "half" : "float");
ocl::Kernel oclk_lrn_fill;
if (!oclk_lrn_fill.create(CL_KERNEL_SELECT("lrn_full_no_scale"), ocl::dnn::ocl4dnn_lrn_oclsrc, opts))
String kname = format("lrn_full_no_scale_%s", (use_half_) ? "half" : "float");
if (!oclk_lrn_fill.create(kname.c_str(), ocl::dnn::ocl4dnn_lrn_oclsrc, opts))
return false;

oclk_lrn_fill.set(argIdx++, n_threads);
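
This is the recurring host-side pattern across these hunks: CL_KERNEL_SELECT goes away, the kernel name gains an explicit _float/_half suffix, and the element type is injected through a -D build option so one .cl source serves both precisions. A hedged sketch (function and source names are illustrative):

    #include <opencv2/core/ocl.hpp>

    static cv::ocl::Kernel makeTypedKernel(const cv::String& clSource, bool useHalf)
    {
        const char* t = useHalf ? "half" : "float";
        cv::String opts  = cv::format("-D Dtype=%s", t);          // type for the kernel body
        cv::String kname = cv::format("lrn_full_no_scale_%s", t); // TEMPLATE()-mangled name
        return cv::ocl::Kernel(kname.c_str(), cv::ocl::ProgramSource(clSource), opts);
    }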

@ -56,6 +56,7 @@ OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
channels_ = config.channels;
pool_method_ = config.pool_method;
avePoolPaddedArea = config.avePoolPaddedArea;
use_half = config.use_half;

for (int i = 0; i < spatial_dims; ++i)
{
@ -105,12 +106,15 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
case LIBDNN_POOLING_METHOD_MAX:
{
bool haveMask = !top_mask.empty();
String kname = haveMask ? "max_pool_forward_mask" : "max_pool_forward";
kname += (use_half) ? "_half" : "_float";
ocl::Kernel oclk_max_pool_forward(
haveMask ? CL_KERNEL_SELECT("max_pool_forward_mask") : CL_KERNEL_SELECT("max_pool_forward"),
kname.c_str(),
ocl::dnn::ocl4dnn_pooling_oclsrc,
format("-D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
format(" -D Dtype=%s -D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
" -D STRIDE_W=%d -D STRIDE_H=%d"
" -D PAD_W=%d -D PAD_H=%d%s",
(use_half) ? "half" : "float",
kernel_w_, kernel_h_,
stride_w_, stride_h_,
pad_w_, pad_h_,
@ -139,11 +143,14 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
{
CV_Assert(top_mask.empty());

ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"),
String kname = format("ave_pool_forward_%s", (use_half) ? "half" : "float");
ocl::Kernel oclk_ave_pool_forward(
kname.c_str(),
ocl::dnn::ocl4dnn_pooling_oclsrc,
format("-D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
format(" -D Dtype=%s -D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
" -D STRIDE_W=%d -D STRIDE_H=%d"
" -D PAD_W=%d -D PAD_H=%d%s",
(use_half) ? "half" : "float",
kernel_w_, kernel_h_,
stride_w_, stride_h_,
pad_w_, pad_h_,
@ -171,7 +178,9 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
{
CV_Assert(top_mask.empty());

ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"),
String kname = format("sto_pool_forward_test_%s", (use_half) ? "half" : "float");
ocl::Kernel oclk_sto_pool_forward(
kname.c_str(),
ocl::dnn::ocl4dnn_pooling_oclsrc,
format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
" -D STRIDE_W=%d -D STRIDE_H=%d",

@ -52,6 +52,7 @@ OCL4DNNSoftmax<Dtype>::OCL4DNNSoftmax(OCL4DNNSoftmaxConfig config)
softmax_axis_ = config.axis;
channels_ = config.channels;
log_softmax_ = config.logsoftmax;
use_half_ = config.use_half;

inner_num_ = 1;
outer_num_ = 1;
@ -91,10 +92,13 @@ bool OCL4DNNSoftmax<Dtype>::Forward(const UMat& bottom, UMat& top)

if (log_softmax_) opts += " -DLOG_SOFTMAX ";
if (use_slm_)
kname = CL_KERNEL_SELECT("softmax_forward_slm");
kname = "softmax_forward_slm";
else
kname = CL_KERNEL_SELECT("softmax_forward");
kname = "softmax_forward";

kname += format("%s", (use_half_) ? "_half" : "_float");
opts += format(" -D Dtype=%s -D DTYPE_MAX=%s", (use_half_) ? "half" : "float",
(use_half_) ? "HALF_MAX" : "FLT_MAX");
if (!oclk_softmax_forward_kernel.create(kname.c_str(), ocl::dnn::softmax_loss_oclsrc, opts))
return false;

@ -40,9 +40,17 @@
//
//M*/

#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define KERNEL_ARG_DTYPE float

#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

__kernel void ReLUForward(const int count, __global const T* in, __global T* out
#ifndef RELU_NO_SLOPE
, T negative_slope
, KERNEL_ARG_DTYPE negative_slope
#endif
) {
int index = get_global_id(0);
@ -55,18 +63,19 @@ __kernel void ReLUForward(const int count, __global const T* in, __global T* out
}

__kernel void ReLU6Forward(const int count, __global const T* in, __global T* out,
const T minValue, const T maxValue)
const KERNEL_ARG_DTYPE minValue, const KERNEL_ARG_DTYPE maxValue)
{
int index = get_global_id(0);
if(index < count)
{
T x = in[index];
out[index] = clamp(x, minValue, maxValue);
out[index] = clamp(x, convert_T(minValue), convert_T(maxValue));
}
}

__kernel void PReLUForward(const int count, const int channels, const int plane_size,
__global const T* in, __global T* out, __global const T* slope_data)
__global const T* in, __global T* out,
__global const KERNEL_ARG_DTYPE* slope_data)
{
int index = get_global_id(0);
int c = (index / plane_size) % channels;
@ -99,8 +108,22 @@ __kernel void AbsValForward(const int n, __global const T* in, __global T* out)
out[index] = fabs(in[index]);
}

__kernel void PowForward(const int n, __global const T* in, __global T* out, const T power, const T scale, const T shift) {
__kernel void PowForward(const int n, __global const T* in, __global T* out,
const KERNEL_ARG_DTYPE power,
const KERNEL_ARG_DTYPE scale,
const KERNEL_ARG_DTYPE shift)
{
int index = get_global_id(0);
if (index < n)
out[index] = pow(shift + scale * in[index], power);
}

__kernel void ELUForward(const int n, __global const T* in, __global T* out)
{
int index = get_global_id(0);
if (index < n)
{
T src = in[index];
out[index] = (src >= 0.f) ? src : exp(src) - 1;
}
}
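
Note what changed in these kernel signatures: only the tensor pointers stay typed T, while scalar parameters (slopes, clamp bounds, pow coefficients) become KERNEL_ARG_DTYPE, that is, always float, and are narrowed in-kernel with convert_T. This keeps a single host-side argument ABI for both precisions. A host-side sketch of setting such arguments (illustrative wrapper, not from the patch):

    #include <opencv2/core/ocl.hpp>

    // 'in' and 'out' may hold half data when built with -D Dtype=half;
    // the slope is still passed as a plain float (KERNEL_ARG_DTYPE).
    static void setReluArgs(cv::ocl::Kernel& k, int count,
                            const cv::UMat& in, cv::UMat& out, float negativeSlope)
    {
        int i = 0;
        i = k.set(i, count);
        i = k.set(i, cv::ocl::KernelArg::PtrReadOnly(in));
        i = k.set(i, cv::ocl::KernelArg::PtrWriteOnly(out));
        k.set(i, negativeSlope); // scalar stays fp32 regardless of tensor dtype
    }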

@ -40,24 +40,27 @@
//
//M*/

#define Dtype float
#define Dtype4 float4
#define Dtype8 float8
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#if NUM == 8
#define load(src, index) vload8(0, src + index)
#define store(vec, dst, index) vstore8(vec, 0, dst + index)
#define vec_type Dtype8
#define float_type float8
#define convert_f convert_float8
#define BATCH_NORM batch_norm8
#elif NUM == 4
#define load(src, index) vload4(0, src + index)
#define store(vec, dst, index) vstore4(vec, 0, dst + index)
#define vec_type Dtype4
#define float_type float4
#define convert_f convert_float4
#define BATCH_NORM batch_norm4
#elif NUM == 1
#define load(src, index) src[index]
#define store(vec, dst, index) dst[index] = vec
#define vec_type Dtype
#define float_type float
#define convert_f convert_float
#define BATCH_NORM batch_norm1
#endif

@ -65,8 +68,8 @@ __kernel void BATCH_NORM(__global const Dtype* src,
const int rows,
const int cols,
const int channels,
__global const Dtype* weight,
__global const Dtype* bias,
__global const float* weight,
__global const float* bias,
__global Dtype* dst)
{
int x = get_global_id(0);
@ -76,9 +79,9 @@ __kernel void BATCH_NORM(__global const Dtype* src,
if (x >= rows || y >= cols)
return;

Dtype w = weight[x % channels];
Dtype b = bias[x % channels];
vec_type src_vec = load(src, index);
vec_type dst_vec = src_vec * w + (vec_type)b;
store(dst_vec, dst, index);
float w = weight[x % channels];
float b = bias[x % channels];
float_type src_vec = convert_f(load(src, index));
float_type dst_vec = src_vec * w + (float_type)b;
store(convert_T(dst_vec), dst, index);
}
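
The batch-norm kernel now widens each loaded vector to float, applies scale and shift in fp32, and narrows only on the store, which keeps half-precision rounding out of the multiply-add. A scalar C++ analogue of the same idea, assuming cv::float16_t is available (it is in recent OpenCV; any fp16 wrapper type would do):

    #include <opencv2/core.hpp>

    static void batchNormHalf(const cv::float16_t* src, cv::float16_t* dst,
                              int n, float w, float b)
    {
        for (int i = 0; i < n; i++)
        {
            float x = (float)src[i];           // widen: convert_f(load(...))
            dst[i] = cv::float16_t(x * w + b); // narrow: store(convert_T(...))
        }
    }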

@ -39,22 +39,29 @@
//
//M*/

__kernel void concat(const int nthreads,
__global const Dtype* in_data,
const int num_concats,
const int concat_size,
const int top_concat_axis,
const int bottom_concat_axis,
const int offset_concat_axis,
__global Dtype* out_data) {
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

for (int index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int total_concat_size = concat_size * bottom_concat_axis;
const int concat_num = index / total_concat_size;
const int concat_index = index % total_concat_size;
const int top_index = concat_index
+ (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
out_data[top_index] = in_data[index];
}
#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)

__kernel void TEMPLATE(concat, Dtype)(const int nthreads,
__global const Dtype* in_data,
const int num_concats,
const int concat_size,
const int top_concat_axis,
const int bottom_concat_axis,
const int offset_concat_axis,
__global Dtype* out_data)
{
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
{
const int total_concat_size = concat_size * bottom_concat_axis;
const int concat_num = index / total_concat_size;
const int concat_index = index % total_concat_size;
const int top_index = concat_index +
(concat_num * top_concat_axis + offset_concat_axis) * concat_size;
out_data[top_index] = in_data[index];
}
}
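
The two-level CONCAT/TEMPLATE macro is what produces the _float/_half kernel names the host now asks for: the indirection forces Dtype to be macro-expanded before the tokens are pasted. The same token-pasting trick in plain C/C++:

    #define CONCAT(A,B) A##_##B
    #define TEMPLATE(name,type) CONCAT(name,type)

    #define Dtype float
    void TEMPLATE(concat, Dtype)(void); // expands to: void concat_float(void);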

@ -40,27 +40,29 @@
//
//M*/

#if APPLY_BIAS
#define BIAS_KERNEL_ARG __global Dtype * biases_base,
#else
#define BIAS_KERNEL_ARG
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#define KERNEL_ARG_DTYPE float
#define TYPE_FLOAT 1
#define TYPE_HALF 2

#if defined(FUSED_CONV_RELU)
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (Dtype)(negative_slope)))
#define FUSED_ARG Dtype negative_slope,
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope)))
#define FUSED_ARG KERNEL_ARG_DTYPE negative_slope,
#elif defined(FUSED_CONV_PRELU)
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (Dtype)(negative_slope[c])))
#define FUSED_ARG __global const Dtype *negative_slope,
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope[c])))
#define FUSED_ARG __global const KERNEL_ARG_DTYPE* negative_slope,
#elif defined(FUSED_CONV_POWER)
#define ACTIVATION_RELU_FUNCTION(x, c) pow(x, power)
#define FUSED_ARG Dtype power,
#define ACTIVATION_RELU_FUNCTION(x, c) pow(x, (Dtype)power)
#define FUSED_ARG KERNEL_ARG_DTYPE power,
#elif defined(FUSED_CONV_TANH)
#define ACTIVATION_RELU_FUNCTION(x, c) tanh(x)
#define FUSED_ARG
#elif defined(FUSED_CONV_RELU6)
#define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), min_value, max_value))
#define FUSED_ARG Dtype min_value, Dtype max_value,
#define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), (Dtype)min_value, (Dtype)max_value))
#define FUSED_ARG KERNEL_ARG_DTYPE min_value, KERNEL_ARG_DTYPE max_value,
#else
#define ACTIVATION_RELU_FUNCTION(x, c) (x)
#define FUSED_ARG
@ -74,6 +76,11 @@
#define ELTWISE_DATA_ARG
#endif

#if APPLY_BIAS
#define BIAS_KERNEL_ARG __global Dtype * biases_base,
#else
#define BIAS_KERNEL_ARG
#endif

#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)
@ -97,6 +104,16 @@
#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))

#if defined(convolve_simd) || defined(Conv_Interleaved)
#if TYPE == TYPE_HALF
#define INT_TYPE ushort
#define INT_TYPE2 ushort2
#define INT_TYPE4 ushort4
#define INT_TYPE8 ushort8
#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read_us2
#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read_us4
#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read_us8
#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read_us
#else
#define INT_TYPE uint
#define INT_TYPE2 uint2
#define INT_TYPE4 uint4
@ -106,6 +123,7 @@
#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read8
#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read
#endif
#endif

#ifdef KERNEL_BASIC

@ -418,6 +436,25 @@ typedef struct float15 { float s0; float s1; float s2; float s3; float s4; float
float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15;
typedef struct float0 { float s0; } float0; //never used but makes compiler happy.

typedef struct half1 { half s0; } half1;
typedef struct half5 { half s0; half s1; half s2; half s3; half s4; } half5;
typedef struct half6 { half s0; half s1; half s2; half s3; half s4; half s5; } half6;
typedef struct half7 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; } half7;
typedef struct half9 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; } half9;
typedef struct half10 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; } half10;
typedef struct half11 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; } half11;
typedef struct half12 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; half sb; } half12;
typedef struct half13 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; half sb; half sc; } half13;
typedef struct half14 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; } half14;
typedef struct half15 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; half se; } half15;
typedef struct half0 { half s0; } half0; //never used but makes compiler happy.

#define OUT_PITCH_X output_width
#define ROW_PITCH input_width

@ -40,9 +40,9 @@
//
//M*/

#define Dtype float
#define Dtype4 float4
#define Dtype8 float8
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

__kernel void op_sum4(__global const Dtype * A,
__global const Dtype * B,
@ -73,20 +73,20 @@ __kernel void op_sum4(__global const Dtype * A,
a2 = vload4(i, src0_read + 2 * A_col_size);
a3 = vload4(i, src0_read + 3 * A_col_size);

dot0 = a0 * coeff1 + b0 * coeff2;
dot1 = a1 * coeff1 + b1 * coeff2;
dot2 = a2 * coeff1 + b2 * coeff2;
dot3 = a3 * coeff1 + b3 * coeff2;
dot0 = a0 * (Dtype4)coeff1 + b0 * (Dtype4)coeff2;
dot1 = a1 * (Dtype4)coeff1 + b1 * (Dtype4)coeff2;
dot2 = a2 * (Dtype4)coeff1 + b2 * (Dtype4)coeff2;
dot3 = a3 * (Dtype4)coeff1 + b3 * (Dtype4)coeff2;
#else
a0 = vload4(i, dst0_read);
a1 = vload4(i, dst0_read + A_col_size);
a2 = vload4(i, dst0_read + 2 * A_col_size);
a3 = vload4(i, dst0_read + 3 * A_col_size);

dot0 = a0 + b0 * coeff2;
dot1 = a1 + b1 * coeff2;
dot2 = a2 + b2 * coeff2;
dot3 = a3 + b3 * coeff2;
dot0 = a0 + b0 * (Dtype4)coeff2;
dot1 = a1 + b1 * (Dtype4)coeff2;
dot2 = a2 + b2 * (Dtype4)coeff2;
dot3 = a3 + b3 * (Dtype4)coeff2;
#endif
vstore4(dot0, i, dst0_read);
vstore4(dot1, i, dst0_read + A_col_size);

1342
modules/dnn/src/opencl/gemm_buffer.cl
Normal file
@ -39,24 +39,42 @@
//
//M*/

#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)

// Types used for parameters, offset computations and so on
#define int_tp int
#define uint_tp unsigned int
#define KERNEL_ARG_DTYPE float
#define TYPE_FLOAT 1
#define TYPE_HALF 2

#if TYPE == TYPE_HALF
#define Dtype half
#define Dtype2 half2
#define Dtype4 half4
#define Dtype8 half8
#define Dtype16 half16

#define as_Dtype as_half
#define as_Dtype2 as_half2
#define as_Dtype4 as_half4
#define as_Dtype8 as_half8
#define as_Dtype16 as_half16
#else
#define Dtype float
#define Dtype2 float2
#define Dtype4 float4
#define Dtype8 float8
#define Dtype16 float16

#define as_Dtype as_float
#define as_Dtype2 as_float2
#define as_Dtype4 as_float4
#define as_Dtype8 as_float8

#define KERNEL_ARG_DTYPE float
#define as_Dtype16 as_float16
#endif

#if defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
@ -67,6 +85,15 @@

// common block to calculate (alpha * AxB + beta * C) and output to destination image.

#if TYPE == TYPE_HALF
#define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read_us8( __image, __coord )
#define SHUFFLE_TYPE2(val) as_ushort2(val)
#define SHUFFLE_TYPE8(val) as_ushort8(val)
#define READ_IMAGE(__image, __coord) read_imageh(__image, sampler, __coord)
#define SIZE_OF_ELEMENT sizeof(ushort)
#define SIMD_SIZE_GEMM 16
#define TILE_N 16
#else
#define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read8( __image, __coord )
#define SHUFFLE_TYPE2(val) val
#define SHUFFLE_TYPE8(val) val
@ -74,11 +101,17 @@
#define SIZE_OF_ELEMENT sizeof(uint)
#define SIMD_SIZE_GEMM 8
#define TILE_N 8
#endif

//#define USE_IMAGE_C
#ifdef USE_IMAGE_C
#if TYPE == TYPE_HALF
#define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read_us8( _C, _coordC ) )
#define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write_us8( _C, _coordC, as_ushort8( _val ) )
#else
#define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read8( _C, _coordC ) )
#define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write8( _C, _coordC, as_uint8( _val ) )
#endif
#define MATC_PARAMETER __read_only image2d_t C, __write_only image2d_t dst
#define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, C, dst, sizeof(uint))
#else
@ -139,10 +172,10 @@
blockC03 += blockAxB03; \
} \
} else { \
blockC00 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
blockC01 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
blockC02 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
blockC03 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); \
blockC00 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
blockC01 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
blockC02 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
blockC03 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); \
if (!ALPHA1) { \
blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \
blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \
@ -172,6 +205,43 @@
intel_sub_group_shuffle( _block.s7, _col ) );

// A's column block multiply B's row block.
#if TYPE == TYPE_HALF
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB00, _blockB01 ) \
{ \
const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \
const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \
const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \
const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \
const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \
const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \
const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \
const Dtype8 acol8 = TRANSPOSE_BLOCK_8( _blockA, 8 ); \
const Dtype8 acol9 = TRANSPOSE_BLOCK_8( _blockA, 9 ); \
const Dtype8 acola = TRANSPOSE_BLOCK_8( _blockA, 10 ); \
const Dtype8 acolb = TRANSPOSE_BLOCK_8( _blockA, 11 ); \
const Dtype8 acolc = TRANSPOSE_BLOCK_8( _blockA, 12 ); \
const Dtype8 acold = TRANSPOSE_BLOCK_8( _blockA, 13 ); \
const Dtype8 acole = TRANSPOSE_BLOCK_8( _blockA, 14 ); \
const Dtype8 acolf = TRANSPOSE_BLOCK_8( _blockA, 15 ); \
_result = mad( (Dtype8)(_blockB00.s0), acol0, _result ); \
_result = mad( (Dtype8)(_blockB00.s1), acol1, _result ); \
_result = mad( (Dtype8)(_blockB00.s2), acol2, _result ); \
_result = mad( (Dtype8)(_blockB00.s3), acol3, _result ); \
_result = mad( (Dtype8)(_blockB00.s4), acol4, _result ); \
_result = mad( (Dtype8)(_blockB00.s5), acol5, _result ); \
_result = mad( (Dtype8)(_blockB00.s6), acol6, _result ); \
_result = mad( (Dtype8)(_blockB00.s7), acol7, _result ); \
_result = mad( (Dtype8)(_blockB01.s0), acol8, _result ); \
_result = mad( (Dtype8)(_blockB01.s1), acol9, _result ); \
_result = mad( (Dtype8)(_blockB01.s2), acola, _result ); \
_result = mad( (Dtype8)(_blockB01.s3), acolb, _result ); \
_result = mad( (Dtype8)(_blockB01.s4), acolc, _result ); \
_result = mad( (Dtype8)(_blockB01.s5), acold, _result ); \
_result = mad( (Dtype8)(_blockB01.s6), acole, _result ); \
_result = mad( (Dtype8)(_blockB01.s7), acolf, _result ); \
}
#else
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \
{ \
const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
@ -191,7 +261,50 @@
_result = mad( (Dtype8)(_blockB.s6), acol6, _result ); \
_result = mad( (Dtype8)(_blockB.s7), acol7, _result ); \
}
#endif

#if TYPE == TYPE_HALF
#define GEMM_NN(ALPHA1, BETA_NOT0) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
__kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
__read_only image2d_t A, \
__read_only image2d_t B, \
MATC_PARAMETER, \
KERNEL_ARG_DTYPE alpha_in, \
KERNEL_ARG_DTYPE beta_in, \
int width0, \
int isFirstColBlock) \
{ \
const Dtype alpha = (Dtype)alpha_in; \
const Dtype beta = (Dtype)beta_in; \
const int group_x = get_group_id(0); \
const int group_y = get_group_id(1); \
Dtype8 blockAxB00 = 0; \
Dtype8 blockAxB01 = 0; \
Dtype8 blockAxB02 = 0; \
Dtype8 blockAxB03 = 0; \
int2 coordA = (int2)( 0, group_y * TILE_M ); \
int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 ); \
do \
{ \
int2 coordBTemp = coordB; \
Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K; \
Dtype8 blockB01 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K; \
int2 coordATemp = coordA; \
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT * 2; \
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, blockB01 ); \
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00, blockB01 ); \
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00, blockB01 ); \
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00, blockB01 ); \
} \
while( coordB.y < width0 ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#else
#define GEMM_NN(ALPHA1, BETA_NOT0) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
@ -231,6 +344,7 @@ __kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
while( coordB.y < width0 ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#endif

GEMM_NN(1, 0) // ALPHA == 1, BETA == 0
GEMM_NN(1, 1) // ALPHA == 1, BETA != 0
@ -264,6 +378,45 @@ GEMM_NN(0, 1) // ALPHA != 1, BETA != 0
_result = mad( (Dtype8)(_blockB.s7), TRANSPOSE_BLOCK_8(_blockA.s7, _col), _result ); \
}

#if TYPE == TYPE_HALF
#define GEMM_TN(ALPHA1, BETA_NOT0) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
__kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
__read_only image2d_t A, \
__read_only image2d_t B, \
MATC_PARAMETER, \
KERNEL_ARG_DTYPE alpha_in, \
KERNEL_ARG_DTYPE beta_in, \
int width0, \
int isFirstColBlock) \
{ \
const Dtype alpha = (Dtype)alpha_in; \
const Dtype beta = (Dtype)beta_in; \
const int group_x = get_group_id(0);\
const int group_y = get_group_id(1);\
Dtype8 blockAxB00 = 0;\
Dtype8 blockAxB01 = 0;\
Dtype8 blockAxB02 = 0;\
Dtype8 blockAxB03 = 0;\
int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 );\
int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 );\
do\
{\
int2 coordBTemp = coordB;\
Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K;\
int2 coordATemp = coordA;\
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 16 * SIZE_OF_ELEMENT;\
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K;\
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0); \
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA00, blockB00, 8); \
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA01, blockB00, 0); \
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA01, blockB00, 8); \
} \
while( coordB.y < width0 ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#else
#define GEMM_TN(ALPHA1, BETA_NOT0) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
@ -303,6 +456,7 @@ __kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
while( coordB.y < width0 ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#endif

GEMM_TN(1, 0) // ALPHA == 1, BETA == 0
GEMM_TN(1, 1) // ALPHA == 1, BETA != 0
@ -324,6 +478,43 @@ GEMM_TN(0, 1) // ALPHA != 1, BETA != 0
intel_sub_group_shuffle( _block.s6, _col), \
intel_sub_group_shuffle( _block.s7, _col) )

#if TYPE == TYPE_HALF
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \
{ \
const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \
const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \
const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \
const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \
const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \
const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \
const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \
const Dtype8 acol8 = TRANSPOSE_BLOCK_8( _blockA, 8 ); \
const Dtype8 acol9 = TRANSPOSE_BLOCK_8( _blockA, 9 ); \
const Dtype8 acola = TRANSPOSE_BLOCK_8( _blockA, 10 ); \
const Dtype8 acolb = TRANSPOSE_BLOCK_8( _blockA, 11 ); \
const Dtype8 acolc = TRANSPOSE_BLOCK_8( _blockA, 12 ); \
const Dtype8 acold = TRANSPOSE_BLOCK_8( _blockA, 13 ); \
const Dtype8 acole = TRANSPOSE_BLOCK_8( _blockA, 14 ); \
const Dtype8 acolf = TRANSPOSE_BLOCK_8( _blockA, 15 ); \
_result = mad( (Dtype8)_blockB.s0, acol0, _result ); \
_result = mad( (Dtype8)_blockB.s1, acol1, _result ); \
_result = mad( (Dtype8)_blockB.s2, acol2, _result ); \
_result = mad( (Dtype8)_blockB.s3, acol3, _result ); \
_result = mad( (Dtype8)_blockB.s4, acol4, _result ); \
_result = mad( (Dtype8)_blockB.s5, acol5, _result ); \
_result = mad( (Dtype8)_blockB.s6, acol6, _result ); \
_result = mad( (Dtype8)_blockB.s7, acol7, _result ); \
_result = mad( (Dtype8)_blockB.s8, acol8, _result ); \
_result = mad( (Dtype8)_blockB.s9, acol9, _result ); \
_result = mad( (Dtype8)_blockB.sa, acola, _result ); \
_result = mad( (Dtype8)_blockB.sb, acolb, _result ); \
_result = mad( (Dtype8)_blockB.sc, acolc, _result ); \
_result = mad( (Dtype8)_blockB.sd, acold, _result ); \
_result = mad( (Dtype8)_blockB.se, acole, _result ); \
_result = mad( (Dtype8)_blockB.sf, acolf, _result ); \
}
#else
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \
{ \
const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
@ -343,7 +534,51 @@ GEMM_TN(0, 1) // ALPHA != 1, BETA != 0
_result = mad( (Dtype8)_blockB.s6, acol6, _result ); \
_result = mad( (Dtype8)_blockB.s7, acol7, _result ); \
}
#endif

#if TYPE == TYPE_HALF
#define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
__kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
__read_only image2d_t A, \
MATB_PARAMETER, \
MATC_PARAMETER, \
KERNEL_ARG_DTYPE alpha_in, \
KERNEL_ARG_DTYPE beta_in, \
int padded_k, \
int k, \
int isFirstColBlock) \
{ \
const Dtype alpha = (Dtype)alpha_in; \
const Dtype beta = (Dtype)beta_in; \
const int group_x = get_group_id(0); \
const int group_y = get_group_id(1); \
Dtype8 blockAxB00 = 0; \
Dtype8 blockAxB01 = 0; \
Dtype8 blockAxB02 = 0; \
Dtype8 blockAxB03 = 0; \
int2 coordA = (int2)( 0, group_y * TILE_M ); \
int2 coordB = (int2)( 0, ( group_x * TILE_N )); \
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \
do \
{ \
Dtype16 blockB00; \
BLOCKB_READ8(blockB00, B, coordB); \
int2 coordATemp = coordA; \
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT * 2; \
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \
} \
while( coordB.x < padded_k / VECSIZE ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#else
#define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
@ -385,12 +620,23 @@ __kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0,Dt
while( coordB.x < padded_k / VECSIZE ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#endif

#if TYPE == TYPE_HALF
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
_blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s89ab = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.scdef = READ_IMAGE(_B, _coordBTemp); _coordB.x += 4;
#else
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
_blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2;
#endif

#define MATB_PARAMETER __read_only image2d_t B

@ -401,12 +647,21 @@ GEMM_NT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0
#undef BLOCKB_READ8
#undef MATB_PARAMETER

#if TYPE == TYPE_HALF
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
const __global float *B_read = (__global float *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \
_blockb = as_Dtype16(as_ushort16(vload8(0, B_read))); \
_coordB.x += TILE_K * 2;
#else
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \
_blockb = vload8(0, B_read); \
_coordB.x += TILE_K;
#endif

#define MATB_PARAMETER __global Dtype *B, int offB, int ldb

@ -417,6 +672,45 @@ GEMM_NT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0
#undef BLOCKB_READ8
#undef MATB_PARAMETER

#if TYPE == TYPE_HALF
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
Dtype4 temp; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s0 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s1 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s2 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s3 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s4 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s5 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s6 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s7 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s8 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s9 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.sa = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.sb = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.sc = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.sd = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.se = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.sf = temp.s0; \
_coordB.x += 16;
#else
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
@ -438,6 +732,7 @@ GEMM_NT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s7 = temp.s0; \
_coordB.x += 8;
#endif

#define MATB_PARAMETER __read_only image2d_t B

@ -483,6 +778,47 @@ GEMM_NT(0, 1, SCALAR, 1) // ALPHA != 1, BETA != 0
_result = mad( (Dtype8)_blockB.s7, acol7, _result ); \
}

#if TYPE == TYPE_HALF
#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
__kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
__read_only image2d_t A, \
MATB_PARAMETER, \
MATC_PARAMETER, \
KERNEL_ARG_DTYPE alpha_in, \
KERNEL_ARG_DTYPE beta_in, \
int padded_k, \
int k, \
int isFirstColBlock) \
{ \
const Dtype alpha = (Dtype)alpha_in; \
const Dtype beta = (Dtype)beta_in; \
const int group_x = get_group_id(0); \
const int group_y = get_group_id(1); \
Dtype8 blockAxB00 = 0; \
Dtype8 blockAxB01 = 0; \
Dtype8 blockAxB02 = 0; \
Dtype8 blockAxB03 = 0; \
int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 ); \
int2 coordB = (int2)( 0, ( group_x * TILE_N )); \
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \
do \
{ \
Dtype8 blockB00; \
BLOCKB_READ8(blockB00, B, coordB); \
int2 coordATemp = coordA; \
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 16 * SIZE_OF_ELEMENT;\
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K;\
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0); \
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA00, blockB00, 8); \
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA01, blockB00, 0); \
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA01, blockB00, 8); \
} \
while( coordB.x < padded_k / VECSIZE ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0);\
}
#else
#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
@ -524,6 +860,7 @@ __kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, D
while( coordB.x < padded_k / VECSIZE ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0);\
}
#endif

#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
@ -540,12 +877,21 @@ GEMM_TT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0
#undef BLOCKB_READ8
#undef MATB_PARAMETER

#if TYPE == TYPE_HALF
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
const __global float *B_read = (__global float *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \
_blockb = as_Dtype8(as_ushort8(vload4(0, B_read))); \
_coordB.x += TILE_K;
#else
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \
_blockb = vload8(0, B_read); \
_coordB.x += TILE_K;
#endif

#define MATB_PARAMETER __global Dtype *B, int offB, int ldb

@ -598,7 +944,7 @@ GEMM_TT(0, 1, SCALAR, 1) // ALPHA != 1, BETA != 0
#undef READ_IMAGE
#undef SIZE_OF_ELEMENT

__kernel void TEMPLATE(gemm_buffer_copy_image_transpose,Dtype)(
__kernel void TEMPLATE(gemm_buffer_copy_image_transpose, Dtype)(
__global Dtype* A,
__write_only image2d_t ImA,
int offA,
@ -611,10 +957,14 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_transpose,Dtype)(
int2 coord_dst = (int2)(gidx, gidy);
__global Dtype* A_off = A + offA;
Dtype srcA = A_off[gidy * ldA + gidx];
#if TYPE == TYPE_HALF
write_imageh(ImA, coord_dst, (Dtype4)srcA);
#else
write_imagef(ImA, coord_dst, (Dtype4)srcA);
#endif
}

__kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose,Dtype)(
__kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)(
__global Dtype* A,
__write_only image2d_t ImA,
int offA,
@ -625,6 +975,14 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose,Dtype)(
const int gidx = get_global_id(0);
const int gidy = get_global_id(1);
int2 coord_dst = (int2)(gidx, gidy);
#if TYPE == TYPE_HALF
if (gidx >= width || gidy >= height) {
write_imageh(ImA, coord_dst, 0);
return;
}
__global Dtype* A_off = A + offA;
write_imageh(ImA, coord_dst, A_off[gidy * ldA + gidx]);
#else
if (gidx >= width || gidy >= height) {
write_imageui(ImA, coord_dst, (uint4)0);
return;
@ -632,4 +990,5 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose,Dtype)(
__global Dtype* A_off = A + offA;
uint4 srcA = convert_uint4(as_uchar4(A_off[gidy * ldA + gidx]));
write_imageui(ImA, coord_dst, srcA);
#endif
}
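
Everything in this new file is compiled twice, once per precision: the host selects TYPE_HALF or TYPE_FLOAT through a build option and asks for the matching mangled kernel name. A hedged sketch of that dispatch (names and structure are illustrative; the real callers live in the ocl4dnn GEMM helpers, which are not part of this hunk):

    #include <opencv2/core/ocl.hpp>

    static cv::ocl::Kernel makeGemmNN(const cv::String& gemmBufferSource,
                                      bool useHalf, bool alpha1, bool betaNot0)
    {
        // TYPE selects the half/float macro blocks inside gemm_buffer.cl.
        cv::String opts = cv::format("-D TYPE=%d", useHalf ? 2 : 1); // TYPE_HALF : TYPE_FLOAT
        cv::String kname = cv::format("gemm_32_1_NN_%d_%d_%s",
                                      alpha1 ? 1 : 0, betaNot0 ? 1 : 0,
                                      useHalf ? "half" : "float");
        return cv::ocl::Kernel(kname.c_str(), cv::ocl::ProgramSource(gemmBufferSource), opts);
    }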

@ -40,16 +40,20 @@
//
//M*/

#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float
#define KERNEL_ARG_DTYPE float

__kernel void TEMPLATE(axpy,Dtype)(const int n, const Dtype alpha, __global const Dtype* x,
__kernel void TEMPLATE(axpy,Dtype)(const int n, const KERNEL_ARG_DTYPE alpha, __global const Dtype* x,
const int offx, __global Dtype* y,
const int offy) {
for (int index = get_global_id(0); index < n; index += get_global_size(0)) {
Dtype src = x[offx + index];
Dtype dst = y[offy + index];
y[offy + index] = alpha * src + dst;
y[offy + index] = convert_Dtype(alpha) * src + dst;
}
}

@ -39,41 +39,45 @@
//
//M*/

#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float
#define KERNEL_ARG_DTYPE float

__kernel void TEMPLATE(matvec_mul4,Dtype)(
__global const float * A,
__global const Dtype * A,
int offA,
unsigned int A_col_size,
unsigned int trail_item,
__global const float * v,
__global const Dtype * v,
int offv,
float alpha,
float beta,
__global float4 * result,
KERNEL_ARG_DTYPE alpha,
KERNEL_ARG_DTYPE beta,
__global Dtype4* result,
int offr,
__local float4 * work)
__local Dtype4* work)
{
unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0);
const __global float *src0_read = A + row_gid * 4 * A_col_size + offA;
const __global float *src1_read = v + offv;
result = (__global float4*)((__global float*)result + offr);
float4 dot0 = (float4)(0.f);
float4 dot1 = (float4)(0.f);
float4 dot2 = (float4)(0.f);
float4 dot3 = (float4)(0.f);
const __global Dtype *src0_read = A + row_gid * 4 * A_col_size + offA;
const __global Dtype *src1_read = v + offv;
result = (__global Dtype4*)((__global Dtype*)result + offr);
Dtype4 dot0 = (Dtype4)(0.f);
Dtype4 dot1 = (Dtype4)(0.f);
Dtype4 dot2 = (Dtype4)(0.f);
Dtype4 dot3 = (Dtype4)(0.f);

unsigned int i = lid;
while( i < A_col_size / 4) {
const float4 a0 = vload4(i, src0_read);
const float4 a1 = vload4(i, src0_read + A_col_size);
const float4 a2 = vload4(i, src0_read + 2 * A_col_size);
const float4 a3 = vload4(i, src0_read + 3 * A_col_size);
const Dtype4 a0 = vload4(i, src0_read);
const Dtype4 a1 = vload4(i, src0_read + A_col_size);
const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);

const float4 b0 = vload4(i, src1_read);
const Dtype4 b0 = vload4(i, src1_read);

dot0 += a0 * b0;
dot1 += a1 * b0;
@ -92,15 +96,15 @@ __kernel void TEMPLATE(matvec_mul4,Dtype)(
{
if(trail_item != 0)
{
const __global float *src0_trail = src0_read + i * 4;
const __global float *src1_trail = src1_read + i * 4;
const __global Dtype *src0_trail = src0_read + i * 4;
const __global Dtype *src1_trail = src1_read + i * 4;
for(unsigned int i = 0; i < trail_item; ++i) {
const float at0 = src0_trail[i];
const float at1 = src0_trail[i + A_col_size];
const float at2 = src0_trail[i + 2 * A_col_size];
const float at3 = src0_trail[i + 3 * A_col_size];
const Dtype at0 = src0_trail[i];
const Dtype at1 = src0_trail[i + A_col_size];
const Dtype at2 = src0_trail[i + 2 * A_col_size];
const Dtype at3 = src0_trail[i + 3 * A_col_size];

const float bt = src1_trail[i];
const Dtype bt = src1_trail[i];

work[lid].s0 += at0 * bt;
work[lid].s1 += at1 * bt;
@ -118,40 +122,40 @@ __kernel void TEMPLATE(matvec_mul4,Dtype)(
}
if(lid == 0) {
if(beta == (Dtype)0)
result[row_gid] = alpha * work[0];
result[row_gid] = convert_Dtype(alpha) * work[0];
else
result[row_gid] = alpha * work[0] + beta * result[row_gid];
result[row_gid] = convert_Dtype(alpha) * work[0] + convert_Dtype(beta) * result[row_gid];
}
}

/* This kernel used for the trailing rows when row_of_A %4 !=0 */
__kernel void TEMPLATE(matvec_mul1,Dtype)(
__global const float * A,
__global const Dtype * A,
int offA,
unsigned int A_col_size,
unsigned int row_offset,
unsigned int trail_item,
__global const float * v,
__global const Dtype * v,
int offv,
float alpha,
float beta,
__global float * result,
KERNEL_ARG_DTYPE alpha,
KERNEL_ARG_DTYPE beta,
__global Dtype * result,
int offr,
__local float * work)
__local Dtype * work)
{
unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0);

const __global float *src0_read = A + (row_offset + row_gid) * A_col_size + offA;
const __global float *src1_read = v + + offv;
const __global Dtype *src0_read = A + (row_offset + row_gid) * A_col_size + offA;
const __global Dtype *src1_read = v + + offv;
result = result + offr;
float4 dot0 = (float4)(0.f);
Dtype4 dot0 = (Dtype4)(0.f);

unsigned int i = lid;
while( i < A_col_size / 4)
{
const float4 a0 = vload4(i, src0_read);
const float4 b0 = vload4(i, src1_read);
const Dtype4 a0 = vload4(i, src0_read);
const Dtype4 b0 = vload4(i, src1_read);

dot0 += a0 * b0;
i += get_local_size(0);
@ -163,11 +167,11 @@ __kernel void TEMPLATE(matvec_mul1,Dtype)(
{
if(trail_item != 0)
{
const __global float *src0_trail = src0_read + i * 4;
const __global float *src1_trail = src1_read + i * 4;
const __global Dtype *src0_trail = src0_read + i * 4;
const __global Dtype *src1_trail = src1_read + i * 4;
for(unsigned int i = 0; i < trail_item; ++i) {
const float at0 = src0_trail[i];
const float bt = src1_trail[i];
const Dtype at0 = src0_trail[i];
const Dtype bt = src1_trail[i];

work[lid] += at0 * bt;
}
@ -182,10 +186,10 @@ __kernel void TEMPLATE(matvec_mul1,Dtype)(

if(lid == 0) {
if(beta == (Dtype)0) {
result[row_gid+row_offset] = alpha * work[0];
result[row_gid+row_offset] = convert_Dtype(alpha) * work[0];
} else {
result[row_gid+row_offset] *= beta;
result[row_gid+row_offset] += alpha * work[0];
result[row_gid+row_offset] *= convert_Dtype(beta);
result[row_gid+row_offset] += convert_Dtype(alpha) * work[0];
}
}
}
|
||||
|
@@ -40,7 +40,11 @@
//
//M*/

#define Dtype float
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#define Dtype float
#define Dtype4 float4
#define Dtype8 float8

@@ -135,17 +139,17 @@ __kernel void MVN(__global const Dtype* src,
store(dst_vec, dst, index);
}

__kernel void MEAN_FUSE(__global const Dtype * A,
__kernel void MEAN_FUSE(__global const T * A,
unsigned int A_col_size,
float alpha,
__global Dtype4 * result,
__global Dtype * B,
__global T4 * mean,
__global Dtype * tmp,
__local Dtype4 * work)
{
unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0);
const __global Dtype *src0_read = A + row_gid * 4 * A_col_size;
__global Dtype *dst0_read = B + row_gid * 4 * A_col_size;
const __global T *src0_read = A + row_gid * 4 * A_col_size;
__global Dtype *dst0_read = tmp + row_gid * 4 * A_col_size;
Dtype4 dot0, dot1, dot2, dot3;
dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);

@@ -153,15 +157,15 @@ __kernel void MEAN_FUSE(__global const Dtype * A,
const Dtype4 b0 = (Dtype4)1.f;
while( i < A_col_size / 4)
{
const Dtype4 a0 = vload4(i, src0_read);
const Dtype4 a1 = vload4(i, src0_read + A_col_size);
const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);
const T4 a0 = vload4(i, src0_read);
const T4 a1 = vload4(i, src0_read + A_col_size);
const T4 a2 = vload4(i, src0_read + 2 * A_col_size);
const T4 a3 = vload4(i, src0_read + 3 * A_col_size);

dot0 += a0;
dot1 += a1;
dot2 += a2;
dot3 += a3;
dot0 += convert_float4(a0);
dot1 += convert_float4(a1);
dot2 += convert_float4(a2);
dot3 += convert_float4(a3);

i += get_local_size(0);
}
@@ -181,22 +185,22 @@ __kernel void MEAN_FUSE(__global const Dtype * A,

if(lid == 0)
{
result[row_gid] = alpha * work[0];
mean[row_gid] = convert_T(alpha * work[0]);
}

Dtype4 sum = work[0] * alpha;
i = lid;
while( i < A_col_size / 4)
{
const Dtype4 a0 = vload4(i, src0_read);
const Dtype4 a1 = vload4(i, src0_read + A_col_size);
const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);
const T4 a0 = vload4(i, src0_read);
const T4 a1 = vload4(i, src0_read + A_col_size);
const T4 a2 = vload4(i, src0_read + 2 * A_col_size);
const T4 a3 = vload4(i, src0_read + 3 * A_col_size);

dot0 = native_powr(a0 - (Dtype4)sum.x, 2);
dot1 = native_powr(a1 - (Dtype4)sum.y, 2);
dot2 = native_powr(a2 - (Dtype4)sum.z, 2);
dot3 = native_powr(a3 - (Dtype4)sum.w, 2);
dot0 = native_powr(convert_float4(a0) - (Dtype4)sum.x, 2);
dot1 = native_powr(convert_float4(a1) - (Dtype4)sum.y, 2);
dot2 = native_powr(convert_float4(a2) - (Dtype4)sum.z, 2);
dot3 = native_powr(convert_float4(a3) - (Dtype4)sum.w, 2);

vstore4(dot0, i, dst0_read);
vstore4(dot1, i, dst0_read + A_col_size);
@@ -208,22 +212,22 @@ __kernel void MEAN_FUSE(__global const Dtype * A,
}

__kernel void MVN_FUSE(__global const Dtype * tmp,
__global const Dtype * A,
__global const Dtype4 * mean,
__global const T * A,
__global const T4 * mean,
unsigned int A_col_size,
const float alpha_val,
const float eps,
const float relu_slope,
__global const Dtype4 * bnorm_weight,
__global const Dtype4 * bnorm_bias,
__global Dtype * B,
__global T * B,
__local Dtype4 * work)
{
unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0);
const __global Dtype *src0_read = tmp + row_gid * 4 * A_col_size;
const __global Dtype *src1_read = A + row_gid * 4 * A_col_size;
__global Dtype *dst0_read = B + row_gid * 4 * A_col_size;
const __global T *src1_read = A + row_gid * 4 * A_col_size;
__global T *dst0_read = B + row_gid * 4 * A_col_size;
Dtype4 dot0, dot1, dot2, dot3;
dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);

@@ -257,7 +261,7 @@ __kernel void MVN_FUSE(__global const Dtype * tmp,
}
barrier(CLK_LOCAL_MEM_FENCE);

Dtype4 mean_val = mean[row_gid];
Dtype4 mean_val = convert_float4(mean[row_gid]);
Dtype4 dev_val = sqrt(work[0] * alpha_val) + (Dtype4)eps;
Dtype4 alpha = (Dtype4)1.f / dev_val;

@@ -271,15 +275,15 @@ __kernel void MVN_FUSE(__global const Dtype * tmp,
i = lid;
while( i < A_col_size / 4)
{
const Dtype4 a0 = vload4(i, src1_read);
const Dtype4 a1 = vload4(i, src1_read + A_col_size);
const Dtype4 a2 = vload4(i, src1_read + 2 * A_col_size);
const Dtype4 a3 = vload4(i, src1_read + 3 * A_col_size);
const T4 a0 = vload4(i, src1_read);
const T4 a1 = vload4(i, src1_read + A_col_size);
const T4 a2 = vload4(i, src1_read + 2 * A_col_size);
const T4 a3 = vload4(i, src1_read + 3 * A_col_size);

dot0 = (a0 - (Dtype4)mean_val.x) * alpha.x;
dot1 = (a1 - (Dtype4)mean_val.y) * alpha.y;
dot2 = (a2 - (Dtype4)mean_val.z) * alpha.z;
dot3 = (a3 - (Dtype4)mean_val.w) * alpha.w;
dot0 = (convert_float4(a0) - (Dtype4)mean_val.x) * alpha.x;
dot1 = (convert_float4(a1) - (Dtype4)mean_val.y) * alpha.y;
dot2 = (convert_float4(a2) - (Dtype4)mean_val.z) * alpha.z;
dot3 = (convert_float4(a3) - (Dtype4)mean_val.w) * alpha.w;

dot0 = dot0 * w.x + (Dtype4)b.x;
dot1 = dot1 * w.y + (Dtype4)b.y;
@@ -300,10 +304,10 @@ __kernel void MVN_FUSE(__global const Dtype * tmp,
dot3 = select(new3, dot3, dot3 > (Dtype4)0.f);
#endif

vstore4(dot0, i, dst0_read);
vstore4(dot1, i, dst0_read + A_col_size);
vstore4(dot2, i, dst0_read + 2 * A_col_size);
vstore4(dot3, i, dst0_read + 3 * A_col_size);
vstore4(convert_T(dot0), i, dst0_read);
vstore4(convert_T(dot1), i, dst0_read + A_col_size);
vstore4(convert_T(dot2), i, dst0_read + 2 * A_col_size);
vstore4(convert_T(dot3), i, dst0_read + 3 * A_col_size);

i += get_local_size(0);
}
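Note the pattern in MEAN_FUSE/MVN_FUSE above: inputs may be stored in fp16 (type T in the kernel), but the mean/variance sums are widened with convert_float4 so accumulation stays in fp32. A CPU sketch of the same idea (the conversion callback stands in for the device-side convert_float4 and is an assumption of this sketch):

#include <cstdint>
#include <vector>

// Accumulate half-precision data in a float accumulator so long sums
// do not lose low-order bits; only storage is fp16.
float meanInFloat(const std::vector<uint16_t>& halfData,
                  float (*half_to_float)(uint16_t))
{
    float sum = 0.f;                 // fp32 accumulator
    for (uint16_t h : halfData)
        sum += half_to_float(h);     // widen each element before adding
    return sum / (float)halfData.size();
}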
@@ -42,14 +42,18 @@

#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float
#define KERNEL_ARG_DTYPE float

#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

__kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global const Dtype* in,
const int num, const int channels,
const int height, const int width, const int size,
const Dtype alpha_over_size, const Dtype k,
const KERNEL_ARG_DTYPE alpha_over_size, const KERNEL_ARG_DTYPE k,
__global Dtype* const out,
const Dtype negative_beta) {
const KERNEL_ARG_DTYPE negative_beta) {
for (int index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
// find out the local offset
@@ -60,11 +64,11 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con
const int step = height * width;
__global const Dtype* in_off = in + offset;
__global Dtype* out_off = out + offset;
Dtype scale_val;
KERNEL_ARG_DTYPE scale_val;
int head = 0;
const int pre_pad = (size - 1) / 2;
const int post_pad = size - pre_pad - 1;
Dtype accum_scale = 0;
KERNEL_ARG_DTYPE accum_scale = 0;
// fill the scale at [n, :, h, w]
// accumulate values
while (head < post_pad && head < channels) {
@@ -79,7 +83,7 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con
* in_off[(head - size) * step];
}
scale_val = k + accum_scale * alpha_over_size;
out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta);
out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((Dtype)scale_val, (Dtype)negative_beta);
++head;
}
// subtract only
@@ -89,7 +93,7 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con
* in_off[(head - size) * step];
}
scale_val = k + accum_scale * alpha_over_size;
out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta);
out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((Dtype)scale_val, (Dtype)negative_beta);
++head;
}
}
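For reference, the across-channel LRN this kernel implements computes out = in * (k + alpha/size * sum(in^2))^(-beta) over a sliding window of channels. A scalar C++ sketch of the same formula (shapes and names are illustrative only):

#include <algorithm>
#include <cmath>
#include <vector>

// Scalar reference for across-channel LRN at one spatial position.
// data[c]: input values across channels; size: channel window width.
std::vector<float> lrnRef(const std::vector<float>& data, int size,
                          float alpha, float beta, float k)
{
    const int C = (int)data.size();
    std::vector<float> out(C);
    for (int c = 0; c < C; ++c)
    {
        float accum = 0.f;
        for (int j = std::max(0, c - (size - 1) / 2);
             j <= std::min(C - 1, c + size / 2); ++j)
            accum += data[j] * data[j];                 // sum of squares in window
        const float scale = k + accum * alpha / size;
        out[c] = data[c] * std::pow(scale, -beta);      // matches native_powr(scale, negative_beta)
    }
    return out;
}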
@@ -42,7 +42,10 @@

#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float

#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#if defined KERNEL_MAX_POOL
|
@ -40,7 +40,9 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#define Dtype float
|
||||
#if defined(cl_khr_fp16)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
__kernel void permute(const int nthreads,
|
||||
__global Dtype* bottom_data,
|
||||
|
@@ -39,17 +39,18 @@
//
//M*/

#define Dtype float
#define Dtype4 float4
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

__kernel void prior_box(const int nthreads,
const Dtype stepX,
const Dtype stepY,
__global const Dtype* _offsetsX,
__global const Dtype* _offsetsY,
const float stepX,
const float stepY,
__global const float* _offsetsX,
__global const float* _offsetsY,
const int offsetsX_size,
__global const Dtype* _widths,
__global const Dtype* _heights,
__global const float* _widths,
__global const float* _heights,
const int widths_size,
__global Dtype* dst,
const int _layerHeight,
@@ -65,7 +66,7 @@ __kernel void prior_box(const int nthreads,

outputPtr = dst + index * 4 * offsetsX_size * widths_size;

Dtype _boxWidth, _boxHeight;
float _boxWidth, _boxHeight;
Dtype4 vec;
for (int i = 0; i < widths_size; ++i)
{
@@ -73,8 +74,8 @@ __kernel void prior_box(const int nthreads,
_boxHeight = _heights[i];
for (int j = 0; j < offsetsX_size; ++j)
{
float center_x = (w + _offsetsX[j]) * stepX;
float center_y = (h + _offsetsY[j]) * stepY;
Dtype center_x = (w + _offsetsX[j]) * (Dtype)stepX;
Dtype center_y = (h + _offsetsY[j]) * (Dtype)stepY;

vec.x = (center_x - _boxWidth * 0.5f) / imgWidth; // xmin
vec.y = (center_y - _boxHeight * 0.5f) / imgHeight; // ymin
@@ -91,7 +92,7 @@ __kernel void prior_box(const int nthreads,
__kernel void set_variance(const int nthreads,
const int offset,
const int variance_size,
__global const Dtype* variance,
__global const float* variance,
__global Dtype* dst)
{
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
@@ -101,7 +102,7 @@ __kernel void set_variance(const int nthreads,
if (variance_size == 1)
var_vec = (Dtype4)(variance[0]);
else
var_vec = vload4(0, variance);
var_vec = convert_T(vload4(0, variance));

vstore4(var_vec, 0, dst + offset + index * 4);
}
@@ -39,6 +39,10 @@
//
//M*/

#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

__kernel void reorg(const int count,
__global const Dtype* src,
const int channels,
@@ -40,9 +40,9 @@
//
//M*/

#define Dtype float
#define Dtype4 float4
#define Dtype8 float8
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

__kernel void slice(__global const Dtype* src,
const int src_plane_size,
@@ -24,6 +24,10 @@
* POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/

#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

__kernel void kernel_channel_max(const int num, const int channels,
const int spatial_dim, __global const T* data, __global T* out) {
int index = get_global_id(0);
@@ -40,12 +44,12 @@ __kernel void kernel_channel_max(const int num, const int channels,

__kernel void kernel_channel_subtract(const int count,
const int num, const int channels,
const int spatial_dim, __global const T* channel_max, __global T* data) {
const int spatial_dim, __global const T* channel_max, __global const T* src, __global T* data) {
int index = get_global_id(0);
if(index < count) {
int n = index / channels / spatial_dim;
int s = index % spatial_dim;
data[index] -= channel_max[n * spatial_dim + s];
data[index] = exp(src[index] - channel_max[n * spatial_dim + s]);
}
}
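The reworked kernel_channel_subtract fuses the subtract-max and exponentiation steps of softmax into one pass. A minimal CPU sketch of the same numerically stable computation (illustrative only, not the kernel's host code):

#include <algorithm>
#include <cmath>
#include <vector>

// Stable softmax over one channel vector: subtract the max before
// exponentiating so exp() never overflows, then normalize.
std::vector<float> softmaxRef(const std::vector<float>& x)
{
    const float m = *std::max_element(x.begin(), x.end());
    std::vector<float> y(x.size());
    float sum = 0.f;
    for (size_t i = 0; i < x.size(); ++i)
    {
        y[i] = std::exp(x[i] - m);   // fused subtract + exp, as in the kernel
        sum += y[i];
    }
    for (float& v : y)
        v /= sum;
    return y;
}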
@@ -42,12 +42,15 @@

#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float

#if defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#endif

#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

__kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int channels,
const int spatial_dim,
__global Dtype* scale,
@@ -60,12 +63,12 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
int n = get_global_id(1);
for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
get_global_size(0), ++s) {
float maxval = -FLT_MAX;
Dtype maxval = -DTYPE_MAX;
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
Dtype tmp = data[(n * channels + c) * spatial_dim + s];
maxval = max((Dtype)tmp, (Dtype)maxval);
}
maxval = sub_group_reduce_max(maxval * 100000);
maxval = sub_group_reduce_max(maxval);
//if (get_sub_group_local_id() == 0)
group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
}
@@ -77,7 +80,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
int s = index / get_max_sub_group_size();
Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0)
scale_tmp[s] = maxval / 100000;
scale_tmp[s] = maxval;
}

barrier(CLK_LOCAL_MEM_FENCE);
@@ -95,7 +98,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
sum += out_tmp[c * spatial_dim + s];
}
sum = sub_group_reduce_add(sum * 100000);
sum = sub_group_reduce_add(sum);
group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -105,7 +108,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
int s = index / get_max_sub_group_size();
Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0)
scale_tmp[s] = sum / 100000;
scale_tmp[s] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);

@@ -130,12 +133,12 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
__global Dtype *group_tmp = scale + spatial_dim * num + n * get_max_sub_group_size() * spatial_dim;
for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
get_global_size(0), ++s) {
float maxval = -FLT_MAX;
Dtype maxval = -DTYPE_MAX;
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
Dtype tmp = data[(n * channels + c) * spatial_dim + s];
maxval = max((Dtype)tmp, (Dtype)maxval);
}
maxval = sub_group_reduce_max(maxval * 100000);
maxval = sub_group_reduce_max(maxval);
//if (get_sub_group_local_id() == 0)
group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
}
@@ -146,7 +149,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
int s = index / get_max_sub_group_size();
Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0)
scale[n * spatial_dim + s] = maxval / 100000;
scale[n * spatial_dim + s] = maxval;
}

barrier(CLK_GLOBAL_MEM_FENCE);
@@ -164,7 +167,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
sum += out[n * channels * spatial_dim + c * spatial_dim + s];
}
sum = sub_group_reduce_add(sum * 100000);
sum = sub_group_reduce_add(sum);
group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
}
barrier(CLK_GLOBAL_MEM_FENCE);
@@ -174,7 +177,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
int s = index / get_max_sub_group_size();
Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0)
scale[n * spatial_dim + s] = sum / 100000;
scale[n * spatial_dim + s] = sum;
}
barrier(CLK_GLOBAL_MEM_FENCE);
@@ -64,6 +64,7 @@

namespace cv { namespace dnn {
CV__DNN_EXPERIMENTAL_NS_BEGIN
#define IS_DNN_OPENCL_TARGET(id) (id == DNN_TARGET_OPENCL || id == DNN_TARGET_OPENCL_FP16)
Mutex& getInitializationMutex();
void initializeLayerFactory();
CV__DNN_EXPERIMENTAL_NS_END
@@ -538,6 +538,37 @@ public:
}
};

// In case of resizing by factor.
class ResizeBilinearSubgraph : public Subgraph
{
public:
ResizeBilinearSubgraph()
{
int input = addNodeToMatch("");

int shape = addNodeToMatch("Shape", input);
int stack = addNodeToMatch("Const");
int stack_1 = addNodeToMatch("Const");
int stack_2 = addNodeToMatch("Const");
int strided_slice = addNodeToMatch("StridedSlice", shape, stack, stack_1, stack_2);
int factorY = addNodeToMatch("Const");
int mul = addNodeToMatch("Mul", strided_slice, factorY);

shape = addNodeToMatch("Shape", input);
stack = addNodeToMatch("Const");
stack_1 = addNodeToMatch("Const");
stack_2 = addNodeToMatch("Const");
strided_slice = addNodeToMatch("StridedSlice", shape, stack, stack_1, stack_2);
int factorX = addNodeToMatch("Const");
int mul_1 = addNodeToMatch("Mul", strided_slice, factorX);

int pack = addNodeToMatch("Pack", mul, mul_1);

addNodeToMatch("ResizeBilinear", input, pack);
setFusedNode("ResizeBilinear", input, factorY, factorX);
}
};

void simplifySubgraphs(tensorflow::GraphDef& net)
{
std::vector<Ptr<Subgraph> > subgraphs;
@@ -551,6 +582,7 @@ void simplifySubgraphs(tensorflow::GraphDef& net)
subgraphs.push_back(Ptr<Subgraph>(new L2NormalizeSubgraph()));
subgraphs.push_back(Ptr<Subgraph>(new DeconvolutionValidKerasSubgraph()));
subgraphs.push_back(Ptr<Subgraph>(new DeconvolutionSameKerasSubgraph()));
subgraphs.push_back(Ptr<Subgraph>(new ResizeBilinearSubgraph()));

int numNodes = net.node_size();
std::vector<int> matchedNodesIds;
@@ -767,6 +767,26 @@ void TFImporter::populateNet(Net dstNet)
}
}
}
else if (type == "Sub")
{
bool haveConst = false;
for(int ii = 0; !haveConst && ii < layer.input_size(); ++ii)
{
Pin input = parsePin(layer.input(ii));
haveConst = value_id.find(input.name) != value_id.end();
}
CV_Assert(haveConst);

layerParams.blobs.resize(1);
blobFromTensor(getConstBlob(layer, value_id), layerParams.blobs[0]);
layerParams.blobs[0] *= -1;

int id = dstNet.addLayer(name, "Shift", layerParams);
layer_id[name] = id;

// one input only
connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
}
else if (type == "MatMul")
{
CV_Assert(layer.input_size() == 2);
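The importer maps TensorFlow's Sub(x, c) with a constant c onto the existing Shift layer by negating the constant, since x - c equals x + (-c) exactly in IEEE arithmetic. A tiny sketch of that identity with cv::Mat (data values are illustrative only):

#include <opencv2/core.hpp>

int main()
{
    cv::Mat x = (cv::Mat_<float>(1, 3) << 1.f, 2.f, 3.f);
    cv::Mat c = (cv::Mat_<float>(1, 3) << 0.5f, 0.5f, 0.5f);

    cv::Mat sub = x - c;        // what the TF graph expresses
    cv::Mat shift = x + (-c);   // what the imported Shift layer computes

    CV_Assert(cv::countNonZero(sub != shift) == 0);  // bitwise-identical results
    return 0;
}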
@@ -147,7 +147,9 @@ TEST_P(DNNTestNetwork, Inception_5h)

TEST_P(DNNTestNetwork, ENet)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE) throw SkipTestException("");
if ((backend == DNN_BACKEND_INFERENCE_ENGINE) ||
    (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16))
throw SkipTestException("");
processNet("dnn/Enet-model-best.net", "", Size(512, 512), "l367_Deconvolution",
target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_enet.yml" :
"dnn/halide_scheduler_enet.yml",
@@ -161,9 +163,11 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe)
throw SkipTestException("");
Mat sample = imread(findDataFile("dnn/street.png", false));
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
float l1 = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.0007 : 0.0;
float lInf = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.011 : 0.0;

processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt",
inp, "detection_out");
inp, "detection_out", "", l1, lInf);
}

TEST_P(DNNTestNetwork, MobileNet_SSD_TensorFlow)
@@ -173,15 +177,17 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_TensorFlow)
throw SkipTestException("");
Mat sample = imread(findDataFile("dnn/street.png", false));
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
float l1 = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.008 : 0.0;
float lInf = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.06 : 0.0;
processNet("dnn/ssd_mobilenet_v1_coco.pb", "dnn/ssd_mobilenet_v1_coco.pbtxt",
inp, "detection_out");
inp, "detection_out", "", l1, lInf);
}

TEST_P(DNNTestNetwork, SSD_VGG16)
{
if (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL ||
backend == DNN_BACKEND_HALIDE && target == DNN_TARGET_CPU ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU)
if ((backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ||
    (backend == DNN_BACKEND_HALIDE && target == DNN_TARGET_CPU) ||
    (backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU))
throw SkipTestException("");
processNet("dnn/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel",
"dnn/ssd_vgg16.prototxt", Size(300, 300), "detection_out");
@@ -236,14 +242,17 @@ TEST_P(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
throw SkipTestException("");
Mat sample = imread(findDataFile("dnn/street.png", false));
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
float l1 = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.008 : 0.0;
float lInf = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.07 : 0.0;
processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "dnn/ssd_inception_v2_coco_2017_11_17.pbtxt",
inp, "detection_out");
inp, "detection_out", "", l1, lInf);
}

TEST_P(DNNTestNetwork, DenseNet_121)
{
if (backend == DNN_BACKEND_HALIDE ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
if ((backend == DNN_BACKEND_HALIDE) ||
    (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ||
    (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16))
throw SkipTestException("");
processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", Size(224, 224), "", "caffe");
}
@@ -258,7 +267,8 @@ const tuple<DNNBackend, DNNTarget> testCases[] = {
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
#endif
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL)
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL_FP16)
};

INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, testing::ValuesIn(testCases));
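The FP16 branches above relax the comparison thresholds, since half precision cannot reproduce fp32 references exactly. A sketch of the kind of metric these tests bound (the real normAssert helper lives in the dnn test utilities; this only illustrates the idea):

#include <opencv2/core.hpp>

// Mean absolute difference and worst-case difference between a result
// and its fp32 reference; fp16 targets get looser bounds for both.
void checkClose(const cv::Mat& ref, const cv::Mat& out,
                double l1Bound, double lInfBound)
{
    double l1 = cv::norm(ref, out, cv::NORM_L1) / ref.total();  // average error
    double lInf = cv::norm(ref, out, cv::NORM_INF);             // worst element
    CV_Assert(l1 <= l1Bound && lInf <= lInfBound);
}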
@@ -104,7 +104,11 @@ TEST_P(Reproducibility_AlexNet, Accuracy)
ASSERT_FALSE(net.empty());
}

net.setPreferableTarget(get<1>(GetParam()));
int targetId = get<1>(GetParam());
const float l1 = 1e-5;
const float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 3e-3 : 1e-4;

net.setPreferableTarget(targetId);

Mat sample = imread(_tf("grace_hopper_227.png"));
ASSERT_TRUE(!sample.empty());
@@ -112,10 +116,11 @@ TEST_P(Reproducibility_AlexNet, Accuracy)
net.setInput(blobFromImage(sample, 1.0f, Size(227, 227), Scalar(), false), "data");
Mat out = net.forward("prob");
Mat ref = blobFromNPY(_tf("caffe_alexnet_prob.npy"));
normAssert(ref, out);
normAssert(ref, out, "", l1, lInf);
}

INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_AlexNet, Combine(testing::Bool(), availableDnnTargets()));
INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_AlexNet, Combine(testing::Bool(),
Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16)));

#if !defined(_WIN32) || defined(_WIN64)
TEST(Reproducibility_FCN, Accuracy)
@@ -176,8 +181,11 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
const string proto = findDataFile("dnn/MobileNetSSD_deploy.prototxt", false);
const string model = findDataFile("dnn/MobileNetSSD_deploy.caffemodel", false);
Net net = readNetFromCaffe(proto, model);
int targetId = GetParam();
const float l1 = (targetId == DNN_TARGET_OPENCL_FP16) ? 1.5e-4 : 1e-5;
const float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 4e-4 : 1e-4;

net.setPreferableTarget(GetParam());
net.setPreferableTarget(targetId);

Mat sample = imread(_tf("street.png"));

@@ -185,8 +193,10 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
net.setInput(inp);
Mat out = net.forward();

const float scores_diff = (targetId == DNN_TARGET_OPENCL_FP16) ? 4e-4 : 1e-5;
const float boxes_iou_diff = (targetId == DNN_TARGET_OPENCL_FP16) ? 5e-3 : 1e-4;
Mat ref = blobFromNPY(_tf("mobilenet_ssd_caffe_out.npy"));
normAssertDetections(ref, out);
normAssertDetections(ref, out, "", 0.0, scores_diff, boxes_iou_diff);

// Check that detections aren't preserved.
inp.setTo(0.0f);
@@ -212,10 +222,12 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
// a single sample in batch. The first numbers of detection vectors are batch id.
outBatch = outBatch.reshape(1, outBatch.total() / 7);
EXPECT_EQ(outBatch.rows, 2 * numDetections);
normAssert(outBatch.rowRange(0, numDetections), ref);
normAssert(outBatch.rowRange(numDetections, 2 * numDetections).colRange(1, 7), ref.colRange(1, 7));
normAssert(outBatch.rowRange(0, numDetections), ref, "", l1, lInf);
normAssert(outBatch.rowRange(numDetections, 2 * numDetections).colRange(1, 7), ref.colRange(1, 7),
"", l1, lInf);
}
INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_MobileNet_SSD, availableDnnTargets());
INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_MobileNet_SSD,
Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16));

typedef testing::TestWithParam<DNNTarget> Reproducibility_ResNet50;
TEST_P(Reproducibility_ResNet50, Accuracy)
@@ -226,6 +238,9 @@ TEST_P(Reproducibility_ResNet50, Accuracy)
int targetId = GetParam();
net.setPreferableTarget(targetId);

float l1 = (targetId == DNN_TARGET_OPENCL_FP16) ? 3e-5 : 1e-5;
float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 6e-3 : 1e-4;

Mat input = blobFromImage(imread(_tf("googlenet_0.png")), 1.0f, Size(224,224), Scalar(), false);
ASSERT_TRUE(!input.empty());

@@ -233,20 +248,21 @@ TEST_P(Reproducibility_ResNet50, Accuracy)
Mat out = net.forward();

Mat ref = blobFromNPY(_tf("resnet50_prob.npy"));
normAssert(ref, out);
normAssert(ref, out, "", l1, lInf);

if (targetId == DNN_TARGET_OPENCL)
if (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16)
{
UMat out_umat;
net.forward(out_umat);
normAssert(ref, out_umat, "out_umat");
normAssert(ref, out_umat, "out_umat", l1, lInf);

std::vector<UMat> out_umats;
net.forward(out_umats);
normAssert(ref, out_umats[0], "out_umat_vector");
normAssert(ref, out_umats[0], "out_umat_vector", l1, lInf);
}
}
INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_ResNet50, availableDnnTargets());
INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_ResNet50,
Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16));

typedef testing::TestWithParam<DNNTarget> Reproducibility_SqueezeNet_v1_1;
TEST_P(Reproducibility_SqueezeNet_v1_1, Accuracy)
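For users, enabling the new FP16 path is a one-line change on a loaded network. A minimal sketch (model paths are placeholders):

#include <opencv2/dnn.hpp>

int main()
{
    // Load any supported model (paths are illustrative).
    cv::dnn::Net net = cv::dnn::readNetFromCaffe("deploy.prototxt", "model.caffemodel");

    // Run OpenCL kernels in half precision; weights are converted once at setup.
    net.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL_FP16);

    int sz[] = {1, 3, 224, 224};
    cv::Mat blob(4, sz, CV_32F, cv::Scalar(0));  // dummy NCHW input
    net.setInput(blob);
    cv::Mat out = net.forward();
    return 0;
}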
@@ -295,26 +295,32 @@ TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8)

INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_nets, availableDnnTargets());

typedef testing::TestWithParam<DNNTarget> Test_TensorFlow_fp16;

TEST_P(Test_TensorFlow_fp16, tests)
{
int targetId = GetParam();
const float l1 = 7e-4;
const float lInf = 1e-2;
runTensorFlowNet("fp16_single_conv", targetId, false, l1, lInf);
runTensorFlowNet("fp16_deconvolution", targetId, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_odd_same", targetId, false, l1, lInf);
runTensorFlowNet("fp16_padding_valid", targetId, false, l1, lInf);
runTensorFlowNet("fp16_eltwise_add_mul", targetId, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_odd_valid", targetId, false, l1, lInf);
runTensorFlowNet("fp16_pad_and_concat", targetId, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_even", targetId, false, l1, lInf);
runTensorFlowNet("fp16_padding_same", targetId, false, l1, lInf);
}

INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_fp16,
Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16));

TEST(Test_TensorFlow, defun)
{
runTensorFlowNet("defun_dropout");
}

TEST(Test_TensorFlow, fp16)
{
const float l1 = 1e-3;
const float lInf = 1e-2;
runTensorFlowNet("fp16_single_conv", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_deconvolution", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_odd_same", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_padding_valid", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_eltwise_add_mul", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_odd_valid", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_pad_and_concat", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_even", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_padding_same", DNN_TARGET_CPU, false, l1, lInf);
}

TEST(Test_TensorFlow, quantized)
{
runTensorFlowNet("uint8_single_conv");
@@ -373,9 +379,24 @@ public:
ResizeBilinearLayer(const LayerParams &params) : Layer(params)
{
CV_Assert(!params.get<bool>("align_corners", false));
CV_Assert(blobs.size() == 1, blobs[0].type() == CV_32SC1);
outHeight = blobs[0].at<int>(0, 0);
outWidth = blobs[0].at<int>(0, 1);
CV_Assert(!blobs.empty());

for (size_t i = 0; i < blobs.size(); ++i)
    CV_Assert(blobs[i].type() == CV_32SC1);

if (blobs.size() == 1)
{
CV_Assert(blobs[0].total() == 2);
outHeight = blobs[0].at<int>(0, 0);
outWidth = blobs[0].at<int>(0, 1);
}
else
{
CV_Assert(blobs.size() == 2, blobs[0].total() == 1, blobs[1].total() == 1);
factorHeight = blobs[0].at<int>(0, 0);
factorWidth = blobs[1].at<int>(0, 0);
outHeight = outWidth = 0;
}
}

static Ptr<Layer> create(LayerParams& params)
@@ -391,12 +412,21 @@ public:
std::vector<int> outShape(4);
outShape[0] = inputs[0][0]; // batch size
outShape[1] = inputs[0][1]; // number of channels
outShape[2] = outHeight;
outShape[3] = outWidth;
outShape[2] = outHeight != 0 ? outHeight : (inputs[0][2] * factorHeight);
outShape[3] = outWidth != 0 ? outWidth : (inputs[0][3] * factorWidth);
outputs.assign(1, outShape);
return false;
}

virtual void finalize(const std::vector<Mat*>& inputs, std::vector<Mat> &outputs) CV_OVERRIDE
{
if (!outWidth && !outHeight)
{
outHeight = outputs[0].size[2];
outWidth = outputs[0].size[3];
}
}

// This implementation is based on a reference implementation from
// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
virtual void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals) CV_OVERRIDE
@@ -447,13 +477,51 @@ private:
return x + size[3] * (y + size[2] * (c + size[1] * b));
}

int outWidth, outHeight;
int outWidth, outHeight, factorWidth, factorHeight;
};

TEST(Test_TensorFlow, resize_bilinear)
{
CV_DNN_REGISTER_LAYER_CLASS(ResizeBilinear, ResizeBilinearLayer);
runTensorFlowNet("resize_bilinear");
runTensorFlowNet("resize_bilinear_factor");
LayerFactory::unregisterLayer("ResizeBilinear");
}

// inp = cv.imread('opencv_extra/testdata/cv/ximgproc/sources/08.png')
// inp = inp[:,:,[2, 1, 0]].astype(np.float32).reshape(1, 512, 512, 3)
// outs = sess.run([sess.graph.get_tensor_by_name('feature_fusion/Conv_7/Sigmoid:0'),
//                  sess.graph.get_tensor_by_name('feature_fusion/concat_3:0')],
//                 feed_dict={'input_images:0': inp})
// scores = np.ascontiguousarray(outs[0].transpose(0, 3, 1, 2))
// geometry = np.ascontiguousarray(outs[1].transpose(0, 3, 1, 2))
// np.save('east_text_detection.scores.npy', scores)
// np.save('east_text_detection.geometry.npy', geometry)
TEST(Test_TensorFlow, EAST_text_detection)
{
CV_DNN_REGISTER_LAYER_CLASS(ResizeBilinear, ResizeBilinearLayer);
std::string netPath = findDataFile("dnn/frozen_east_text_detection.pb", false);
std::string imgPath = findDataFile("cv/ximgproc/sources/08.png", false);
std::string refScoresPath = findDataFile("dnn/east_text_detection.scores.npy", false);
std::string refGeometryPath = findDataFile("dnn/east_text_detection.geometry.npy", false);

Net net = readNet(findDataFile("dnn/frozen_east_text_detection.pb", false));

Mat img = imread(imgPath);
Mat inp = blobFromImage(img, 1.0, Size(), Scalar(123.68, 116.78, 103.94), true, false);
net.setInput(inp);

std::vector<Mat> outs;
std::vector<String> outNames(2);
outNames[0] = "feature_fusion/Conv_7/Sigmoid";
outNames[1] = "feature_fusion/concat_3";
net.forward(outs, outNames);

Mat scores = outs[0];
Mat geometry = outs[1];

normAssert(scores, blobFromNPY(refScoresPath), "scores");
normAssert(geometry, blobFromNPY(refGeometryPath), "geometry", 5e-5, 1e-3);
LayerFactory::unregisterLayer("ResizeBilinear");
}
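The custom ResizeBilinearLayer above now supports resize-by-factor as well as an explicit output size. The core interpolation it performs, reduced to one channel, could look like this (a sketch under the align_corners=false convention, not the test's actual code):

#include <algorithm>
#include <opencv2/core.hpp>

// Bilinear sampling of a single-channel float image at fractional (fx, fy).
static float sampleBilinear(const cv::Mat& img, float fx, float fy)
{
    int x0 = (int)fx, y0 = (int)fy;
    int x1 = std::min(x0 + 1, img.cols - 1);     // clamp at the border
    int y1 = std::min(y0 + 1, img.rows - 1);
    float ax = fx - x0, ay = fy - y0;            // fractional weights
    return (1 - ay) * ((1 - ax) * img.at<float>(y0, x0) + ax * img.at<float>(y0, x1))
         +      ay  * ((1 - ax) * img.at<float>(y1, x0) + ax * img.at<float>(y1, x1));
}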
@@ -503,7 +503,7 @@ namespace cv{
// +-+-+-+
// |p|q|r|
// +-+-+-+
// |x|
// |x|
// +-+
const int w = imgLabels.cols, h = imgLabels.rows;

@@ -548,7 +548,7 @@ namespace cv{
// +-+-+-+
// |-|q|-|
// +-+-+-+
// |x|
// |x|
// +-+
const int w = imgLabels.cols, h = imgLabels.rows;

@@ -2473,9 +2473,9 @@ namespace cv{
// |P -|Q -|R -|
// |- -|- -|- -|
// +---+---+---+
// |X -|
// |- -|
// +---+
// |X -|
// |- -|
// +---+
const int w = imgLabels.cols, h = imgLabels.rows;

for (int r = chunksSizeAndLabels[0]; r < h; r = chunksSizeAndLabels[r]){
@@ -219,13 +219,15 @@ int rotatedRectangleIntersection( const RotatedRect& rect1, const RotatedRect& r
}
}

// Get rid of dupes
// Get rid of dupes and order points.
for( int i = 0; i < (int)intersection.size()-1; i++ )
{
float dx1 = intersection[i + 1].x - intersection[i].x;
float dy1 = intersection[i + 1].y - intersection[i].y;
for( size_t j = i+1; j < intersection.size(); j++ )
{
float dx = intersection[i].x - intersection[j].x;
float dy = intersection[i].y - intersection[j].y;
float dx = intersection[j].x - intersection[i].x;
float dy = intersection[j].y - intersection[i].y;
double d2 = dx*dx + dy*dy; // can be a really small number, need double here

if( d2 < samePointEps*samePointEps )
@@ -235,6 +237,12 @@ int rotatedRectangleIntersection( const RotatedRect& rect1, const RotatedRect& r
intersection.pop_back();
j--; // restart check
}
else if (dx1 * dy - dy1 * dx < 0)
{
std::swap(intersection[i + 1], intersection[j]);
dx1 = dx;
dy1 = dy;
}
}
}
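The new `dx1 * dy - dy1 * dx < 0` test is the z-component of a 2D cross product: it asks whether candidate point j lies clockwise of the current edge and, if so, swaps it forward so the intersection vertices come out with a consistent winding. A standalone sketch of the predicate:

#include <opencv2/core.hpp>

// z-component of the cross product of (b - a) and (c - a):
// > 0 means c is counter-clockwise of edge a->b, < 0 means clockwise.
static double crossZ(const cv::Point2f& a, const cv::Point2f& b, const cv::Point2f& c)
{
    double dx1 = b.x - a.x, dy1 = b.y - a.y;
    double dx2 = c.x - a.x, dy2 = c.y - a.y;
    return dx1 * dy2 - dy1 * dx2;
}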
@ -66,8 +66,27 @@ private:
|
||||
void test7();
|
||||
void test8();
|
||||
void test9();
|
||||
void test10();
|
||||
void test11();
|
||||
void test12();
|
||||
void test13();
|
||||
void test14();
|
||||
};
|
||||
|
||||
static void compare(const std::vector<Point2f>& test, const std::vector<Point2f>& target)
|
||||
{
|
||||
ASSERT_EQ(test.size(), target.size());
|
||||
ASSERT_TRUE(test.size() < 4 || isContourConvex(test));
|
||||
ASSERT_TRUE(target.size() < 4 || isContourConvex(target));
|
||||
for( size_t i = 0; i < test.size(); i++ )
|
||||
{
|
||||
double dx = test[i].x - target[i].x;
|
||||
double dy = test[i].y - target[i].y;
|
||||
double r = sqrt(dx*dx + dy*dy);
|
||||
ASSERT_LT(r, ACCURACY);
|
||||
}
|
||||
}
|
||||
|
||||
void CV_RotatedRectangleIntersectionTest::run(int)
|
||||
{
|
||||
// See pics/intersection.png for the scenarios we are testing
|
||||
@ -92,28 +111,20 @@ void CV_RotatedRectangleIntersectionTest::run(int)
|
||||
test7();
|
||||
test8();
|
||||
test9();
|
||||
test10();
|
||||
test11();
|
||||
test12();
|
||||
test13();
|
||||
test14();
|
||||
}
|
||||
|
||||
void CV_RotatedRectangleIntersectionTest::test1()
|
||||
{
|
||||
// no intersection
|
||||
|
||||
RotatedRect rect1, rect2;
|
||||
|
||||
rect1.center.x = 0;
|
||||
rect1.center.y = 0;
|
||||
rect1.size.width = 2;
|
||||
rect1.size.height = 2;
|
||||
rect1.angle = 12.0f;
|
||||
|
||||
rect2.center.x = 10;
|
||||
rect2.center.y = 10;
|
||||
rect2.size.width = 2;
|
||||
rect2.size.height = 2;
|
||||
rect2.angle = 34.0f;
|
||||
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 12.0f);
|
||||
RotatedRect rect2(Point2f(10, 10), Size2f(2, 2), 34.0f);
|
||||
|
||||
vector<Point2f> vertices;
|
||||
|
||||
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
|
||||
|
||||
CV_Assert(ret == INTERSECT_NONE);
|
||||
@ -123,375 +134,243 @@ void CV_RotatedRectangleIntersectionTest::test1()
|
||||
void CV_RotatedRectangleIntersectionTest::test2()
|
||||
{
|
||||
// partial intersection, rectangles translated
|
||||
|
||||
RotatedRect rect1, rect2;
|
||||
|
||||
rect1.center.x = 0;
|
||||
rect1.center.y = 0;
|
||||
rect1.size.width = 2;
|
||||
rect1.size.height = 2;
|
||||
rect1.angle = 0;
|
||||
|
||||
rect2.center.x = 1;
|
||||
rect2.center.y = 1;
|
||||
rect2.size.width = 2;
|
||||
rect2.size.height = 2;
|
||||
rect2.angle = 0;
|
||||
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
|
||||
RotatedRect rect2(Point2f(1, 1), Size2f(2, 2), 0.0f);
|
||||
|
||||
vector<Point2f> vertices;
|
||||
|
||||
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
|
||||
|
||||
CV_Assert(ret == INTERSECT_PARTIAL);
|
||||
CV_Assert(vertices.size() == 4);
|
||||
|
||||
vector<Point2f> possibleVertices(4);
|
||||
|
||||
possibleVertices[0] = Point2f(0.0f, 0.0f);
|
||||
possibleVertices[1] = Point2f(1.0f, 1.0f);
|
||||
possibleVertices[2] = Point2f(0.0f, 1.0f);
|
||||
possibleVertices[3] = Point2f(1.0f, 0.0f);
|
||||
|
||||
for( size_t i = 0; i < vertices.size(); i++ )
|
||||
{
|
||||
double bestR = DBL_MAX;
|
||||
|
||||
for( size_t j = 0; j < possibleVertices.size(); j++ )
|
||||
{
|
||||
double dx = vertices[i].x - possibleVertices[j].x;
|
||||
double dy = vertices[i].y - possibleVertices[j].y;
|
||||
double r = sqrt(dx*dx + dy*dy);
|
||||
|
||||
bestR = std::min(bestR, r);
|
||||
}
|
||||
|
||||
CV_Assert(bestR < ACCURACY);
|
||||
}
|
||||
vector<Point2f> targetVertices(4);
|
||||
targetVertices[0] = Point2f(1.0f, 0.0f);
|
||||
targetVertices[1] = Point2f(1.0f, 1.0f);
|
||||
targetVertices[2] = Point2f(0.0f, 1.0f);
|
||||
targetVertices[3] = Point2f(0.0f, 0.0f);
|
||||
compare(vertices, targetVertices);
|
||||
}
|
||||
|
||||
void CV_RotatedRectangleIntersectionTest::test3()
|
||||
{
|
||||
// partial intersection, rectangles rotated 45 degree on the corner, forms a triangle intersection
|
||||
RotatedRect rect1, rect2;
|
||||
|
||||
rect1.center.x = 0;
|
||||
rect1.center.y = 0;
|
||||
rect1.size.width = 2;
|
||||
rect1.size.height = 2;
|
||||
rect1.angle = 0;
|
||||
|
||||
rect2.center.x = 1;
|
||||
rect2.center.y = 1;
|
||||
rect2.size.width = sqrt(2.0f);
|
||||
rect2.size.height = 20;
|
||||
rect2.angle = 45.0f;
|
||||
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
|
||||
RotatedRect rect2(Point2f(1, 1), Size2f(sqrt(2.0f), 20), 45.0f);
|
||||
|
||||
vector<Point2f> vertices;
|
||||
|
||||
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
|
||||
|
||||
CV_Assert(ret == INTERSECT_PARTIAL);
|
||||
CV_Assert(vertices.size() == 3);
|
||||
|
||||
vector<Point2f> possibleVertices(3);
|
||||
|
||||
possibleVertices[0] = Point2f(1.0f, 1.0f);
|
||||
possibleVertices[1] = Point2f(0.0f, 1.0f);
|
||||
possibleVertices[2] = Point2f(1.0f, 0.0f);
|
||||
|
||||
for( size_t i = 0; i < vertices.size(); i++ )
|
||||
{
|
||||
double bestR = DBL_MAX;
|
||||
|
||||
for( size_t j = 0; j < possibleVertices.size(); j++ )
|
||||
{
|
||||
double dx = vertices[i].x - possibleVertices[j].x;
|
||||
double dy = vertices[i].y - possibleVertices[j].y;
|
||||
double r = sqrt(dx*dx + dy*dy);
|
||||
|
||||
bestR = std::min(bestR, r);
|
||||
}
|
||||
|
||||
CV_Assert(bestR < ACCURACY);
|
||||
}
|
||||
vector<Point2f> targetVertices(3);
|
||||
targetVertices[0] = Point2f(1.0f, 0.0f);
|
||||
targetVertices[1] = Point2f(1.0f, 1.0f);
|
||||
targetVertices[2] = Point2f(0.0f, 1.0f);
|
||||
compare(vertices, targetVertices);
|
||||
}
|
||||
|
||||
void CV_RotatedRectangleIntersectionTest::test4()
|
||||
{
|
||||
// full intersection, rectangles of same size directly on top of each other
|
||||
|
||||
RotatedRect rect1, rect2;
|
||||
|
||||
rect1.center.x = 0;
|
||||
rect1.center.y = 0;
|
||||
rect1.size.width = 2;
|
||||
rect1.size.height = 2;
|
||||
rect1.angle = 0;
|
||||
|
||||
rect2.center.x = 0;
|
||||
rect2.center.y = 0;
|
||||
rect2.size.width = 2;
|
||||
rect2.size.height = 2;
|
||||
rect2.angle = 0;
|
||||
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
|
||||
RotatedRect rect2(Point2f(0, 0), Size2f(2, 2), 0.0f);
|
||||
|
||||
vector<Point2f> vertices;
|
||||
|
||||
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
|
||||
|
||||
CV_Assert(ret == INTERSECT_FULL);
|
||||
CV_Assert(vertices.size() == 4);
|
||||
|
||||
vector<Point2f> possibleVertices(4);
|
||||
|
||||
possibleVertices[0] = Point2f(-1.0f, 1.0f);
|
||||
possibleVertices[1] = Point2f(1.0f, -1.0f);
|
||||
possibleVertices[2] = Point2f(-1.0f, -1.0f);
|
||||
possibleVertices[3] = Point2f(1.0f, 1.0f);
|
||||
|
||||
for( size_t i = 0; i < vertices.size(); i++ )
|
||||
{
|
||||
double bestR = DBL_MAX;
|
||||
|
||||
for( size_t j = 0; j < possibleVertices.size(); j++ )
|
||||
{
|
||||
double dx = vertices[i].x - possibleVertices[j].x;
|
||||
double dy = vertices[i].y - possibleVertices[j].y;
|
||||
double r = sqrt(dx*dx + dy*dy);
|
||||
|
||||
bestR = std::min(bestR, r);
|
||||
}
|
||||
|
||||
CV_Assert(bestR < ACCURACY);
|
||||
}
|
||||
vector<Point2f> targetVertices(4);
|
||||
targetVertices[0] = Point2f(-1.0f, 1.0f);
|
||||
targetVertices[1] = Point2f(-1.0f, -1.0f);
|
||||
targetVertices[2] = Point2f(1.0f, -1.0f);
|
||||
targetVertices[3] = Point2f(1.0f, 1.0f);
|
||||
compare(vertices, targetVertices);
|
||||
}
|
||||
|
||||
void CV_RotatedRectangleIntersectionTest::test5()
|
||||
{
|
||||
// partial intersection, rectangle on top rotated 45 degrees
|
||||
|
||||
RotatedRect rect1, rect2;
|
||||
|
||||
rect1.center.x = 0;
|
||||
rect1.center.y = 0;
|
||||
rect1.size.width = 2;
|
||||
rect1.size.height = 2;
|
||||
rect1.angle = 0;
|
||||
|
||||
rect2.center.x = 0;
|
||||
rect2.center.y = 0;
|
||||
rect2.size.width = 2;
|
||||
rect2.size.height = 2;
|
||||
rect2.angle = 45.0f;
|
||||
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
|
||||
RotatedRect rect2(Point2f(0, 0), Size2f(2, 2), 45.0f);
|
||||
|
||||
vector<Point2f> vertices;
|
||||
|
||||
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
|
||||
|
||||
CV_Assert(ret == INTERSECT_PARTIAL);
|
||||
CV_Assert(vertices.size() == 8);
|
||||
|
||||
vector<Point2f> possibleVertices(8);
|
||||
|
||||
possibleVertices[0] = Point2f(-1.0f, -0.414214f);
|
||||
possibleVertices[1] = Point2f(-1.0f, 0.414214f);
|
||||
possibleVertices[2] = Point2f(-0.414214f, -1.0f);
|
||||
possibleVertices[3] = Point2f(0.414214f, -1.0f);
|
||||
possibleVertices[4] = Point2f(1.0f, -0.414214f);
|
||||
possibleVertices[5] = Point2f(1.0f, 0.414214f);
|
||||
possibleVertices[6] = Point2f(0.414214f, 1.0f);
|
||||
possibleVertices[7] = Point2f(-0.414214f, 1.0f);
|
||||
|
||||
for( size_t i = 0; i < vertices.size(); i++ )
|
||||
{
|
||||
double bestR = DBL_MAX;
|
||||
|
||||
for( size_t j = 0; j < possibleVertices.size(); j++ )
|
||||
{
|
||||
double dx = vertices[i].x - possibleVertices[j].x;
|
||||
double dy = vertices[i].y - possibleVertices[j].y;
|
||||
double r = sqrt(dx*dx + dy*dy);
|
||||
|
||||
bestR = std::min(bestR, r);
|
||||
}
|
||||
|
||||
CV_Assert(bestR < ACCURACY);
|
||||
}
|
||||
vector<Point2f> targetVertices(8);
|
||||
targetVertices[0] = Point2f(-1.0f, -0.414214f);
|
||||
targetVertices[1] = Point2f(-0.414214f, -1.0f);
|
||||
targetVertices[2] = Point2f(0.414214f, -1.0f);
|
||||
targetVertices[3] = Point2f(1.0f, -0.414214f);
|
||||
targetVertices[4] = Point2f(1.0f, 0.414214f);
|
||||
targetVertices[5] = Point2f(0.414214f, 1.0f);
|
||||
targetVertices[6] = Point2f(-0.414214f, 1.0f);
|
||||
targetVertices[7] = Point2f(-1.0f, 0.414214f);
|
||||
compare(vertices, targetVertices);
|
||||
}
|
||||
|
||||
void CV_RotatedRectangleIntersectionTest::test6()
|
||||
{
|
||||
// 6 - partial intersection, rectangle on top of different size
|
||||
|
||||
RotatedRect rect1, rect2;
|
||||
|
||||
rect1.center.x = 0;
|
||||
rect1.center.y = 0;
|
||||
rect1.size.width = 2;
|
||||
rect1.size.height = 2;
|
||||
rect1.angle = 0;
|
||||
|
||||
rect2.center.x = 0;
|
||||
rect2.center.y = 0;
|
||||
rect2.size.width = 2;
|
||||
rect2.size.height = 10;
|
||||
rect2.angle = 0;
|
||||
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
|
||||
RotatedRect rect2(Point2f(0, 0), Size2f(2, 10), 0.0f);
|
||||
|
||||
vector<Point2f> vertices;
|
||||
|
||||
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
|
||||
|
||||
CV_Assert(ret == INTERSECT_PARTIAL);
|
||||
CV_Assert(vertices.size() == 4);
|
||||
|
||||
vector<Point2f> possibleVertices(4);
|
||||
|
||||
possibleVertices[0] = Point2f(1.0f, 1.0f);
|
||||
possibleVertices[1] = Point2f(1.0f, -1.0f);
|
||||
possibleVertices[2] = Point2f(-1.0f, -1.0f);
|
||||
possibleVertices[3] = Point2f(-1.0f, 1.0f);
|
||||
|
||||
for( size_t i = 0; i < vertices.size(); i++ )
|
||||
{
|
||||
double bestR = DBL_MAX;
|
||||
|
||||
for( size_t j = 0; j < possibleVertices.size(); j++ )
|
||||
{
|
||||
double dx = vertices[i].x - possibleVertices[j].x;
|
||||
double dy = vertices[i].y - possibleVertices[j].y;
|
||||
double r = sqrt(dx*dx + dy*dy);
|
||||
|
||||
bestR = std::min(bestR, r);
|
||||
}
|
||||
|
||||
CV_Assert(bestR < ACCURACY);
|
||||
}
|
||||
vector<Point2f> targetVertices(4);
|
||||
targetVertices[0] = Point2f(-1.0f, -1.0f);
|
||||
targetVertices[1] = Point2f(1.0f, -1.0f);
|
||||
targetVertices[2] = Point2f(1.0f, 1.0f);
|
||||
targetVertices[3] = Point2f(-1.0f, 1.0f);
|
||||
compare(vertices, targetVertices);
|
||||
}
|
||||
|
||||
void CV_RotatedRectangleIntersectionTest::test7()
|
||||
{
|
||||
// full intersection, rectangle fully enclosed in the other
|
||||
|
||||
RotatedRect rect1, rect2;
|
||||
|
||||
rect1.center.x = 0;
|
||||
rect1.center.y = 0;
|
||||
rect1.size.width = 12.34f;
|
||||
rect1.size.height = 56.78f;
|
||||
rect1.angle = 0;
|
||||
|
||||
rect2.center.x = 0;
|
||||
rect2.center.y = 0;
|
||||
rect2.size.width = 2;
|
||||
rect2.size.height = 2;
|
||||
rect2.angle = 0;
|
||||
RotatedRect rect1(Point2f(0, 0), Size2f(12.34f, 56.78f), 0.0f);
|
||||
RotatedRect rect2(Point2f(0, 0), Size2f(2, 2), 0.0f);
|
||||
|
||||
vector<Point2f> vertices;
|
||||
|
||||
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
|
||||
|
||||
CV_Assert(ret == INTERSECT_FULL);
|
||||
CV_Assert(vertices.size() == 4);
|
||||
|
||||
vector<Point2f> possibleVertices(4);
|
||||
|
||||
possibleVertices[0] = Point2f(1.0f, 1.0f);
|
||||
possibleVertices[1] = Point2f(1.0f, -1.0f);
|
||||
possibleVertices[2] = Point2f(-1.0f, -1.0f);
|
||||
possibleVertices[3] = Point2f(-1.0f, 1.0f);
|
||||
|
||||
for( size_t i = 0; i < vertices.size(); i++ )
|
||||
{
|
||||
double bestR = DBL_MAX;
|
||||
|
||||
for( size_t j = 0; j < possibleVertices.size(); j++ )
|
||||
{
|
||||
double dx = vertices[i].x - possibleVertices[j].x;
|
||||
double dy = vertices[i].y - possibleVertices[j].y;
|
||||
double r = sqrt(dx*dx + dy*dy);
|
||||
|
||||
bestR = std::min(bestR, r);
|
||||
}
|
||||
|
||||
CV_Assert(bestR < ACCURACY);
|
||||
}
|
||||
vector<Point2f> targetVertices(4);
|
||||
targetVertices[0] = Point2f(-1.0f, 1.0f);
|
||||
targetVertices[1] = Point2f(-1.0f, -1.0f);
|
||||
targetVertices[2] = Point2f(1.0f, -1.0f);
|
||||
targetVertices[3] = Point2f(1.0f, 1.0f);
|
||||
compare(vertices, targetVertices);
|
||||
}
|
||||
|
||||
void CV_RotatedRectangleIntersectionTest::test8()
|
||||
{
|
||||
// full intersection, rectangle fully enclosed in the other
|
||||
|
||||
RotatedRect rect1, rect2;
|
||||
|
||||
rect1.center.x = 0;
|
||||
rect1.center.y = 0;
|
||||
rect1.size.width = 2;
|
||||
rect1.size.height = 2;
|
||||
rect1.angle = 0;
|
||||
|
||||
rect2.center.x = 2;
|
||||
rect2.center.y = 2;
|
||||
rect2.size.width = 2;
|
||||
rect2.size.height = 2;
|
||||
rect2.angle = 0;
|
||||
// intersection by a single vertex
|
||||
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
|
||||
RotatedRect rect2(Point2f(2, 2), Size2f(2, 2), 0.0f);
|
||||
|
||||
vector<Point2f> vertices;
|
||||
|
||||
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
|
||||
|
||||
CV_Assert(ret == INTERSECT_PARTIAL);
|
||||
CV_Assert(vertices.size() == 1);
|
||||
|
||||
double dx = vertices[0].x - 1;
|
||||
double dy = vertices[0].y - 1;
|
||||
double r = sqrt(dx*dx + dy*dy);
|
||||
|
||||
CV_Assert(r < ACCURACY);
|
||||
compare(vertices, vector<Point2f>(1, Point2f(1.0f, 1.0f)));
|
||||
}
|
||||
|
||||
void CV_RotatedRectangleIntersectionTest::test9()
|
||||
{
|
||||
// full intersection, rectangle fully enclosed in the other
|
||||
|
||||
RotatedRect rect1, rect2;
|
||||
|
||||
rect1.center.x = 0;
|
||||
rect1.center.y = 0;
|
||||
rect1.size.width = 2;
|
||||
rect1.size.height = 2;
|
||||
rect1.angle = 0;
|
||||
|
||||
rect2.center.x = 2;
|
||||
rect2.center.y = 0;
|
||||
rect2.size.width = 2;
|
||||
rect2.size.height = 123.45f;
|
||||
rect2.angle = 0;
|
||||
RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
|
||||
RotatedRect rect2(Point2f(2, 0), Size2f(2, 123.45f), 0.0f);
|
||||
|
||||
vector<Point2f> vertices;
|
||||
|
||||
int ret = rotatedRectangleIntersection(rect1, rect2, vertices);
|
||||
|
||||
CV_Assert(ret == INTERSECT_PARTIAL);
|
||||
    CV_Assert(vertices.size() == 2);

    vector<Point2f> possibleVertices(2);
    possibleVertices[0] = Point2f(1.0f, 1.0f);
    possibleVertices[1] = Point2f(1.0f, -1.0f);
    for( size_t i = 0; i < vertices.size(); i++ )
    {
        double bestR = DBL_MAX;
        for( size_t j = 0; j < possibleVertices.size(); j++ )
        {
            double dx = vertices[i].x - possibleVertices[j].x;
            double dy = vertices[i].y - possibleVertices[j].y;
            double r = sqrt(dx*dx + dy*dy);
            bestR = std::min(bestR, r);
        }
        CV_Assert(bestR < ACCURACY);
    }
    vector<Point2f> targetVertices(2);
    targetVertices[0] = Point2f(1.0f, -1.0f);
    targetVertices[1] = Point2f(1.0f, 1.0f);
    compare(vertices, targetVertices);
}

void CV_RotatedRectangleIntersectionTest::test10()
{
    // three points of rect2 are inside rect1.
    RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
    RotatedRect rect2(Point2f(0, 0.5), Size2f(1, 1), 45.0f);

    vector<Point2f> vertices;
    int ret = rotatedRectangleIntersection(rect1, rect2, vertices);

    CV_Assert(ret == INTERSECT_PARTIAL);

    vector<Point2f> targetVertices(5);
    targetVertices[0] = Point2f(0.207107f, 1.0f);
    targetVertices[1] = Point2f(-0.207107f, 1.0f);
    targetVertices[2] = Point2f(-0.707107f, 0.5f);
    targetVertices[3] = Point2f(0.0f, -0.207107f);
    targetVertices[4] = Point2f(0.707107f, 0.5f);
    compare(vertices, targetVertices);
}

void CV_RotatedRectangleIntersectionTest::test11()
{
    RotatedRect rect1(Point2f(0, 0), Size2f(4, 2), 0.0f);
    RotatedRect rect2(Point2f(0, 0), Size2f(2, 2), -45.0f);

    vector<Point2f> vertices;
    int ret = rotatedRectangleIntersection(rect1, rect2, vertices);

    CV_Assert(ret == INTERSECT_PARTIAL);

    vector<Point2f> targetVertices(6);
    targetVertices[0] = Point2f(-0.414214f, -1.0f);
    targetVertices[1] = Point2f(0.414213f, -1.0f);
    targetVertices[2] = Point2f(1.41421f, 0.0f);
    targetVertices[3] = Point2f(0.414214f, 1.0f);
    targetVertices[4] = Point2f(-0.414213f, 1.0f);
    targetVertices[5] = Point2f(-1.41421f, 0.0f);
    compare(vertices, targetVertices);
}

void CV_RotatedRectangleIntersectionTest::test12()
{
    RotatedRect rect1(Point2f(0, 0), Size2f(2, 2), 0.0f);
    RotatedRect rect2(Point2f(0, 1), Size2f(1, 1), 0.0f);

    vector<Point2f> vertices;
    int ret = rotatedRectangleIntersection(rect1, rect2, vertices);

    CV_Assert(ret == INTERSECT_PARTIAL);

    vector<Point2f> targetVertices(4);
    targetVertices[0] = Point2f(-0.5f, 1.0f);
    targetVertices[1] = Point2f(-0.5f, 0.5f);
    targetVertices[2] = Point2f(0.5f, 0.5f);
    targetVertices[3] = Point2f(0.5f, 1.0f);
    compare(vertices, targetVertices);
}

void CV_RotatedRectangleIntersectionTest::test13()
{
    RotatedRect rect1(Point2f(0, 0), Size2f(1, 3), 0.0f);
    RotatedRect rect2(Point2f(0, 1), Size2f(3, 1), 0.0f);

    vector<Point2f> vertices;
    int ret = rotatedRectangleIntersection(rect1, rect2, vertices);

    CV_Assert(ret == INTERSECT_PARTIAL);

    vector<Point2f> targetVertices(4);
    targetVertices[0] = Point2f(-0.5f, 0.5f);
    targetVertices[1] = Point2f(0.5f, 0.5f);
    targetVertices[2] = Point2f(0.5f, 1.5f);
    targetVertices[3] = Point2f(-0.5f, 1.5f);
    compare(vertices, targetVertices);
}

void CV_RotatedRectangleIntersectionTest::test14()
{
    const int kNumTests = 100;
    const int kWidth = 5;
    const int kHeight = 5;
    RotatedRect rects[2];
    std::vector<Point2f> inter;
    for (int i = 0; i < kNumTests; ++i)
    {
        for (int j = 0; j < 2; ++j)
        {
            rects[j].center = Point2f((float)(rand() % kWidth), (float)(rand() % kHeight));
            rects[j].size = Size2f(rand() % kWidth + 1.0f, rand() % kHeight + 1.0f);
            rects[j].angle = (float)(rand() % 360);
        }
        rotatedRectangleIntersection(rects[0], rects[1], inter);
        ASSERT_TRUE(inter.size() < 4 || isContourConvex(inter));
    }
}
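The rewritten tests above delegate the vertex check to a shared compare() helper instead of repeating the nearest-vertex loop in every test. A minimal sketch of such a helper, assuming the shape below (the real one is defined earlier in test_intersection.cpp and may additionally check convexity or vertex order; ACCURACY is the file's existing tolerance constant):

    static void compare(const std::vector<Point2f>& test, const std::vector<Point2f>& target)
    {
        ASSERT_EQ(test.size(), target.size());
        for (size_t i = 0; i < test.size(); i++)
        {
            // distance from each computed vertex to its closest expected vertex
            double bestR = DBL_MAX;
            for (size_t j = 0; j < target.size(); j++)
            {
                double dx = test[i].x - target[j].x;
                double dy = test[i].y - target[j].y;
                bestR = std::min(bestR, sqrt(dx*dx + dy*dy));
            }
            ASSERT_LT(bestR, ACCURACY);
        }
    }

This mirrors the removed inline loop, so the helper keeps the old tolerance semantics while each test reduces to expected data plus one call.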
@ -420,4 +420,18 @@ void CV_ThreshTest::prepare_to_validation( int /*test_case_idx*/ )
TEST(Imgproc_Threshold, accuracy) { CV_ThreshTest test; test.safe_run(); }

BIGDATA_TEST(Imgproc_Threshold, huge)
{
    Mat m(65000, 40000, CV_8U);
    ASSERT_FALSE(m.isContinuous());

    uint64 i, n = (uint64)m.rows*m.cols;
    for( i = 0; i < n; i++ )
        m.data[i] = (uchar)(i & 255);

    cv::threshold(m, m, 127, 255, cv::THRESH_BINARY);
    int nz = cv::countNonZero(m);  // FIXIT 'int' is not enough here (overflow is possible with other inputs)
    ASSERT_EQ((uint64)nz, n / 2);
}

}} // namespace
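Two details make this test correct as written: the buffer is 65000 × 40000 = 2,600,000,000 bytes (about 2.42 GiB), which is why it uses BIGDATA_TEST and is only meaningful on 64-bit targets; and the fill pattern (i & 255) cycles through 0..255, of which exactly the 128 values above the threshold 127 survive THRESH_BINARY, so with n divisible by 256 the non-zero count is exactly n / 2.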
@ -251,13 +251,15 @@ void Cloning::initVariables(const Mat &destination, const Mat &binaryMask)
    //init of the filters used in the dst
    const int w = destination.cols;
    filter_X.resize(w - 2);
    double scale = CV_PI / (w - 1);
    for(int i = 0 ; i < w-2 ; ++i)
        filter_X[i] = 2.0f * std::cos(static_cast<float>(CV_PI) * (i + 1) / (w - 1));
        filter_X[i] = 2.0f * (float)std::cos(scale * (i + 1));

    const int h = destination.rows;
    filter_Y.resize(h - 2);
    scale = CV_PI / (h - 1);
    for(int j = 0 ; j < h - 2 ; ++j)
        filter_Y[j] = 2.0f * std::cos(static_cast<float>(CV_PI) * (j + 1) / (h - 1));
        filter_Y[j] = 2.0f * (float)std::cos(scale * (j + 1));
}

void Cloning::computeDerivatives(const Mat& destination, const Mat &patch, const Mat &binaryMask)
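The refactored loops hoist the loop-invariant scale = CV_PI / (w - 1) and evaluate cos() in double before a single cast to float, instead of folding a float-truncated CV_PI into every iteration. A small standalone sketch (illustrative, not part of the patch; w = 640 is an arbitrary example size) showing the two forms agree to within float rounding:

    #include <opencv2/core.hpp> // CV_PI
    #include <cmath>
    #include <cstdio>

    int main()
    {
        const int w = 640;
        const double scale = CV_PI / (w - 1);
        for (int i = 0; i < w - 2; ++i)
        {
            // old form: float-precision CV_PI, multiply/divide every iteration
            float oldVal = 2.0f * std::cos(static_cast<float>(CV_PI) * (i + 1) / (w - 1));
            // new form: double-precision cos of a precomputed scale
            float newVal = 2.0f * (float)std::cos(scale * (i + 1));
            if (std::fabs(oldVal - newVal) > 1e-5f)
                std::printf("mismatch at %d: %g vs %g\n", i, oldVal, newVal);
        }
        return 0;
    }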
@ -53,7 +53,7 @@ namespace opencv_test { namespace {
#define SAVE(x)
#endif

static const double numerical_precision = 1000.;
static const double numerical_precision = 0.05; // 95% of pixels should have exact values

TEST(Photo_SeamlessClone_normal, regression)
{
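The old check compared the total L1 error of the whole image against a flat 1000, which scales badly with image size; the new pair of checks bounds the worst single deviation (NORM_INF ≤ 1) and the average error (NORM_L1 ≤ total() * 0.05, i.e. 0.05 per pixel, matching the "95% of pixels exact" comment). A toy illustration of the two norms (hypothetical example, not part of the test suite):

    #include <opencv2/core.hpp>
    #include <iostream>

    int main()
    {
        cv::Mat reference = cv::Mat::zeros(10, 10, CV_8UC3);
        cv::Mat result = reference.clone();
        result.at<cv::Vec3b>(0, 0)[0] = 1; // one channel of one pixel off by 1

        double errorINF = cv::norm(reference, result, cv::NORM_INF); // == 1
        double errorL1  = cv::norm(reference, result, cv::NORM_L1);  // == 1
        // L1 budget: 100 pixels * 0.05 = 5, so this result passes both checks
        std::cout << errorINF << " " << errorL1 << " "
                  << (errorL1 <= reference.total() * 0.05) << std::endl;
        return 0;
    }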
@ -82,8 +82,10 @@ TEST(Photo_SeamlessClone_normal, regression)
    SAVE(result);

    double error = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(error, numerical_precision);
    double errorINF = cvtest::norm(reference, result, NORM_INF);
    EXPECT_LE(errorINF, 1);
    double errorL1 = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size();
}

TEST(Photo_SeamlessClone_mixed, regression)
@ -113,9 +115,10 @@ TEST(Photo_SeamlessClone_mixed, regression)
    Mat reference = imread(reference_path);
    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;

    double error = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(error, numerical_precision);
    double errorINF = cvtest::norm(reference, result, NORM_INF);
    EXPECT_LE(errorINF, 1);
    double errorL1 = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size();
}

TEST(Photo_SeamlessClone_featureExchange, regression)
@ -145,9 +148,10 @@ TEST(Photo_SeamlessClone_featureExchange, regression)
    Mat reference = imread(reference_path);
    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;

    double error = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(error, numerical_precision);
    double errorINF = cvtest::norm(reference, result, NORM_INF);
    EXPECT_LE(errorINF, 1);
    double errorL1 = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size();
}

TEST(Photo_SeamlessClone_colorChange, regression)
@ -171,9 +175,10 @@ TEST(Photo_SeamlessClone_colorChange, regression)
    Mat reference = imread(reference_path);
    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;

    double error = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(error, numerical_precision);
    double errorINF = cvtest::norm(reference, result, NORM_INF);
    EXPECT_LE(errorINF, 1);
    double errorL1 = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size();
}

TEST(Photo_SeamlessClone_illuminationChange, regression)
@ -195,9 +200,12 @@ TEST(Photo_SeamlessClone_illuminationChange, regression)
    SAVE(result);

    Mat reference = imread(reference_path);
    double error = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(error, numerical_precision);
    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;

    double errorINF = cvtest::norm(reference, result, NORM_INF);
    EXPECT_LE(errorINF, 1);
    double errorL1 = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size();
}

TEST(Photo_SeamlessClone_textureFlattening, regression)
@ -221,9 +229,10 @@ TEST(Photo_SeamlessClone_textureFlattening, regression)
    Mat reference = imread(reference_path);
    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;

    double error = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(error, numerical_precision);
    double errorINF = cvtest::norm(reference, result, NORM_INF);
    EXPECT_LE(errorINF, 1);
    double errorL1 = cvtest::norm(reference, result, NORM_L1);
    EXPECT_LE(errorL1, reference.total() * numerical_precision) << "size=" << reference.size();
}

}} // namespace
@ -661,7 +661,7 @@ void MultiBandBlender::blend(InputOutputArray dst, InputOutputArray dst_mask)
    }

    // Set destination Mats to 0 so new image can be blended
    for (size_t i = 0; i < num_bands_ + 1; ++i)
    for (size_t i = 0; i < (size_t)(num_bands_ + 1); ++i)
    {
        gpu_dst_band_weights_[i].setTo(0);
        gpu_dst_pyr_laplace_[i].setTo(Scalar::all(0));
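The only change here is the (size_t) cast on the loop bound: num_bands_ is a signed int (an assumption based on the surrounding blender code), so comparing the size_t counter i against num_bands_ + 1 mixes signed and unsigned operands and trips sign-compare warnings; casting the bound keeps the loop's behaviour while making the comparison uniformly unsigned.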
@ -11,6 +11,7 @@
namespace cvtest {
void checkIppStatus();
extern bool skipUnstableTests;
extern bool runBigDataTests;
extern int testThreads;
}
@ -43,7 +44,7 @@ extern int testThreads;
#undef TEST
#define TEST(test_case_name, test_name) \
#define TEST_(test_case_name, test_name, BODY_IMPL) \
class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public ::testing::Test {\
 public:\
  GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\
@ -65,9 +66,37 @@ extern int testThreads;
          ::testing::Test::TearDownTestCase, \
          new ::testing::internal::TestFactoryImpl<\
              GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\
void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() CV__TEST_BODY_IMPL( #test_case_name "_" #test_name ) \
void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() BODY_IMPL( #test_case_name "_" #test_name ) \
void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::Body()

#define TEST(test_case_name, test_name) TEST_(test_case_name, test_name, CV__TEST_BODY_IMPL)

#define CV__TEST_BIGDATA_BODY_IMPL(name) \
{ \
    if (!cvtest::runBigDataTests) \
    { \
        printf("[     SKIP ] BigData tests are disabled\n"); \
        return; \
    } \
    CV__TRACE_APP_FUNCTION_NAME(name); \
    try { \
        CV__TEST_INIT \
        Body(); \
        CV__TEST_CLEANUP \
    } \
    catch (cvtest::SkipTestException& e) \
    { \
        printf("[     SKIP ] %s\n", e.what()); \
    } \
} \

// Special type of tests which require / use or validate processing of huge amount of data (>= 2Gb)
#if defined(_M_X64) || defined(__x86_64__) || defined(__aarch64__)
#define BIGDATA_TEST(test_case_name, test_name) TEST_(BigData_ ## test_case_name, test_name, CV__TEST_BIGDATA_BODY_IMPL)
#else
#define BIGDATA_TEST(test_case_name, test_name) TEST_(BigData_ ## test_case_name, DISABLED_ ## test_name, CV__TEST_BIGDATA_BODY_IMPL)
#endif

#undef TEST_F
#define TEST_F(test_fixture, test_name)\
class GTEST_TEST_CLASS_NAME_(test_fixture, test_name) : public test_fixture {\
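With these macros in place, a data-heavy test opts in by writing BIGDATA_TEST instead of TEST. A hypothetical usage sketch (assumes the usual OpenCV test module context, i.e. test_precomp.hpp with gtest available):

    // Compiled like a normal test, but CV__TEST_BIGDATA_BODY_IMPL returns early
    // unless BigData tests were enabled on the command line; on non-64-bit
    // targets the macro registers it under a DISABLED_ name instead.
    BIGDATA_TEST(Core_Example, huge_alloc)
    {
        cv::Mat m(65000, 40000, CV_8U); // ~2.4 GiB, needs a 64-bit process
        ASSERT_EQ(65000, m.rows);
    }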
@ -699,6 +699,7 @@ void checkIppStatus()
}

bool skipUnstableTests = false;
bool runBigDataTests = false;
int testThreads = 0;

void parseCustomOptions(int argc, char **argv)
@ -708,6 +709,7 @@ void parseCustomOptions(int argc, char **argv)
        "{ test_seed     |809564     |seed for random numbers generator }"
        "{ test_threads  |-1         |the number of worker threads, if parallel execution is enabled}"
        "{ skip_unstable |false      |skip unstable tests }"
        "{ test_bigdata  |false      |run BigData tests (>=2Gb) }"
        "{ h   help      |false      |print help info }";

    cv::CommandLineParser parser(argc, argv, command_line_keys);
@ -730,6 +732,7 @@ void parseCustomOptions(int argc, char **argv)
    testThreads = parser.get<int>("test_threads");

    skipUnstableTests = parser.get<bool>("skip_unstable");
    runBigDataTests = parser.get<bool>("test_bigdata");
}
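Together with the parsing above, BigData tests stay skipped by default and are enabled per run via the new key, e.g. opencv_test_imgproc --test_bigdata=true (the binary name is illustrative; every module's test executable shares these options).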