Merge remote-tracking branch 'upstream/3.4' into merge-3.4

2025-07-29 08:46:45 +08:00 · 2018-09-04 19:35:38 +03:00 · 2018-09-04 19:35:38 +03:00 · d74b98c3d9
commit d74b98c3d9
parent e0bfe44cac 351ee2e3a5
117 changed files with 6913 additions and 3713 deletions
--- a/cmake/OpenCVFindIPPAsync.cmake
+++ b/cmake/OpenCVFindIPPAsync.cmake
@ -1,45 +0,0 @@
-# Main variables:
-# IPP_A_LIBRARIES and IPP_A_INCLUDE to use IPP Async
-# HAVE_IPP_A for conditional compilation OpenCV with/without IPP Async
-
-# IPP_ASYNC_ROOT - root of IPP Async installation
-
-if(X86_64)
-    find_path(
-    IPP_A_INCLUDE_DIR
-    NAMES ipp_async_defs.h
-    PATHS $ENV{IPP_ASYNC_ROOT}
-    PATH_SUFFIXES include
-    DOC "Path to Intel IPP Async interface headers")
-
-    find_file(
-    IPP_A_LIBRARIES
-    NAMES ipp_async_preview.lib
-    PATHS $ENV{IPP_ASYNC_ROOT}
-    PATH_SUFFIXES lib/intel64
-    DOC "Path to Intel IPP Async interface libraries")
-
-else()
-    find_path(
-    IPP_A_INCLUDE_DIR
-    NAMES ipp_async_defs.h
-    PATHS $ENV{IPP_ASYNC_ROOT}
-    PATH_SUFFIXES include
-    DOC "Path to Intel IPP Async interface headers")
-
-    find_file(
-    IPP_A_LIBRARIES
-    NAMES ipp_async_preview.lib
-    PATHS $ENV{IPP_ASYNC_ROOT}
-    PATH_SUFFIXES lib/ia32
-    DOC "Path to Intel IPP Async interface libraries")
-endif()
-
-if(IPP_A_INCLUDE_DIR AND IPP_A_LIBRARIES)
-    set(HAVE_IPP_A TRUE)
-else()
-    set(HAVE_IPP_A FALSE)
-    message(WARNING "Intel IPP Async library directory (set by IPP_A_LIBRARIES_DIR variable) is not found or does not have Intel IPP Async libraries.")
-endif()
-
-mark_as_advanced(FORCE IPP_A_LIBRARIES IPP_A_INCLUDE_DIR)
--- a/cmake/OpenCVFindLibsPerf.cmake
+++ b/cmake/OpenCVFindLibsPerf.cmake
@ -35,17 +35,6 @@ if(WITH_IPP)
  endif()
 endif()

-# --- IPP Async ---
-
-if(WITH_IPP_A)
-  include("${OpenCV_SOURCE_DIR}/cmake/OpenCVFindIPPAsync.cmake")
-  if(IPP_A_INCLUDE_DIR AND IPP_A_LIBRARIES)
-    ocv_include_directories(${IPP_A_INCLUDE_DIR})
-    link_directories(${IPP_A_LIBRARIES})
-    set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${IPP_A_LIBRARIES})
-   endif()
-endif(WITH_IPP_A)
-
 # --- CUDA ---
 if(WITH_CUDA)
  include("${OpenCV_SOURCE_DIR}/cmake/OpenCVDetectCUDA.cmake")
--- a/cmake/templates/cvconfig.h.in
+++ b/cmake/templates/cvconfig.h.in
@ -103,9 +103,6 @@
 #cmakedefine HAVE_IPP_ICV
 #cmakedefine HAVE_IPP_IW

-/* Intel IPP Async */
-#cmakedefine HAVE_IPP_A
-
 /* JPEG-2000 codec */
 #cmakedefine HAVE_JASPER

--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@ -227,7 +227,6 @@ SEARCH_INCLUDES        = YES
 INCLUDE_PATH           =
 INCLUDE_FILE_PATTERNS  =
 PREDEFINED             = __cplusplus=1 \
-                         HAVE_IPP_A=1 \
                         CVAPI(x)=x \
                         CV_DOXYGEN= \
                         CV_EXPORTS= \
--- a/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.markdown
+++ b/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.markdown
@ -1,7 +1,7 @@
 How to use the OpenCV parallel_for_ to parallelize your code {#tutorial_how_to_use_OpenCV_parallel_for_}
 ==================================================================

-@prev_tutorial{tutorial_how_to_use_ippa_conversion}
+@prev_tutorial{tutorial_interoperability_with_OpenCV_1}

 Goal
 ----
--- a/doc/tutorials/core/how_to_use_ippa_conversion/how_to_use_ippa_conversion.markdown
+++ b/doc/tutorials/core/how_to_use_ippa_conversion/how_to_use_ippa_conversion.markdown
@ -1,146 +0,0 @@
-Intel® IPP Asynchronous C/C++ library in OpenCV {#tutorial_how_to_use_ippa_conversion}
-===============================================
-
-@prev_tutorial{tutorial_interoperability_with_OpenCV_1}
-@next_tutorial{tutorial_how_to_use_OpenCV_parallel_for_}
-
-Goal
----
-
-The tutorial demonstrates the [Intel® IPP Asynchronous
-C/C++](http://software.intel.com/en-us/intel-ipp-preview) library usage with OpenCV. The code
-example below illustrates implementation of the Sobel operation, accelerated with Intel® IPP
-Asynchronous C/C++ functions. In this code example, @ref cv::hpp::getMat and @ref cv::hpp::getHpp
-functions are used for data conversion between
-[hppiMatrix](http://software.intel.com/en-us/node/501660) and Mat matrices.
-
-Code
----
-
-You may also find the source code in the
-`samples/cpp/tutorial_code/core/ippasync/ippasync_sample.cpp` file of the OpenCV source library or
-download it from [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/core/ippasync/ippasync_sample.cpp).
-
-@include cpp/tutorial_code/core/ippasync/ippasync_sample.cpp
-
-Explanation
-----------
-
-#  Create parameters for OpenCV:
-    @code{.cpp}
-    VideoCapture cap;
-    Mat image, gray, result;
-    @endcode
-    and IPP Async:
-    @code{.cpp}
-    hppiMatrix* src,* dst;
-    hppAccel accel = 0;
-    hppAccelType accelType;
-    hppStatus sts;
-    hppiVirtualMatrix * virtMatrix;
-    @endcode
-#  Load input image or video. How to open and read video stream you can see in the
-    @ref tutorial_video_input_psnr_ssim tutorial.
-    @code{.cpp}
-    if( useCamera )
-    {
-       printf("used camera\n");
-       cap.open(0);
-    }
-    else
-    {
-       printf("used image %s\n", file.c_str());
-       cap.open(file.c_str());
-    }
-
-    if( !cap.isOpened() )
-    {
-       printf("can not open camera or video file\n");
-       return -1;
-    }
-    @endcode
-#  Create accelerator instance using
-    [hppCreateInstance](http://software.intel.com/en-us/node/501686):
-    @code{.cpp}
-    accelType = sAccel == "cpu" ? HPP_ACCEL_TYPE_CPU:
-                sAccel == "gpu" ? HPP_ACCEL_TYPE_GPU:
-                                  HPP_ACCEL_TYPE_ANY;
-
-    //Create accelerator instance
-    sts = hppCreateInstance(accelType, 0, &accel);
-    CHECK_STATUS(sts, "hppCreateInstance");
-    @endcode
-#  Create an array of virtual matrices using
-    [hppiCreateVirtualMatrices](http://software.intel.com/en-us/node/501700) function.
-    @code{.cpp}
-    virtMatrix = hppiCreateVirtualMatrices(accel, 1);
-    @endcode
-#  Prepare a matrix for input and output data:
-    @code{.cpp}
-    cap >> image;
-    if(image.empty())
-       break;
-
-    cvtColor( image, gray, COLOR_BGR2GRAY );
-
-    result.create( image.rows, image.cols, CV_8U);
-    @endcode
-#  Convert Mat to [hppiMatrix](http://software.intel.com/en-us/node/501660) using @ref cv::hpp::getHpp
-    and call [hppiSobel](http://software.intel.com/en-us/node/474701) function.
-    @code{.cpp}
-    //convert Mat to hppiMatrix
-    src = getHpp(gray, accel);
-    dst = getHpp(result, accel);
-
-    sts = hppiSobel(accel,src, HPP_MASK_SIZE_3X3,HPP_NORM_L1,virtMatrix[0]);
-    CHECK_STATUS(sts,"hppiSobel");
-
-    sts = hppiConvert(accel, virtMatrix[0], 0, HPP_RND_MODE_NEAR, dst, HPP_DATA_TYPE_8U);
-    CHECK_STATUS(sts,"hppiConvert");
-
-    // Wait for tasks to complete
-    sts = hppWait(accel, HPP_TIME_OUT_INFINITE);
-    CHECK_STATUS(sts, "hppWait");
-    @endcode
-    We use [hppiConvert](http://software.intel.com/en-us/node/501746) because
-    [hppiSobel](http://software.intel.com/en-us/node/474701) returns destination matrix with
-    HPP_DATA_TYPE_16S data type for source matrix with HPP_DATA_TYPE_8U type. You should check
-    hppStatus after each call IPP Async function.
-
-#  Create windows and show the images, the usual way.
-    @code{.cpp}
-    imshow("image", image);
-    imshow("rez", result);
-
-    waitKey(15);
-    @endcode
-#  Delete hpp matrices.
-    @code{.cpp}
-    sts =  hppiFreeMatrix(src);
-    CHECK_DEL_STATUS(sts,"hppiFreeMatrix");
-
-    sts =  hppiFreeMatrix(dst);
-    CHECK_DEL_STATUS(sts,"hppiFreeMatrix");
-    @endcode
-#  Delete virtual matrices and accelerator instance.
-    @code{.cpp}
-    if (virtMatrix)
-    {
-       sts = hppiDeleteVirtualMatrices(accel, virtMatrix);
-       CHECK_DEL_STATUS(sts,"hppiDeleteVirtualMatrices");
-    }
-
-    if (accel)
-    {
-       sts = hppDeleteInstance(accel);
-       CHECK_DEL_STATUS(sts, "hppDeleteInstance");
-    }
-    @endcode
-
-Result
------
-
-After compiling the code above we can execute it giving an image or video path and accelerator type
-as an argument. For this tutorial we use baboon.png image as input. The result is below.
-
-![](images/How_To_Use_IPPA_Result.jpg)
--- a/doc/tutorials/core/how_to_use_ippa_conversion/images/How_To_Use_IPPA_Result.jpg
+++ b/doc/tutorials/core/how_to_use_ippa_conversion/images/How_To_Use_IPPA_Result.jpg
--- a/doc/tutorials/core/images/How_To_Use_IPPA.jpg
+++ b/doc/tutorials/core/images/How_To_Use_IPPA.jpg
--- a/doc/tutorials/core/interoperability_with_OpenCV_1/interoperability_with_OpenCV_1.markdown
+++ b/doc/tutorials/core/interoperability_with_OpenCV_1/interoperability_with_OpenCV_1.markdown
@ -2,7 +2,7 @@ Interoperability with OpenCV 1 {#tutorial_interoperability_with_OpenCV_1}
 ==============================

@prev_tutorial{tutorial_file_input_output_with_xml_yml}
-@next_tutorial{tutorial_how_to_use_ippa_conversion}
+@next_tutorial{tutorial_how_to_use_OpenCV_parallel_for_}

 Goal
 ----
--- a/doc/tutorials/core/table_of_content_core.markdown
+++ b/doc/tutorials/core/table_of_content_core.markdown
@ -93,15 +93,6 @@ understanding how to manipulate the images on a pixel level.
    Look here to shed light on all this questions.


-   @subpage tutorial_how_to_use_ippa_conversion
-
-    *Compatibility:* \> OpenCV 2.0
-
-    *Author:* Elena Gvozdeva
-
-    You will see how to use the IPP Async with OpenCV.
-
-
 -   @subpage tutorial_how_to_use_OpenCV_parallel_for_

    *Compatibility:* \>= OpenCV 2.4.3
--- a/doc/tutorials/imgproc/motion_deblur_filter/images/black_car.jpg
+++ b/doc/tutorials/imgproc/motion_deblur_filter/images/black_car.jpg
--- a/doc/tutorials/imgproc/motion_deblur_filter/images/motion_original.jpg
+++ b/doc/tutorials/imgproc/motion_deblur_filter/images/motion_original.jpg
--- a/doc/tutorials/imgproc/motion_deblur_filter/images/motion_psf.png
+++ b/doc/tutorials/imgproc/motion_deblur_filter/images/motion_psf.png
--- a/doc/tutorials/imgproc/motion_deblur_filter/images/white_car.jpg
+++ b/doc/tutorials/imgproc/motion_deblur_filter/images/white_car.jpg
--- a/doc/tutorials/imgproc/motion_deblur_filter/motion_deblur_filter.markdown
+++ b/doc/tutorials/imgproc/motion_deblur_filter/motion_deblur_filter.markdown
@ -0,0 +1,72 @@
+Motion Deblur Filter {#tutorial_motion_deblur_filter}
+==========================
+
+Goal
+----
+
+In this tutorial you will learn:
+
+-   what the PSF of a motion blur image is
+-   how to restore a motion blur image
+
+Theory
+------
+
+For the degradation image model theory and the Wiener filter theory you can refer to the tutorial @ref tutorial_out_of_focus_deblur_filter "Out-of-focus Deblur Filter".
+On this page only a linear motion blur distortion is considered. The motion blur image on this page is a real world image. The blur was caused by a moving subject.
+
+### What is the PSF of a motion blur image?
+
+The point spread function (PSF) of a linear motion blur distortion is a line segment. Such a PSF is specified by two parameters: \f$LEN\f$ is the length of the blur and \f$THETA\f$ is the angle of motion.
+
+![Point spread function of a linear motion blur distortion](images/motion_psf.png)
+
+### How to restore a blurred image?
+
+On this page the Wiener filter is used as the restoration filter, for details you can refer to the tutorial @ref tutorial_out_of_focus_deblur_filter "Out-of-focus Deblur Filter".
+In order to synthesize the Wiener filter for a motion blur case, it needs to specify the signal-to-noise ratio (\f$SNR\f$), \f$LEN\f$ and \f$THETA\f$ of the PSF.
+
+Source code
+-----------
+
+You can find source code in the `samples/cpp/tutorial_code/ImgProc/motion_deblur_filter/motion_deblur_filter.cpp` of the OpenCV source code library.
+
+@include cpp/tutorial_code/ImgProc/motion_deblur_filter/motion_deblur_filter.cpp
+
+Explanation
+-----------
+
+A motion blur image recovering algorithm consists of PSF generation, Wiener filter generation and filtering a blurred image in a frequency domain:
+@snippet samples/cpp/tutorial_code/ImgProc/motion_deblur_filter/motion_deblur_filter.cpp main
+
+A function calcPSF() forms a PSF according to input parameters \f$LEN\f$ and \f$THETA\f$ (in degrees):
+@snippet samples/cpp/tutorial_code/ImgProc/motion_deblur_filter/motion_deblur_filter.cpp calcPSF
+
+A function edgetaper() tapers the input image’s edges in order to reduce the ringing effect in a restored image:
+@snippet samples/cpp/tutorial_code/ImgProc/motion_deblur_filter/motion_deblur_filter.cpp edgetaper
+
+The functions calcWnrFilter(), fftshift() and filter2DFreq() realize an image filtration by a specified PSF in the frequency domain.  The functions are copied from the tutorial
+@ref tutorial_out_of_focus_deblur_filter "Out-of-focus Deblur Filter".
+
+Result
+------
+
+Below you can see the real world image with motion blur distortion. The license plate is not readable on both cars. The red markers show the car’s license plate location.
+![Motion blur image. The license plates are not readable](images/motion_original.jpg)
+
+
+Below you can see the restoration result for the black car license plate. The result has been computed with \f$LEN\f$ = 125, \f$THETA\f$ = 0, \f$SNR\f$ = 700.
+![The restored image of the black car license plate](images/black_car.jpg)
+
+Below you can see the restoration result for the white car license plate. The result has been computed with \f$LEN\f$ = 78, \f$THETA\f$ = 15, \f$SNR\f$ = 300.
+![The restored image of the white car license plate](images/white_car.jpg)
+
+The values of \f$SNR\f$, \f$LEN\f$ and \f$THETA\f$ were selected manually to give the best possible visual result. The \f$THETA\f$ parameter coincides with the car’s moving direction, and the
+\f$LEN\f$ parameter depends on the car’s moving speed.
+The result is not perfect, but at least it gives us a hint of the image’s content. With some effort, the car license plate is now readable.
+
+@note The parameters \f$LEN\f$ and \f$THETA\f$ are the most important. You should adjust \f$LEN\f$ and \f$THETA\f$ first, then \f$SNR\f$.
+
+You can also find a quick video demonstration of a license plate recovering method
+[YouTube](https://youtu.be/xSrE0hdhb4o).
+@youtube{xSrE0hdhb4o}
--- a/doc/tutorials/imgproc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.markdown
+++ b/doc/tutorials/imgproc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.markdown
@ -8,54 +8,54 @@ Goal

 In this tutorial you will learn:

-   what is a degradation image model
-   what is PSF of out-of-focus image
+-   what a degradation image model is
+-   what the PSF of an out-of-focus image is
 -   how to restore a blurred image
-   what is Wiener filter
+-   what is a Wiener filter

 Theory
 ------

-@note The explanation is based on the books @cite gonzalez and @cite gruzman. Also, you can refer to Matlab's tutorial [Image Deblurring in Matlab] and an article [SmartDeblur].
-@note An out-of-focus image on this page is a real world  image. An out-of-focus was done manually by camera optics.
+@note The explanation is based on the books @cite gonzalez and @cite gruzman. Also, you can refer to Matlab's tutorial [Image Deblurring in Matlab] and the article [SmartDeblur].
+@note The out-of-focus image on this page is a real world  image. The out-of-focus was achieved manually by camera optics.

 ### What is a degradation image model?

-A mathematical model of the image degradation in frequency domain representation is:
+Here is a mathematical model of the image degradation in frequency domain representation:

 \f[S = H\cdot U + N\f]

 where
 \f$S\f$ is a spectrum of blurred (degraded) image,
 \f$U\f$ is a spectrum of original true (undegraded) image,
-\f$H\f$ is frequency response of point spread function (PSF),
+\f$H\f$ is a frequency response of point spread function (PSF),
 \f$N\f$ is a spectrum of additive noise.

-Circular PSF is a good approximation of out-of-focus distortion. Such PSF is specified by only one parameter - radius \f$R\f$. Circular PSF is used in this work.
+The circular PSF is a good approximation of out-of-focus distortion. Such a PSF is specified by only one parameter - radius \f$R\f$. Circular PSF is used in this work.

 ![Circular point spread function](psf.png)

-### How to restore an blurred image?
+### How to restore a blurred image?

-The objective of restoration (deblurring) is to obtain an estimate of the original image. Restoration formula in frequency domain is:
+The objective of restoration (deblurring) is to obtain an estimate of the original image. The restoration formula in frequency domain is:

 \f[U' = H_w\cdot S\f]

 where
-\f$U'\f$ is spectrum of estimation of original image \f$U\f$,
-\f$H_w\f$ is restoration filter, for example, Wiener filter.
+\f$U'\f$ is the spectrum of estimation of original image \f$U\f$, and
+\f$H_w\f$ is the restoration filter, for example, the Wiener filter.

-### What is Wiener filter?
+### What is the Wiener filter?

-Wiener filter is a way to restore a blurred image. Let's suppose that PSF is a real and symmetric signal, a power spectrum of the original true image and noise are not known,
-then simplified Wiener formula is:
+The Wiener filter is a way to restore a blurred image. Let's suppose that the PSF is a real and symmetric signal, a power spectrum of the original true image and noise are not known,
+then a simplified Wiener formula is:

 \f[H_w = \frac{H}{|H|^2+\frac{1}{SNR}} \f]

 where
 \f$SNR\f$ is signal-to-noise ratio.

-So, in order to recover an out-of-focus image by Wiener filter, it needs to know \f$SNR\f$ and \f$R\f$ of circular PSF.
+So, in order to recover an out-of-focus image by Wiener filter, it needs to know the \f$SNR\f$ and \f$R\f$ of the circular PSF.


 Source code
@ -68,36 +68,36 @@ You can find source code in the `samples/cpp/tutorial_code/ImgProc/out_of_focus_
 Explanation
 -----------

-An out-of-focus image recovering algorithm consists of PSF generation, Wiener filter generation and filtering an blurred image in frequency domain:
+An out-of-focus image recovering algorithm consists of PSF generation, Wiener filter generation and filtering a blurred image in frequency domain:
@snippet samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp main

-A function calcPSF() forms an circular PSF according to input parameter radius \f$R\f$:
+A function calcPSF() forms a circular PSF according to input parameter radius \f$R\f$:
@snippet samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp calcPSF

-A function calcWnrFilter() synthesizes simplified Wiener filter \f$H_w\f$ according to formula described above:
+A function calcWnrFilter() synthesizes the simplified Wiener filter \f$H_w\f$ according to the formula described above:
@snippet samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp calcWnrFilter

-A function fftshift() rearranges PSF. This code was just copied from tutorial @ref tutorial_discrete_fourier_transform "Discrete Fourier Transform":
+A function fftshift() rearranges the PSF. This code was just copied from the tutorial @ref tutorial_discrete_fourier_transform "Discrete Fourier Transform":
@snippet samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp fftshift

-A function filter2DFreq() filters an blurred image in frequency domain:
+A function filter2DFreq() filters the blurred image in the frequency domain:
@snippet samples/cpp/tutorial_code/ImgProc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.cpp filter2DFreq

 Result
 ------

-Below you can see real out-of-focus image:
+Below you can see the real out-of-focus image:
 ![Out-of-focus image](images/original.jpg)


-Below result was done by \f$R\f$ = 53 and \f$SNR\f$ = 5200 parameters:
+And the following result has been computed with \f$R\f$ = 53 and \f$SNR\f$ = 5200 parameters:
 ![The restored (deblurred) image](images/recovered.jpg)

-The Wiener filter was used, values of \f$R\f$ and \f$SNR\f$ were selected manually to give the best possible visual result.
-We can see that the result is not perfect, but it gives us a hint to the image content. With some difficulty, the text is readable.
+The Wiener filter was used, and values of \f$R\f$ and \f$SNR\f$ were selected manually to give the best possible visual result.
+We can see that the result is not perfect, but it gives us a hint to the image's content. With some difficulty, the text is readable.

@note The parameter \f$R\f$ is the most important. So you should adjust \f$R\f$ first, then \f$SNR\f$.
-@note Sometimes you can observe the ringing effect in an restored image. This effect can be reduced by several methods. For example, you can taper input image edges.
+@note Sometimes you can observe the ringing effect in a restored image. This effect can be reduced with several methods. For example, you can taper input image edges.

 You can also find a quick video demonstration of this on
 [YouTube](https://youtu.be/0bEcE4B0XP4).
--- a/doc/tutorials/imgproc/table_of_content_imgproc.markdown
+++ b/doc/tutorials/imgproc/table_of_content_imgproc.markdown
@ -320,3 +320,13 @@ In this section you will learn about the image processing (manipulation) functio
    *Author:* Karpushin Vladislav

    You will learn how to recover an out-of-focus image by Wiener filter.
+
+-   @subpage tutorial_motion_deblur_filter
+
+    *Languages:* C++
+
+    *Compatibility:* \> OpenCV 2.0
+
+    *Author:* Karpushin Vladislav
+
+    You will learn how to recover an image with motion blur distortion using a Wiener filter.
--- a/doc/tutorials/introduction/windows_install/windows_install.markdown
+++ b/doc/tutorials/introduction/windows_install/windows_install.markdown
@ -142,8 +142,6 @@ of them, you need to download and install them on your system.
 -   [Intel Integrated Performance Primitives (*IPP*)](http://software.intel.com/en-us/articles/intel-ipp/) may be used to improve the performance
    of color conversion, Haar training and DFT functions of the OpenCV library. Watch out, since
    this is not a free service.
-   [Intel IPP Asynchronous C/C++](http://software.intel.com/en-us/intel-ipp-preview) is currently focused delivering Intel Graphics
-    support for advanced image processing and computer vision functions.
 -   OpenCV offers a somewhat fancier and more useful graphical user interface, than the default one
    by using the [Qt framework](http://qt.nokia.com/downloads). For a quick overview of what this has to offer, look into the
    documentations *highgui* module, under the *Qt New Functions* section. Version 4.6 or later of
@ -204,10 +202,6 @@ libraries). If you do not need the support for some of these, you can just freel

        ![](images/IntelTBB.png)

-    -#  For the [Intel IPP Asynchronous C/C++](http://software.intel.com/en-us/intel-ipp-preview) download the source files and set environment
-        variable **IPP_ASYNC_ROOT**. It should point to
-        `<your Program Files(x86) directory>/Intel/IPP Preview */ipp directory`. Here \* denotes the
-        particular preview name.
    -#  In case of the [Eigen](http://eigen.tuxfamily.org/index.php?title=Main_Page#Download) library it is again a case of download and extract to the
        `D:/OpenCV/dep` directory.
    -#  Same as above with [OpenEXR](http://www.openexr.com/downloads.html).
@ -319,6 +313,7 @@ libraries). If you do not need the support for some of these, you can just freel
        you are concerned about performance, build them and run.
    -   *BUILD_opencv_python* -\> Self-explanatory. Create the binaries to use OpenCV from the
        Python language.
+    -   *BUILD_opencv_world* -\> Generate a single "opencv_world" binary (a shared or static library, depending on *BUILD_SHARED_LIBS*) including all the modules instead of a collection of separate binaries, one binary per module.

    Press again the *Configure* button and ensure no errors are reported. If this is the case, you
    can tell CMake to create the project files by pushing the *Generate* button. Go to the build
--- a/modules/calib3d/src/stereosgbm.cpp
+++ b/modules/calib3d/src/stereosgbm.cpp
@ -1487,12 +1487,14 @@ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2,
    size_t minLrSize = width1 , LrSize = minLrSize*D2;
    int hsumBufNRows = SH2*2 + 2;
    size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
-    costBufSize*hsumBufNRows*sizeof(CostType) +                       // hsumBuf
-    CSBufSize*2*sizeof(CostType) + 1024;                              // C, S
+                          costBufSize*hsumBufNRows*sizeof(CostType) + // hsumBuf
+                          CSBufSize*2*sizeof(CostType) + 1024;        // C, S

    if( buffer.empty() || !buffer.isContinuous() ||
        buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
-        buffer.create(1, (int)totalBufSize, CV_8U);
+    {
+        buffer.reserveBuffer(totalBufSize);
+    }

    // summary cost over different (nDirs) directions
    CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);
--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@ -664,6 +664,8 @@ inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
    v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d);
 }

+inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b) { return v_int16x16(_mm256_mulhi_epi16(a.val, b.val)); }
+inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); }

 /** Non-saturating arithmetics **/
 #define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@ -891,6 +891,20 @@ template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, c
    }
 }

+/** @brief Multiply and extract high part
+
+Multiply values two registers and store high part of the results.
+Implemented only for 16-bit source types (v_int16x8, v_uint16x8). Returns \f$ a*b >> 16 \f$
+*/
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_mul_hi(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    typedef typename V_TypeTraits<_Tp>::w_type w_type;
+    v_reg<_Tp, n> c;
+    for (int i = 0; i < n; i++)
+        c.s[i] = (_Tp)(((w_type)a.s[i] * b.s[i]) >> sizeof(_Tp)*8);
+    return c;
+}
+
 //! @cond IGNORED
 template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
                                                 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@ -553,6 +553,21 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
    d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
 }

+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
+{
+    return v_int16x8(vcombine_s16(
+                                  vshrn_n_s32(vmull_s16( vget_low_s16(a.val),  vget_low_s16(b.val)), 16),
+                                  vshrn_n_s32(vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)), 16)
+                                 ));
+}
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
+{
+    return v_uint16x8(vcombine_u16(
+                                   vshrn_n_u32(vmull_u16( vget_low_u16(a.val),  vget_low_u16(b.val)), 16),
+                                   vshrn_n_u32(vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val)), 16)
+                                  ));
+}
+
 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
 {
    int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@ -737,6 +737,9 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
    d.val = _mm_unpackhi_epi64(c0, c1);
 }

+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { return v_int16x8(_mm_mulhi_epi16(a.val, b.val)); }
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_uint16x8(_mm_mulhi_epu16(a.val, b.val)); }
+
 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
 {
    return v_int32x4(_mm_madd_epi16(a.val, b.val));
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@ -457,6 +457,21 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c
    d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val));
 }

+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
+{
+    return v_int16x8(vec_packs(
+                               vec_sra(vec_mul(vec_unpackh(a.val), vec_unpackh(b.val)), vec_uint4_sp(16)),
+                               vec_sra(vec_mul(vec_unpackl(a.val), vec_unpackl(b.val)), vec_uint4_sp(16))
+                              ));
+}
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
+{
+    return v_uint16x8(vec_packs(
+                                vec_sr(vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val)), vec_uint4_sp(16)),
+                                vec_sr(vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val)), vec_uint4_sp(16))
+                               ));
+}
+
 /** Non-saturating arithmetics **/
 #define OPENCV_HAL_IMPL_VSX_BIN_FUNC(func, intrin)    \
 template<typename _Tpvec>                             \
--- a/modules/core/include/opencv2/core/ippasync.hpp
+++ b/modules/core/include/opencv2/core/ippasync.hpp
@ -45,7 +45,7 @@
 #ifndef OPENCV_CORE_IPPASYNC_HPP
 #define OPENCV_CORE_IPPASYNC_HPP

-#ifdef HAVE_IPP_A
+#ifdef HAVE_IPP_A  // this file will be removed in OpenCV 4.0

 #include "opencv2/core.hpp"
 #include <ipp_async_op.h>
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@ -146,6 +146,12 @@ synonym is needed to generate Python/Java etc. wrappers properly. At the functio
 level their use is similar, but _InputArray::getMat(idx) should be used to get header for the
 idx-th component of the outer vector and _InputArray::size().area() should be used to find the
 number of components (vectors/matrices) of the outer vector.
+
+In general, type support is limited to cv::Mat types. Other types are forbidden.
+But in some cases we need to support passing of custom non-general Mat types, like arrays of cv::KeyPoint, cv::DMatch, etc.
+This data is not intented to be interpreted as an image data, or processed somehow like regular cv::Mat.
+To pass such custom type use rawIn() / rawOut() / rawInOut() wrappers.
+Custom type is wrapped as Mat-compatible `CV_8UC<N>` values (N = sizeof(T), N <= CV_CN_MAX).
 */
 class CV_EXPORTS _InputArray
 {
@ -199,6 +205,9 @@ public:
    template<typename _Tp, std::size_t _Nm> _InputArray(const std::array<_Tp, _Nm>& arr);
    template<std::size_t _Nm> _InputArray(const std::array<Mat, _Nm>& arr);

+    template<typename _Tp> static _InputArray rawIn(const std::vector<_Tp>& vec);
+    template<typename _Tp, std::size_t _Nm> static _InputArray rawIn(const std::array<_Tp, _Nm>& arr);
+
    Mat getMat(int idx=-1) const;
    Mat getMat_(int idx=-1) const;
    UMat getUMat(int idx=-1) const;
@ -328,12 +337,13 @@ public:
    _OutputArray(const UMat& m);
    _OutputArray(const std::vector<UMat>& vec);

-#ifdef CV_CXX_STD_ARRAY
    template<typename _Tp, std::size_t _Nm> _OutputArray(std::array<_Tp, _Nm>& arr);
    template<typename _Tp, std::size_t _Nm> _OutputArray(const std::array<_Tp, _Nm>& arr);
    template<std::size_t _Nm> _OutputArray(std::array<Mat, _Nm>& arr);
    template<std::size_t _Nm> _OutputArray(const std::array<Mat, _Nm>& arr);
-#endif
+
+    template<typename _Tp> static _OutputArray rawOut(std::vector<_Tp>& vec);
+    template<typename _Tp, std::size_t _Nm> static _OutputArray rawOut(std::array<_Tp, _Nm>& arr);

    bool fixedSize() const;
    bool fixedType() const;
@ -397,15 +407,23 @@ public:
    _InputOutputArray(const UMat& m);
    _InputOutputArray(const std::vector<UMat>& vec);

-#ifdef CV_CXX_STD_ARRAY
    template<typename _Tp, std::size_t _Nm> _InputOutputArray(std::array<_Tp, _Nm>& arr);
    template<typename _Tp, std::size_t _Nm> _InputOutputArray(const std::array<_Tp, _Nm>& arr);
    template<std::size_t _Nm> _InputOutputArray(std::array<Mat, _Nm>& arr);
    template<std::size_t _Nm> _InputOutputArray(const std::array<Mat, _Nm>& arr);
-#endif
+
+    template<typename _Tp> static _InputOutputArray rawInOut(std::vector<_Tp>& vec);
+    template<typename _Tp, std::size_t _Nm> _InputOutputArray rawInOut(std::array<_Tp, _Nm>& arr);

 };

+/** Helper to wrap custom types. @see InputArray */
+template<typename _Tp> static inline _InputArray rawIn(_Tp& v);
+/** Helper to wrap custom types. @see InputArray */
+template<typename _Tp> static inline _OutputArray rawOut(_Tp& v);
+/** Helper to wrap custom types. @see InputArray */
+template<typename _Tp> static inline _InputOutputArray rawInOut(_Tp& v);
+
 CV__DEBUG_NS_END

 typedef const _InputArray& InputArray;
@ -991,11 +1009,9 @@ public:
    */
    template<typename _Tp> explicit Mat(const std::initializer_list<int> sizes, const std::initializer_list<_Tp> list);

-#ifdef CV_CXX_STD_ARRAY
    /** @overload
    */
    template<typename _Tp, size_t _Nm> explicit Mat(const std::array<_Tp, _Nm>& arr, bool copyData=false);
-#endif

    /** @overload
    */
@ -1630,9 +1646,7 @@ public:
    template<typename _Tp, int n> operator Vec<_Tp, n>() const;
    template<typename _Tp, int m, int n> operator Matx<_Tp, m, n>() const;

-#ifdef CV_CXX_STD_ARRAY
    template<typename _Tp, std::size_t _Nm> operator std::array<_Tp, _Nm>() const;
-#endif

    /** @brief Reports whether the matrix is continuous or not.

@ -2214,9 +2228,7 @@ public:
    Mat_(std::initializer_list<_Tp> values);
    explicit Mat_(const std::initializer_list<int> sizes, const std::initializer_list<_Tp> values);

-#ifdef CV_CXX_STD_ARRAY
    template <std::size_t _Nm> explicit Mat_(const std::array<_Tp, _Nm>& arr, bool copyData=false);
-#endif

    Mat_& operator = (const Mat& m);
    Mat_& operator = (const Mat_& m);
@ -2314,10 +2326,8 @@ public:
    //! conversion to vector.
    operator std::vector<_Tp>() const;

-#ifdef CV_CXX_STD_ARRAY
    //! conversion to array.
    template<std::size_t _Nm> operator std::array<_Tp, _Nm>() const;
-#endif

    //! conversion to Vec
    template<int n> operator Vec<typename DataType<_Tp>::channel_type, n>() const;
--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@ -61,6 +61,16 @@ CV__DEBUG_NS_BEGIN

 //! @cond IGNORED

+////////////////////////// Custom (raw) type wrapper //////////////////////////
+
+template<typename _Tp> static inline
+int rawType()
+{
+    CV_StaticAssert(sizeof(_Tp) <= CV_CN_MAX, "sizeof(_Tp) is too large");
+    const int elemSize = sizeof(_Tp);
+    return (int)CV_MAKETYPE(CV_8U, elemSize);
+}
+
 //////////////////////// Input/Output Arrays ////////////////////////

 inline void _InputArray::init(int _flags, const void* _obj)
@ -134,6 +144,25 @@ inline _InputArray::_InputArray(const ogl::Buffer& buf)
 inline _InputArray::_InputArray(const cuda::HostMem& cuda_mem)
 { init(CUDA_HOST_MEM + ACCESS_READ, &cuda_mem); }

+template<typename _Tp> inline
+_InputArray _InputArray::rawIn(const std::vector<_Tp>& vec)
+{
+    _InputArray v;
+    v.flags = _InputArray::FIXED_TYPE + _InputArray::STD_VECTOR + rawType<_Tp>() + ACCESS_READ;
+    v.obj = (void*)&vec;
+    return v;
+}
+
+template<typename _Tp, std::size_t _Nm> inline
+_InputArray _InputArray::rawIn(const std::array<_Tp, _Nm>& arr)
+{
+    _InputArray v;
+    v.flags = FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_READ;
+    v.obj = (void*)arr.data();
+    v.sz = Size(1, _Nm);
+    return v;
+}
+
 inline _InputArray::~_InputArray() {}

 inline Mat _InputArray::getMat(int i) const
@ -261,6 +290,25 @@ inline _OutputArray::_OutputArray(const ogl::Buffer& buf)
 inline _OutputArray::_OutputArray(const cuda::HostMem& cuda_mem)
 { init(FIXED_TYPE + FIXED_SIZE + CUDA_HOST_MEM + ACCESS_WRITE, &cuda_mem); }

+template<typename _Tp> inline
+_OutputArray _OutputArray::rawOut(std::vector<_Tp>& vec)
+{
+    _OutputArray v;
+    v.flags = _InputArray::FIXED_TYPE + _InputArray::STD_VECTOR + rawType<_Tp>() + ACCESS_WRITE;
+    v.obj = (void*)&vec;
+    return v;
+}
+
+template<typename _Tp, std::size_t _Nm> inline
+_OutputArray _OutputArray::rawOut(std::array<_Tp, _Nm>& arr)
+{
+    _OutputArray v;
+    v.flags = FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_WRITE;
+    v.obj = (void*)arr.data();
+    v.sz = Size(1, _Nm);
+    return v;
+}
+
 ///////////////////////////////////////////////////////////////////////////////////////////

 inline _InputOutputArray::_InputOutputArray() { init(ACCESS_RW, 0); }
@ -370,6 +418,30 @@ inline _InputOutputArray::_InputOutputArray(const ogl::Buffer& buf)
 inline _InputOutputArray::_InputOutputArray(const cuda::HostMem& cuda_mem)
 { init(FIXED_TYPE + FIXED_SIZE + CUDA_HOST_MEM + ACCESS_RW, &cuda_mem); }

+template<typename _Tp> inline
+_InputOutputArray _InputOutputArray::rawInOut(std::vector<_Tp>& vec)
+{
+    _InputOutputArray v;
+    v.flags = _InputArray::FIXED_TYPE + _InputArray::STD_VECTOR + rawType<_Tp>() + ACCESS_RW;
+    v.obj = (void*)&vec;
+    return v;
+}
+
+template<typename _Tp, std::size_t _Nm> inline
+_InputOutputArray _InputOutputArray::rawInOut(std::array<_Tp, _Nm>& arr)
+{
+    _InputOutputArray v;
+    v.flags = FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_RW;
+    v.obj = (void*)arr.data();
+    v.sz = Size(1, _Nm);
+    return v;
+}
+
+
+template<typename _Tp> static inline _InputArray rawIn(_Tp& v) { return _InputArray::rawIn(v); }
+template<typename _Tp> static inline _OutputArray rawOut(_Tp& v) { return _OutputArray::rawOut(v); }
+template<typename _Tp> static inline _InputOutputArray rawInOut(_Tp& v) { return _InputOutputArray::rawInOut(v); }
+
 CV__DEBUG_NS_END

 //////////////////////////////////////////// Mat //////////////////////////////////////////
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@ -270,7 +270,7 @@ static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst,
    if( !haveScalar )
    {
        const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 };
-        uchar* ptrs[4]{};
+        uchar* ptrs[4] = {};

        NAryMatIterator it(arrays, ptrs);
        size_t total = it.size, blocksize = total;
@ -306,7 +306,7 @@ static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst,
    else
    {
        const Mat* arrays[] = { &src1, &dst, &mask, 0 };
-        uchar* ptrs[3]{};
+        uchar* ptrs[3] = {};

        NAryMatIterator it(arrays, ptrs);
        size_t total = it.size, blocksize = std::min(total, blocksize0);
@ -745,7 +745,7 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
    if( !haveScalar )
    {
        const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 };
-        uchar* ptrs[4]{};
+        uchar* ptrs[4] = {};

        NAryMatIterator it(arrays, ptrs);
        size_t total = it.size, blocksize = total;
@ -812,7 +812,7 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
    else
    {
        const Mat* arrays[] = { &src1, &dst, &mask, 0 };
-        uchar* ptrs[3]{};
+        uchar* ptrs[3] = {};

        NAryMatIterator it(arrays, ptrs);
        size_t total = it.size, blocksize = std::min(total, blocksize0);
@ -1240,7 +1240,7 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
    if( !haveScalar )
    {
        const Mat* arrays[] = { &src1, &src2, &dst, 0 };
-        uchar* ptrs[3]{};
+        uchar* ptrs[3] = {};

        NAryMatIterator it(arrays, ptrs);
        size_t total = it.size;
@ -1251,7 +1251,7 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
    else
    {
        const Mat* arrays[] = { &src1, &dst, 0 };
-        uchar* ptrs[2]{};
+        uchar* ptrs[2] = {};

        NAryMatIterator it(arrays, ptrs);
        size_t total = it.size, blocksize = std::min(total, blocksize0);
@ -1748,7 +1748,7 @@ void cv::inRange(InputArray _src, InputArray _lowerb,

    const Mat* arrays_sc[] = { &src, &dst, 0 };
    const Mat* arrays_nosc[] = { &src, &dst, &lb, &ub, 0 };
-    uchar* ptrs[4]{};
+    uchar* ptrs[4] = {};

    NAryMatIterator it(lbScalar && ubScalar ? arrays_sc : arrays_nosc, ptrs);
    size_t total = it.size, blocksize = std::min(total, blocksize0);
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@ -1347,7 +1347,7 @@ void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta)
    else
    {
        const Mat* arrays[] = {&src, &dst, 0};
-        uchar* ptrs[2]{};
+        uchar* ptrs[2] = {};
        NAryMatIterator it(arrays, ptrs);
        Size sz((int)(it.size*cn), 1);

@ -1496,7 +1496,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst)
    else
    {
        const Mat* arrays[] = {&src, &dst, 0};
-        uchar* ptrs[2]{};
+        uchar* ptrs[2] = {};
        NAryMatIterator it(arrays, ptrs);
        Size sz((int)(it.size*cn), 1);

--- a/modules/core/src/convert_scale.cpp
+++ b/modules/core/src/convert_scale.cpp
@ -1775,7 +1775,7 @@ void cv::convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, doubl
    else
    {
        const Mat* arrays[] = {&src, &dst, 0};
-        uchar* ptrs[2]{};
+        uchar* ptrs[2] = {};
        NAryMatIterator it(arrays, ptrs);
        Size sz((int)it.size*cn, 1);

--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@ -306,7 +306,7 @@ void Mat::copyTo( OutputArray _dst ) const
    if( total() != 0 )
    {
        const Mat* arrays[] = { this, &dst };
-        uchar* ptrs[2]{};
+        uchar* ptrs[2] = {};
        NAryMatIterator it(arrays, ptrs, 2);
        size_t sz = it.size*elemSize();

@ -399,7 +399,7 @@ void Mat::copyTo( OutputArray _dst, InputArray _mask ) const
    }

    const Mat* arrays[] = { this, &dst, &mask, 0 };
-    uchar* ptrs[3]{};
+    uchar* ptrs[3] = {};
    NAryMatIterator it(arrays, ptrs);
    Size sz((int)(it.size*mcn), 1);

--- a/modules/core/src/count_non_zero.cpp
+++ b/modules/core/src/count_non_zero.cpp
@ -25,51 +25,34 @@ static int countNonZero_(const T* src, int len )
 static int countNonZero8u( const uchar* src, int len )
 {
    int i=0, nz = 0;
-#if CV_SSE2
-    if(USE_SSE2)//5x-6x
+#if CV_SIMD
+    int len0 = len & -v_uint8::nlanes;
+    v_uint8 v_zero = vx_setzero_u8();
+    v_uint8 v_one = vx_setall_u8(1);
+
+    v_uint32 v_sum32 = vx_setzero_u32();
+    while (i < len0)
    {
-        __m128i v_zero = _mm_setzero_si128();
-        __m128i sum = _mm_setzero_si128();
-
-        for (; i<=len-16; i+=16)
+        v_uint16 v_sum16 = vx_setzero_u16();
+        int j = i;
+        while (j < std::min(len0, i + 65280 * v_uint16::nlanes))
        {
-            __m128i r0 = _mm_loadu_si128((const __m128i*)(src+i));
-            sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_cmpeq_epi8(r0, v_zero)), v_zero));
+            v_uint8 v_sum8 = vx_setzero_u8();
+            int k = j;
+            for (; k < std::min(len0, j + 255 * v_uint8::nlanes); k += v_uint8::nlanes)
+                v_sum8 += v_one & (vx_load(src + k) == v_zero);
+            v_uint16 part1, part2;
+            v_expand(v_sum8, part1, part2);
+            v_sum16 += part1 + part2;
+            j = k;
        }
-        nz = i - _mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum)));
+        v_uint32 part1, part2;
+        v_expand(v_sum16, part1, part2);
+        v_sum32 += part1 + part2;
+        i = j;
    }
-#elif CV_NEON
-    int len0 = len & -16, blockSize1 = (1 << 8) - 16, blockSize0 = blockSize1 << 6;
-    uint32x4_t v_nz = vdupq_n_u32(0u);
-    uint8x16_t v_zero = vdupq_n_u8(0), v_1 = vdupq_n_u8(1);
-    const uchar * src0 = src;
-
-    while( i < len0 )
-    {
-        int blockSizei = std::min(len0 - i, blockSize0), j = 0;
-
-        while (j < blockSizei)
-        {
-            int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
-            uint8x16_t v_pz = v_zero;
-
-            for( ; k <= blockSizej - 16; k += 16 )
-                v_pz = vaddq_u8(v_pz, vandq_u8(vceqq_u8(vld1q_u8(src0 + k), v_zero), v_1));
-
-            uint16x8_t v_p1 = vmovl_u8(vget_low_u8(v_pz)), v_p2 = vmovl_u8(vget_high_u8(v_pz));
-            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_p1), vget_high_u16(v_p1)), v_nz);
-            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_p2), vget_high_u16(v_p2)), v_nz);
-
-            src0 += blockSizej;
-            j += blockSizej;
-        }
-
-        i += blockSizei;
-    }
-
-    CV_DECL_ALIGNED(16) unsigned int buf[4];
-    vst1q_u32(buf, v_nz);
-    nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
+    nz = i - v_reduce_sum(v_sum32);
+    v_cleanup();
 #endif
    for( ; i < len; i++ )
        nz += src[i] != 0;
@ -79,159 +62,112 @@ static int countNonZero8u( const uchar* src, int len )
 static int countNonZero16u( const ushort* src, int len )
 {
    int i = 0, nz = 0;
-#if CV_SSE2
-    if (USE_SSE2)
+#if CV_SIMD
+    int len0 = len & -v_int8::nlanes;
+    v_uint16 v_zero = vx_setzero_u16();
+    v_int8 v_one = vx_setall_s8(1);
+
+    v_int32 v_sum32 = vx_setzero_s32();
+    while (i < len0)
    {
-        __m128i v_zero = _mm_setzero_si128 ();
-        __m128i sum = _mm_setzero_si128();
-
-        for ( ; i <= len - 8; i += 8)
+        v_int16 v_sum16 = vx_setzero_s16();
+        int j = i;
+        while (j < std::min(len0, i + 32766 * v_int16::nlanes))
        {
-            __m128i r0 = _mm_loadu_si128((const __m128i*)(src + i));
-            sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_cmpeq_epi16(r0, v_zero)), v_zero));
+            v_int8 v_sum8 = vx_setzero_s8();
+            int k = j;
+            for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)
+                v_sum8 += v_one & v_pack(v_reinterpret_as_s16(vx_load(src + k) == v_zero), v_reinterpret_as_s16(vx_load(src + k + v_uint16::nlanes) == v_zero));
+            v_int16 part1, part2;
+            v_expand(v_sum8, part1, part2);
+            v_sum16 += part1 + part2;
+            j = k;
        }
-
-        nz = i - (_mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum))) >> 1);
-        src += i;
+        v_int32 part1, part2;
+        v_expand(v_sum16, part1, part2);
+        v_sum32 += part1 + part2;
+        i = j;
    }
-#elif CV_NEON
-    int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
-    uint32x4_t v_nz = vdupq_n_u32(0u);
-    uint16x8_t v_zero = vdupq_n_u16(0), v_1 = vdupq_n_u16(1);
-
-    while( i < len0 )
-    {
-        int blockSizei = std::min(len0 - i, blockSize0), j = 0;
-
-        while (j < blockSizei)
-        {
-            int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
-            uint16x8_t v_pz = v_zero;
-
-            for( ; k <= blockSizej - 8; k += 8 )
-                v_pz = vaddq_u16(v_pz, vandq_u16(vceqq_u16(vld1q_u16(src + k), v_zero), v_1));
-
-            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz);
-
-            src += blockSizej;
-            j += blockSizej;
-        }
-
-        i += blockSizei;
-    }
-
-    CV_DECL_ALIGNED(16) unsigned int buf[4];
-    vst1q_u32(buf, v_nz);
-    nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
+    nz = i - v_reduce_sum(v_sum32);
+    v_cleanup();
 #endif
-    return nz + countNonZero_(src, len - i);
+    return nz + countNonZero_(src + i, len - i);
 }

 static int countNonZero32s( const int* src, int len )
 {
    int i = 0, nz = 0;
-#if CV_SSE2
-    if (USE_SSE2)
+#if CV_SIMD
+    int len0 = len & -v_int8::nlanes;
+    v_int32 v_zero = vx_setzero_s32();
+    v_int8 v_one = vx_setall_s8(1);
+
+    v_int32 v_sum32 = vx_setzero_s32();
+    while (i < len0)
    {
-        __m128i v_zero = _mm_setzero_si128 ();
-        __m128i sum = _mm_setzero_si128();
-
-        for ( ; i <= len - 4; i += 4)
+        v_int16 v_sum16 = vx_setzero_s16();
+        int j = i;
+        while (j < std::min(len0, i + 32766 * v_int16::nlanes))
        {
-            __m128i r0 = _mm_loadu_si128((const __m128i*)(src + i));
-            sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_cmpeq_epi32(r0, v_zero)), v_zero));
+            v_int8 v_sum8 = vx_setzero_s8();
+            int k = j;
+            for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)
+                v_sum8 += v_one & v_pack(
+                    v_pack(vx_load(src + k                    ) == v_zero, vx_load(src + k +   v_int32::nlanes) == v_zero),
+                    v_pack(vx_load(src + k + 2*v_int32::nlanes) == v_zero, vx_load(src + k + 3*v_int32::nlanes) == v_zero)
+                );
+            v_int16 part1, part2;
+            v_expand(v_sum8, part1, part2);
+            v_sum16 += part1 + part2;
+            j = k;
        }
-
-        nz = i - (_mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum))) >> 2);
-        src += i;
+        v_int32 part1, part2;
+        v_expand(v_sum16, part1, part2);
+        v_sum32 += part1 + part2;
+        i = j;
    }
-#elif CV_NEON
-    int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
-    uint32x4_t v_nz = vdupq_n_u32(0u);
-    int32x4_t v_zero = vdupq_n_s32(0.0f);
-    uint16x8_t v_1 = vdupq_n_u16(1u), v_zerou = vdupq_n_u16(0u);
-
-    while( i < len0 )
-    {
-        int blockSizei = std::min(len0 - i, blockSize0), j = 0;
-
-        while (j < blockSizei)
-        {
-            int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
-            uint16x8_t v_pz = v_zerou;
-
-            for( ; k <= blockSizej - 8; k += 8 )
-                v_pz = vaddq_u16(v_pz, vandq_u16(vcombine_u16(vmovn_u32(vceqq_s32(vld1q_s32(src + k), v_zero)),
-                                                              vmovn_u32(vceqq_s32(vld1q_s32(src + k + 4), v_zero))), v_1));
-
-            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz);
-
-            src += blockSizej;
-            j += blockSizej;
-        }
-
-        i += blockSizei;
-    }
-
-    CV_DECL_ALIGNED(16) unsigned int buf[4];
-    vst1q_u32(buf, v_nz);
-    nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
+    nz = i - v_reduce_sum(v_sum32);
+    v_cleanup();
 #endif
-    return nz + countNonZero_(src, len - i);
+    return nz + countNonZero_(src + i, len - i);
 }

 static int countNonZero32f( const float* src, int len )
 {
    int i = 0, nz = 0;
-#if CV_SSE2
-    if (USE_SSE2)
+#if CV_SIMD
+    int len0 = len & -v_int8::nlanes;
+    v_float32 v_zero = vx_setzero_f32();
+    v_int8 v_one = vx_setall_s8(1);
+
+    v_int32 v_sum32 = vx_setzero_s32();
+    while (i < len0)
    {
-        __m128 v_zero_f = _mm_setzero_ps();
-        __m128i v_zero = _mm_setzero_si128 ();
-        __m128i sum = _mm_setzero_si128();
-
-        for ( ; i <= len - 4; i += 4)
+        v_int16 v_sum16 = vx_setzero_s16();
+        int j = i;
+        while (j < std::min(len0, i + 32766 * v_int16::nlanes))
        {
-            __m128 r0 = _mm_loadu_ps(src + i);
-            sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_castps_si128(_mm_cmpeq_ps(r0, v_zero_f))), v_zero));
+            v_int8 v_sum8 = vx_setzero_s8();
+            int k = j;
+            for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)
+                v_sum8 += v_one & v_pack(
+                    v_pack(v_reinterpret_as_s32(vx_load(src + k                      ) == v_zero), v_reinterpret_as_s32(vx_load(src + k +   v_float32::nlanes) == v_zero)),
+                    v_pack(v_reinterpret_as_s32(vx_load(src + k + 2*v_float32::nlanes) == v_zero), v_reinterpret_as_s32(vx_load(src + k + 3*v_float32::nlanes) == v_zero))
+                );
+            v_int16 part1, part2;
+            v_expand(v_sum8, part1, part2);
+            v_sum16 += part1 + part2;
+            j = k;
        }
-
-        nz = i - (_mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum))) >> 2);
-        src += i;
+        v_int32 part1, part2;
+        v_expand(v_sum16, part1, part2);
+        v_sum32 += part1 + part2;
+        i = j;
    }
-#elif CV_NEON
-    int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
-    uint32x4_t v_nz = vdupq_n_u32(0u);
-    float32x4_t v_zero = vdupq_n_f32(0.0f);
-    uint16x8_t v_1 = vdupq_n_u16(1u), v_zerou = vdupq_n_u16(0u);
-
-    while( i < len0 )
-    {
-        int blockSizei = std::min(len0 - i, blockSize0), j = 0;
-
-        while (j < blockSizei)
-        {
-            int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
-            uint16x8_t v_pz = v_zerou;
-
-            for( ; k <= blockSizej - 8; k += 8 )
-                v_pz = vaddq_u16(v_pz, vandq_u16(vcombine_u16(vmovn_u32(vceqq_f32(vld1q_f32(src + k), v_zero)),
-                                                              vmovn_u32(vceqq_f32(vld1q_f32(src + k + 4), v_zero))), v_1));
-
-            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz);
-
-            src += blockSizej;
-            j += blockSizej;
-        }
-
-        i += blockSizei;
-    }
-
-    CV_DECL_ALIGNED(16) unsigned int buf[4];
-    vst1q_u32(buf, v_nz);
-    nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
+    nz = i - v_reduce_sum(v_sum32);
+    v_cleanup();
 #endif
-    return nz + countNonZero_(src, len - i);
+    return nz + countNonZero_(src + i, len - i);
 }

 static int countNonZero64f( const double* src, int len )
@ -378,7 +314,7 @@ int cv::countNonZero( InputArray _src )
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, 0};
-    uchar* ptrs[1]{};
+    uchar* ptrs[1] = {};
    NAryMatIterator it(arrays, ptrs);
    int total = (int)it.size, nz = 0;

--- a/modules/core/src/lpsolver.cpp
+++ b/modules/core/src/lpsolver.cpp
@ -98,6 +98,10 @@ int solveLP(const Mat& Func, const Mat& Constr, Mat& z){
    CV_Assert(Constr.type()==CV_64FC1 || Constr.type()==CV_32FC1);
    CV_Assert((Func.rows==1 && (Constr.cols-Func.cols==1))||
            (Func.cols==1 && (Constr.cols-Func.rows==1)));
+    if (!z.empty())
+        CV_CheckTypeEQ(z.type(), CV_64FC1, "");
+    else
+        CV_CheckType(z.type(), z.type() == CV_64FC1 || z.type() == CV_8UC1/*empty cv::Mat*/, "");

    //copy arguments for we will shall modify them
    Mat_<double> bigC=Mat_<double>(1,(Func.rows==1?Func.cols:Func.rows)+1),
--- a/modules/core/src/lut.cpp
+++ b/modules/core/src/lut.cpp
@ -342,7 +342,7 @@ public:
        int lutcn = lut_.channels();

        const Mat* arrays[] = {&src, &dst, 0};
-        uchar* ptrs[2]{};
+        uchar* ptrs[2] = {};
        NAryMatIterator it(arrays, ptrs);
        int len = (int)it.size;

@ -408,7 +408,7 @@ void cv::LUT( InputArray _src, InputArray _lut, OutputArray _dst )
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &dst, 0};
-    uchar* ptrs[2]{};
+    uchar* ptrs[2] = {};
    NAryMatIterator it(arrays, ptrs);
    int len = (int)it.size;

--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@ -158,7 +158,7 @@ void magnitude( InputArray src1, InputArray src2, OutputArray dst )
    Mat Mag = dst.getMat();

    const Mat* arrays[] = {&X, &Y, &Mag, 0};
-    uchar* ptrs[3]{};
+    uchar* ptrs[3] = {};
    NAryMatIterator it(arrays, ptrs);
    int len = (int)it.size*cn;

@ -194,7 +194,7 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre
    Mat Angle = dst.getMat();

    const Mat* arrays[] = {&X, &Y, &Angle, 0};
-    uchar* ptrs[3]{};
+    uchar* ptrs[3] = {};
    NAryMatIterator it(arrays, ptrs);
    int j, total = (int)(it.size*cn), blockSize = total;
    size_t esz1 = X.elemSize1();
@ -280,7 +280,7 @@ void cartToPolar( InputArray src1, InputArray src2,
    Mat Mag = dst1.getMat(), Angle = dst2.getMat();

    const Mat* arrays[] = {&X, &Y, &Mag, &Angle, 0};
-    uchar* ptrs[4]{};
+    uchar* ptrs[4] = {};
    NAryMatIterator it(arrays, ptrs);
    int j, total = (int)(it.size*cn), blockSize = std::min(total, ((BLOCK_SIZE+cn-1)/cn)*cn);
    size_t esz1 = X.elemSize1();
@ -577,7 +577,7 @@ void polarToCart( InputArray src1, InputArray src2,
    CV_IPP_RUN(!angleInDegrees, ipp_polarToCart(Mag, Angle, X, Y));

    const Mat* arrays[] = {&Mag, &Angle, &X, &Y, 0};
-    uchar* ptrs[4]{};
+    uchar* ptrs[4] = {};
    NAryMatIterator it(arrays, ptrs);
    cv::AutoBuffer<float> _buf;
    float* buf[2] = {0, 0};
@ -676,7 +676,7 @@ void exp( InputArray _src, OutputArray _dst )
    Mat dst = _dst.getMat();

    const Mat* arrays[] = {&src, &dst, 0};
-    uchar* ptrs[2]{};
+    uchar* ptrs[2] = {};
    NAryMatIterator it(arrays, ptrs);
    int len = (int)(it.size*cn);

@ -709,7 +709,7 @@ void log( InputArray _src, OutputArray _dst )
    Mat dst = _dst.getMat();

    const Mat* arrays[] = {&src, &dst, 0};
-    uchar* ptrs[2]{};
+    uchar* ptrs[2] = {};
    NAryMatIterator it(arrays, ptrs);
    int len = (int)(it.size*cn);

@ -1241,7 +1241,7 @@ void pow( InputArray _src, double power, OutputArray _dst )
    Mat dst = _dst.getMat();

    const Mat* arrays[] = {&src, &dst, 0};
-    uchar* ptrs[2]{};
+    uchar* ptrs[2] = {};
    NAryMatIterator it(arrays, ptrs);
    int len = (int)(it.size*cn);

@ -1588,7 +1588,7 @@ void patchNaNs( InputOutputArray _a, double _val )

    Mat a = _a.getMat();
    const Mat* arrays[] = {&a, 0};
-    int* ptrs[1]{};
+    int* ptrs[1] = {};
    NAryMatIterator it(arrays, (uchar**)ptrs);
    size_t len = it.size*a.channels();
    Cv32suf val;
--- a/modules/core/src/matmul.cpp
+++ b/modules/core/src/matmul.cpp
@ -2144,7 +2144,7 @@ void cv::transform( InputArray _src, OutputArray _dst, InputArray _mtx )
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &dst, 0};
-    uchar* ptrs[2]{};
+    uchar* ptrs[2] = {};
    NAryMatIterator it(arrays, ptrs);
    size_t i, total = it.size;

@ -2290,7 +2290,7 @@ void cv::perspectiveTransform( InputArray _src, OutputArray _dst, InputArray _mt
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &dst, 0};
-    uchar* ptrs[2]{};
+    uchar* ptrs[2] = {};
    NAryMatIterator it(arrays, ptrs);
    size_t i, total = it.size;

@ -2441,7 +2441,7 @@ void cv::scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray
    }

    const Mat* arrays[] = {&src1, &src2, &dst, 0};
-    uchar* ptrs[3]{};
+    uchar* ptrs[3] = {};
    NAryMatIterator it(arrays, ptrs);
    size_t i, len = it.size*cn;

@ -3301,7 +3301,7 @@ double Mat::dot(InputArray _mat) const
    }

    const Mat* arrays[] = {this, &mat, 0};
-    uchar* ptrs[2]{};
+    uchar* ptrs[2] = {};
    NAryMatIterator it(arrays, ptrs);
    int len = (int)(it.size*cn);
    double r = 0;
--- a/modules/core/src/matrix_wrap.cpp
+++ b/modules/core/src/matrix_wrap.cpp
@ -1413,18 +1413,39 @@ void _OutputArray::create(int d, const int* sizes, int mtype, int i,
        case 16:
            ((std::vector<Vec4i>*)v)->resize(len);
            break;
+        case 20:
+            ((std::vector<Vec<int, 5> >*)v)->resize(len);
+            break;
        case 24:
            ((std::vector<Vec6i>*)v)->resize(len);
            break;
+        case 28:
+            ((std::vector<Vec<int, 7> >*)v)->resize(len);
+            break;
        case 32:
            ((std::vector<Vec8i>*)v)->resize(len);
            break;
        case 36:
            ((std::vector<Vec<int, 9> >*)v)->resize(len);
            break;
+        case 40:
+            ((std::vector<Vec<int, 10> >*)v)->resize(len);
+            break;
+        case 44:
+            ((std::vector<Vec<int, 11> >*)v)->resize(len);
+            break;
        case 48:
            ((std::vector<Vec<int, 12> >*)v)->resize(len);
            break;
+        case 52:
+            ((std::vector<Vec<int, 13> >*)v)->resize(len);
+            break;
+        case 56:
+            ((std::vector<Vec<int, 14> >*)v)->resize(len);
+            break;
+        case 60:
+            ((std::vector<Vec<int, 15> >*)v)->resize(len);
+            break;
        case 64:
            ((std::vector<Vec<int, 16> >*)v)->resize(len);
            break;
--- a/modules/core/src/mean.cpp
+++ b/modules/core/src/mean.cpp
@ -121,7 +121,7 @@ cv::Scalar cv::mean( InputArray _src, InputArray _mask )
    CV_Assert( cn <= 4 && func != 0 );

    const Mat* arrays[] = {&src, &mask, 0};
-    uchar* ptrs[2]{};
+    uchar* ptrs[2] = {};
    NAryMatIterator it(arrays, ptrs);
    int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
    int j, count = 0;
@ -786,7 +786,7 @@ void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, Input
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &mask, 0};
-    uchar* ptrs[2]{};
+    uchar* ptrs[2] = {};
    NAryMatIterator it(arrays, ptrs);
    int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
    int j, count = 0, nz0 = 0;
--- a/modules/core/src/minmax.cpp
+++ b/modules/core/src/minmax.cpp
@ -770,7 +770,7 @@ void cv::minMaxIdx(InputArray _src, double* minVal,
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &mask, 0};
-    uchar* ptrs[2]{};
+    uchar* ptrs[2] = {};
    NAryMatIterator it(arrays, ptrs);

    size_t minidx = 0, maxidx = 0;
--- a/modules/core/src/norm.cpp
+++ b/modules/core/src/norm.cpp
@ -710,7 +710,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
        int cellSize = normType == NORM_HAMMING ? 1 : 2;

        const Mat* arrays[] = {&src, 0};
-        uchar* ptrs[1]{};
+        uchar* ptrs[1] = {};
        NAryMatIterator it(arrays, ptrs);
        int total = (int)it.size;
        int result = 0;
@ -727,7 +727,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &mask, 0};
-    uchar* ptrs[2]{};
+    uchar* ptrs[2] = {};
    union
    {
        double d;
@ -1168,7 +1168,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
        int cellSize = normType == NORM_HAMMING ? 1 : 2;

        const Mat* arrays[] = {&src1, &src2, 0};
-        uchar* ptrs[2]{};
+        uchar* ptrs[2] = {};
        NAryMatIterator it(arrays, ptrs);
        int total = (int)it.size;
        int result = 0;
@ -1185,7 +1185,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src1, &src2, &mask, 0};
-    uchar* ptrs[3]{};
+    uchar* ptrs[3] = {};
    union
    {
        double d;
--- a/modules/core/src/rand.cpp
+++ b/modules/core/src/rand.cpp
@ -584,6 +584,11 @@ void RNG::fill( InputOutputArray _mat, int disttype,
                }
                ip[j][1] = cvCeil(a);
                int idiff = ip[j][0] = cvFloor(b) - ip[j][1] - 1;
+                if (idiff < 0)
+                {
+                    idiff = 0;
+                    ip[j][0] = 0;
+                }
                double diff = b - a;

                fast_int_mode = fast_int_mode && diff <= 4294967296. && (idiff & (idiff+1)) == 0;
--- a/modules/core/src/sum.cpp
+++ b/modules/core/src/sum.cpp
@ -602,7 +602,7 @@ cv::Scalar cv::sum( InputArray _src )
    CV_Assert( cn <= 4 && func != 0 );

    const Mat* arrays[] = {&src, 0};
-    uchar* ptrs[1]{};
+    uchar* ptrs[1] = {};
    NAryMatIterator it(arrays, ptrs);
    Scalar s;
    int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
--- a/modules/core/test/test_ippasync.cpp
+++ b/modules/core/test/test_ippasync.cpp
@ -1,171 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-#include "test_precomp.hpp"
-#include "opencv2/ts/ocl_test.hpp"
-
-#ifdef HAVE_IPP_A
-#include "opencv2/core/ippasync.hpp"
-
-using namespace cv;
-using namespace std;
-using namespace opencv_test;
-
-namespace opencv_test {
-namespace ocl {
-
-PARAM_TEST_CASE(IPPAsync, MatDepth, Channels, hppAccelType)
-{
-    int type;
-    int cn;
-    int depth;
-    hppAccelType accelType;
-
-    Mat matrix, result;
-    hppiMatrix * hppMat;
-    hppAccel accel;
-    hppiVirtualMatrix * virtMatrix;
-    hppStatus sts;
-
-    virtual void SetUp()
-    {
-        type = CV_MAKE_TYPE(GET_PARAM(0), GET_PARAM(1));
-        depth = GET_PARAM(0);
-        cn = GET_PARAM(1);
-        accelType = GET_PARAM(2);
-    }
-
-    void generateTestData()
-    {
-        Size matrix_Size = randomSize(2, 100);
-        const double upValue = 100;
-
-        matrix = randomMat(matrix_Size, type, -upValue, upValue);
-    }
-
-    void Near(double threshold = 0.0)
-    {
-        EXPECT_MAT_NEAR(matrix, result, threshold);
-    }
-};
-
-TEST_P(IPPAsync, accuracy)
-{
-    sts = hppCreateInstance(accelType, 0, &accel);
-    if (sts!=HPP_STATUS_NO_ERROR) printf("hppStatus = %d\n",sts);
-    CV_Assert(sts==HPP_STATUS_NO_ERROR);
-
-    virtMatrix = hppiCreateVirtualMatrices(accel, 2);
-
-    for (int j = 0; j < test_loop_times; j++)
-    {
-        generateTestData();
-        hppMat = hpp::getHpp(matrix,accel);
-
-        hppScalar a = 3;
-
-        sts = hppiAddC(accel, hppMat, a, 0, virtMatrix[0]);
-        CV_Assert(sts==HPP_STATUS_NO_ERROR);
-        sts = hppiSubC(accel, virtMatrix[0], a, 0, virtMatrix[1]);
-        CV_Assert(sts==HPP_STATUS_NO_ERROR);
-
-        sts = hppWait(accel, HPP_TIME_OUT_INFINITE);
-        CV_Assert(sts==HPP_STATUS_NO_ERROR);
-
-        result = hpp::getMat(virtMatrix[1], accel, cn);
-
-        Near(5.0e-6);
-
-        sts =  hppiFreeMatrix(hppMat);
-        CV_Assert(sts==HPP_STATUS_NO_ERROR);
-    }
-
-    sts = hppiDeleteVirtualMatrices(accel, virtMatrix);
-    CV_Assert(sts==HPP_STATUS_NO_ERROR);
-    sts = hppDeleteInstance(accel);
-    CV_Assert(sts==HPP_STATUS_NO_ERROR);
-}
-
-PARAM_TEST_CASE(IPPAsyncShared, Channels, hppAccelType)
-{
-    int cn;
-    int type;
-    hppAccelType accelType;
-
-    Mat matrix, result;
-    hppiMatrix* hppMat;
-    hppAccel accel;
-    hppiVirtualMatrix * virtMatrix;
-    hppStatus sts;
-
-    virtual void SetUp()
-    {
-        cn = GET_PARAM(0);
-        accelType = GET_PARAM(1);
-        type=CV_MAKE_TYPE(CV_8U, GET_PARAM(0));
-    }
-
-    void generateTestData()
-    {
-        Size matrix_Size = randomSize(2, 100);
-        hpp32u pitch, size;
-        const int upValue = 100;
-
-        sts = hppQueryMatrixAllocParams(accel, (hpp32u)(matrix_Size.width*cn), (hpp32u)matrix_Size.height, HPP_DATA_TYPE_8U, &pitch, &size);
-
-        matrix = randomMat(matrix_Size, type, 0, upValue);
-    }
-
-    void Near(double threshold = 0.0)
-    {
-        EXPECT_MAT_NEAR(matrix, result, threshold);
-    }
-};
-
-TEST_P(IPPAsyncShared, accuracy)
-{
-    sts = hppCreateInstance(accelType, 0, &accel);
-    if (sts!=HPP_STATUS_NO_ERROR) printf("hppStatus = %d\n",sts);
-    CV_Assert(sts==HPP_STATUS_NO_ERROR);
-
-    virtMatrix = hppiCreateVirtualMatrices(accel, 2);
-
-    for (int j = 0; j < test_loop_times; j++)
-    {
-        generateTestData();
-        hppMat = hpp::getHpp(matrix,accel);
-
-        hppScalar a = 3;
-
-        sts = hppiAddC(accel, hppMat, a, 0, virtMatrix[0]);
-        CV_Assert(sts==HPP_STATUS_NO_ERROR);
-        sts = hppiSubC(accel, virtMatrix[0], a, 0, virtMatrix[1]);
-        CV_Assert(sts==HPP_STATUS_NO_ERROR);
-
-        sts = hppWait(accel, HPP_TIME_OUT_INFINITE);
-        CV_Assert(sts==HPP_STATUS_NO_ERROR);
-
-        result = hpp::getMat(virtMatrix[1], accel, cn);
-
-        Near(0);
-
-        sts =  hppiFreeMatrix(hppMat);
-        CV_Assert(sts==HPP_STATUS_NO_ERROR);
-    }
-
-    sts = hppiDeleteVirtualMatrices(accel, virtMatrix);
-    CV_Assert(sts==HPP_STATUS_NO_ERROR);
-    sts = hppDeleteInstance(accel);
-    CV_Assert(sts==HPP_STATUS_NO_ERROR);
-}
-
-INSTANTIATE_TEST_CASE_P(IppATest, IPPAsyncShared, Combine(Values(1, 2, 3, 4),
-                                                    Values( HPP_ACCEL_TYPE_CPU, HPP_ACCEL_TYPE_GPU)));
-
-INSTANTIATE_TEST_CASE_P(IppATest, IPPAsync, Combine(Values(CV_8U, CV_16U, CV_16S, CV_32F),
-                                                   Values(1, 2, 3, 4),
-                                                   Values( HPP_ACCEL_TYPE_CPU, HPP_ACCEL_TYPE_GPU)));
-
-}
-}
-#endif
--- a/modules/core/test/test_lpsolver.cpp
+++ b/modules/core/test/test_lpsolver.cpp
@ -141,4 +141,14 @@ TEST(Core_LPSolver, regression_cycling){
 #endif
 }

+TEST(Core_LPSolver, issue_12337)
+{
+    Mat A=(cv::Mat_<double>(3,1)<<3,1,2);
+    Mat B=(cv::Mat_<double>(3,4)<<1,1,3,30,2,2,5,24,4,1,2,36);
+    EXPECT_ANY_THROW(Mat1f z_float; cv::solveLP(A, B, z_float));
+    EXPECT_NO_THROW(Mat1d z_double; cv::solveLP(A, B, z_double));
+    EXPECT_ANY_THROW(Mat1i z_int; cv::solveLP(A, B, z_int));
+    //need to update interface: EXPECT_ANY_THROW(Mat1b z_8u; cv::solveLP(A, B, z_8u));
+}
+
 }} // namespace
--- a/modules/core/test/test_mat.cpp
+++ b/modules/core/test/test_mat.cpp
@ -1872,4 +1872,63 @@ TEST(Core_Split, crash_12171)
    EXPECT_EQ(2, dst2.ptr<uchar>(1)[1]);
 }

+struct CustomType  // like cv::Keypoint
+{
+    Point2f pt;
+    float size;
+    float angle;
+    float response;
+    int octave;
+    int class_id;
+};
+
+static void test_CustomType(InputArray src_, OutputArray dst_)
+{
+    Mat src = src_.getMat();
+    ASSERT_EQ(sizeof(CustomType), src.elemSize());
+    CV_CheckTypeEQ(src.type(), CV_MAKETYPE(CV_8U, sizeof(CustomType)), "");
+
+    CustomType* kpt = NULL;
+    {
+        Mat dst = dst_.getMat();
+        for (size_t i = 0; i < dst.total(); i++)
+        {
+            kpt = dst.ptr<CustomType>(0) + i;
+            kpt->octave = (int)i;
+        }
+    }
+    const int N = (int)src.total();
+    dst_.create(1, N * 2, rawType<CustomType>());
+    Mat dst = dst_.getMat();
+    for (size_t i = N; i < dst.total(); i++)
+    {
+        kpt = dst.ptr<CustomType>(0) + i;
+        kpt->octave = -(int)i;
+    }
+#if 0 // Compilation error
+    CustomType& kpt = dst.at<CustomType>(0, 5);
+#endif
+}
+
+TEST(Core_InputArray, support_CustomType)
+{
+    std::vector<CustomType> kp1(5);
+    std::vector<CustomType> kp2(3);
+    test_CustomType(rawIn(kp1), rawOut(kp2));
+    ASSERT_EQ((size_t)10, kp2.size());
+    for (int i = 0; i < 3; i++)
+    {
+        EXPECT_EQ(i, kp2[i].octave);
+    }
+    for (int i = 3; i < 5; i++)
+    {
+        EXPECT_EQ(0, kp2[i].octave);
+    }
+    for (int i = 5; i < 10; i++)
+    {
+        EXPECT_EQ(-i, kp2[i].octave);
+    }
+}
+
+
 }} // namespace
--- a/modules/cudafeatures2d/test/test_features2d.cpp
+++ b/modules/cudafeatures2d/test/test_features2d.cpp
@ -222,7 +222,7 @@ CUDA_TEST_P(ORB, Accuracy)
        {
            std::vector<cv::KeyPoint> keypoints;
            cv::cuda::GpuMat descriptors;
-            orb->detectAndComputeAsync(loadMat(image), loadMat(mask), keypoints, descriptors);
+            orb->detectAndComputeAsync(loadMat(image), loadMat(mask), rawOut(keypoints), descriptors);
        }
        catch (const cv::Exception& e)
        {
--- a/modules/dnn/CMakeLists.txt
+++ b/modules/dnn/CMakeLists.txt
@ -95,7 +95,7 @@ ocv_glob_module_sources(${sources_options} SOURCES ${fw_srcs})
 ocv_create_module(${libs} ${INF_ENGINE_TARGET})
 ocv_add_samples()
 ocv_add_accuracy_tests(${INF_ENGINE_TARGET})
-ocv_add_perf_tests()
+ocv_add_perf_tests(${INF_ENGINE_TARGET})

 ocv_option(${the_module}_PERF_CAFFE "Add performance tests of Caffe framework" OFF)
 ocv_option(${the_module}_PERF_CLCAFFE "Add performance tests of clCaffe framework" OFF)
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@ -878,6 +878,14 @@ CV__DNN_INLINE_NS_BEGIN
    CV_EXPORTS_W void shrinkCaffeModel(const String& src, const String& dst,
                                       const std::vector<String>& layersTypes = std::vector<String>());

+    /** @brief Create a text representation for a binary network stored in protocol buffer format.
+     *  @param[in] model  A path to binary network.
+     *  @param[in] output A path to output text file to be created.
+     *
+     *  @note To reduce output file size, trained weights are not included.
+     */
+    CV_EXPORTS_W void writeTextGraph(const String& model, const String& output);
+
    /** @brief Performs non maximum suppression given boxes and corresponding scores.

     * @param bboxes a set of bounding boxes to apply NMS.
--- a/modules/dnn/misc/java/test/DnnListRegressionTest.java
+++ b/modules/dnn/misc/java/test/DnnListRegressionTest.java
@ -0,0 +1,119 @@
+package org.opencv.test.dnn;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import org.opencv.core.Core;
+import org.opencv.core.Mat;
+import org.opencv.core.MatOfInt;
+import org.opencv.core.MatOfFloat;
+import org.opencv.core.MatOfByte;
+import org.opencv.core.Scalar;
+import org.opencv.core.Size;
+import org.opencv.dnn.DictValue;
+import org.opencv.dnn.Dnn;
+import org.opencv.dnn.Layer;
+import org.opencv.dnn.Net;
+import org.opencv.imgcodecs.Imgcodecs;
+import org.opencv.imgproc.Imgproc;
+import org.opencv.test.OpenCVTestCase;
+
+/*
+*  regression test for #12324,
+*    testing various java.util.List invocations,
+*    which use the LIST_GET macro
+*/
+
+public class DnnListRegressionTest extends OpenCVTestCase {
+
+    private final static String ENV_OPENCV_DNN_TEST_DATA_PATH = "OPENCV_DNN_TEST_DATA_PATH";
+
+    private final static String ENV_OPENCV_TEST_DATA_PATH = "OPENCV_TEST_DATA_PATH";
+
+    String modelFileName = "";
+    String sourceImageFile = "";
+
+    Net net;
+
+    @Override
+    protected void setUp() throws Exception {
+        super.setUp();
+
+        String envDnnTestDataPath = System.getenv(ENV_OPENCV_DNN_TEST_DATA_PATH);
+
+        if(envDnnTestDataPath == null){
+            isTestCaseEnabled = false;
+            return;
+        }
+
+        File dnnTestDataPath = new File(envDnnTestDataPath);
+        modelFileName =  new File(dnnTestDataPath, "dnn/tensorflow_inception_graph.pb").toString();
+
+        String envTestDataPath = System.getenv(ENV_OPENCV_TEST_DATA_PATH);
+
+        if(envTestDataPath == null) throw new Exception(ENV_OPENCV_TEST_DATA_PATH + " has to be defined!");
+
+        File testDataPath = new File(envTestDataPath);
+
+        File f = new File(testDataPath, "dnn/grace_hopper_227.png");
+        sourceImageFile = f.toString();
+        if(!f.exists()) throw new Exception("Test image is missing: " + sourceImageFile);
+
+        net = Dnn.readNetFromTensorflow(modelFileName);
+
+        Mat image = Imgcodecs.imread(sourceImageFile);
+        assertNotNull("Loading image from file failed!", image);
+
+        Mat inputBlob = Dnn.blobFromImage(image, 1.0, new Size(224, 224), new Scalar(0), true, true);
+        assertNotNull("Converting image to blob failed!", inputBlob);
+
+        net.setInput(inputBlob, "input");
+    }
+
+    public void testSetInputsNames() {
+        List<String> inputs = new ArrayList();
+        inputs.add("input");
+        try {
+            net.setInputsNames(inputs);
+        } catch(Exception e) {
+            fail("Net setInputsNames failed: " + e.getMessage());
+        }
+    }
+
+    public void testForward() {
+        List<Mat> outs = new ArrayList();
+        List<String> outNames = new ArrayList();
+        outNames.add("softmax2");
+        try {
+            net.forward(outs,outNames);
+        } catch(Exception e) {
+            fail("Net forward failed: " + e.getMessage());
+        }
+    }
+
+    public void testGetMemoryConsumption() {
+        int layerId = 1;
+        List<MatOfInt> netInputShapes = new ArrayList();
+        netInputShapes.add(new MatOfInt(1, 3, 224, 224));
+        long[] weights=null;
+        long[] blobs=null;
+        try {
+            net.getMemoryConsumption(layerId, netInputShapes, weights, blobs);
+        } catch(Exception e) {
+            fail("Net getMemoryConsumption failed: " + e.getMessage());
+        }
+    }
+
+    public void testGetFLOPS() {
+        int layerId = 1;
+        List<MatOfInt> netInputShapes = new ArrayList();
+        netInputShapes.add(new MatOfInt(1, 3, 224, 224));
+        try {
+            net.getFLOPS(layerId, netInputShapes);
+        } catch(Exception e) {
+            fail("Net getFLOPS failed: " + e.getMessage());
+        }
+    }
+}
--- a/modules/dnn/perf/opencl/perf_convolution.cpp
+++ b/modules/dnn/perf/opencl/perf_convolution.cpp
@ -1,107 +0,0 @@
-#include "../perf_precomp.hpp"
-#include "opencv2/ts/ocl_perf.hpp"
-#include <opencv2/dnn/shape_utils.hpp>
-
-#ifdef HAVE_OPENCL
-
-namespace opencv_test { namespace ocl {
-using namespace ::perf;
-
-namespace {
-enum {STRIDE_OFF = 1, STRIDE_ON = 2};
-CV_ENUM(StrideSize, STRIDE_OFF, STRIDE_ON);
-
-enum {GROUP_OFF = 1, GROUP_2 = 2};
-CV_ENUM(GroupSize, GROUP_OFF, GROUP_2);
-} // namespace
-
-//Squared Size
-#define SSZ(n) cv::Size(n, n)
-
-typedef std::pair<MatShape, int> InpShapeNumOut;
-typedef tuple<Size, InpShapeNumOut, GroupSize, StrideSize> ConvParam; //kernel_size, inp shape, groups, stride
-typedef TestBaseWithParam<ConvParam> ConvolutionPerfTest;
-
-static inline MatShape blobShape(int count, int nplanes, int height, int width)
-{
-    int data[] = {count, nplanes, height, width};
-    return MatShape(data, data+4);
-}
-
-OCL_PERF_TEST_P( ConvolutionPerfTest, perf, Combine(
-    Values(Size(1, 1), Size(3, 3), Size(5, 5), Size(11, 11)),
-    Values(make_pair(blobShape(1,   4, 224, 224),  64),
-           make_pair(blobShape(1,  64, 112, 122), 128),
-           make_pair(blobShape(1, 256,  28,  28), 512)),
-    GroupSize::all(),
-    StrideSize::all())
-)
-{
-    RNG rng(0);
-
-    ConvParam params = GetParam();
-    int ksz     = get<0>(params).width;
-    MatShape inpShape = get<1>(params).first;
-    int outCn   = get<1>(params).second;
-    int groups  = get<2>(params);
-    int stride  = (ksz >= 11) ? 4 : (int)get<3>(params);
-
-    int inpCn = inpShape[1];
-    int wgtSize[] = { outCn, inpCn/groups, ksz, ksz };
-    int biasSize[] = { outCn, 1, 1, 1 };
-    const int wtype = CV_32F;
-    Mat wgtBlob(4, wgtSize, wtype), biasBlob(4, biasSize, wtype);
-    Mat inpBlob(4, &inpShape[0], wtype);
-    rng.fill(biasBlob, RNG::UNIFORM, -1, +1);
-    rng.fill(wgtBlob, RNG::UNIFORM, -1, +1);
-    rng.fill(inpBlob, RNG::UNIFORM, -1, +1);
-
-    LayerParams lp;
-    lp.set("num_output", outCn);
-    lp.set("group", groups);
-    lp.set("stride", stride);
-    lp.set("kernel_size", ksz);
-    lp.blobs.reserve(2);
-    lp.blobs.push_back(wgtBlob);
-    lp.blobs.push_back(biasBlob);
-
-    std::vector<Mat*> inpBlobs(1, &inpBlob);
-    std::vector<Mat> outBlobs, internalBlobs;
-
-    Ptr<Layer> layer = cv::dnn::LayerFactory::createLayerInstance("Convolution", lp);
-    std::vector<MatShape> inputShapes(1, shape(inpBlob)), outShapes, internals;
-    layer->getMemoryShapes(inputShapes, 0, outShapes, internals);
-    for (size_t i = 0; i < outShapes.size(); i++)
-    {
-        outBlobs.push_back(Mat(outShapes[i], CV_32F));
-    }
-    for (size_t i = 0; i < internals.size(); i++)
-    {
-        internalBlobs.push_back(Mat());
-        if (total(internals[i]))
-            internalBlobs.back().create(internals[i], CV_32F);
-    }
-
-    layer->finalize(inpBlobs, outBlobs);
-    layer->preferableTarget = DNN_TARGET_OPENCL;
-
-    Mat inpBlob2D = inpBlob.reshape(1, outCn);
-    Mat wgtBlob2D = wgtBlob.reshape(1, outCn*(inpCn/groups));
-    Mat outBlob2D = outBlobs[0].reshape(1, outBlobs[0].size[0]);
-    declare.in(inpBlob2D, wgtBlob2D, WARMUP_RNG).out(outBlob2D);
-
-    // warmup
-    layer->forward(inpBlobs, outBlobs, internalBlobs);
-
-    TEST_CYCLE()
-    {
-        layer->forward(inpBlobs, outBlobs, internalBlobs);
-    }
-
-    SANITY_CHECK_NOTHING();
-}
-
-}
-}
-
-#endif
--- a/modules/dnn/perf/perf_convolution.cpp
+++ b/modules/dnn/perf/perf_convolution.cpp
@ -1,92 +1,674 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
 #include "perf_precomp.hpp"
 #include <opencv2/dnn/shape_utils.hpp>

 namespace opencv_test {

-enum {STRIDE_OFF = 1, STRIDE_ON = 2};
-CV_ENUM(StrideSize, STRIDE_OFF, STRIDE_ON);
-
-enum {GROUP_OFF = 1, GROUP_2 = 2};
-CV_ENUM(GroupSize, GROUP_OFF, GROUP_2);
-
-typedef std::pair<MatShape, int> InpShapeNumOut;
-typedef tuple<Size, InpShapeNumOut, GroupSize, StrideSize> ConvParam; //kernel_size, inp shape, groups, stride
-typedef TestBaseWithParam<ConvParam> ConvolutionPerfTest;
-
-static inline MatShape blobShape(int count, int nplanes, int height, int width)
+// Flops_Kernel_Input_OutCN_Group_Stride_Pad_Dilation_PadAdjust_PadMode_Bias
+struct TestSize_ {
+    int width, height;
+    operator Size() const { return Size(width, height); }
+};
+struct ConvParam_t {
+    struct TestSize_ kernel;
+    struct BlobShape { int dims[4]; } shapeIn;
+    int outCN;
+    int groups;
+    struct TestSize_ stride;
+    struct TestSize_ dilation;
+    struct TestSize_ pad;
+    struct TestSize_ padAdjust;
+    const char* padMode;
+    bool hasBias;
+    double declared_flops;
+};
+// Details: #12142
+static const ConvParam_t testConvolutionConfigs[] = {
+    /* GFLOPS 10.087 x 1 = 10.087 */ {{3, 3}, {{1, 576, 38, 50}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 10086963200.},
+    /* GFLOPS 1.704 x 5 = 8.518 */ {{3, 3}, {{1, 512, 19, 19}}, 512, 512, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1703596544.},
+    /* GFLOPS 1.704 x 5 = 8.518 */ {{3, 3}, {{1, 512, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1703596544.},
+    /* GFLOPS 6.641 x 1 = 6.641 */ {{3, 3}, {{1, 64, 150, 200}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 6641280000.},
+    /* GFLOPS 1.659 x 3 = 4.977 */ {{3, 3}, {{1, 960, 10, 10}}, 960, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1658976000.},
+    /* GFLOPS 2.156 x 2 = 4.312 */ {{3, 3}, {{1, 576, 19, 19}}, 576, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 2156088384.},
+    /* GFLOPS 0.958 x 4 = 3.833 */ {{3, 3}, {{1, 384, 19, 19}}, 384, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 958307712.},
+    /* GFLOPS 0.830 x 4 = 3.321 */ {{3, 3}, {{1, 64, 75, 100}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 830160000.},
+    /* GFLOPS 1.245 x 2 = 2.490 */ {{3, 3}, {{1, 96, 75, 100}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1244880000.},
+    /* GFLOPS 2.100 x 1 = 2.100 */ {{3, 3}, {{1, 144, 75, 75}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 2100330000.},
+    /* GFLOPS 1.022 x 2 = 2.044 */ {{3, 3}, {{1, 576, 19, 19}}, 273, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1021896057.},
+    /* GFLOPS 0.958 x 2 = 1.917 */ {{3, 3}, {{1, 192, 38, 38}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 958446336.},
+    /* GFLOPS 1.888 x 1 = 1.888 */ {{3, 3}, {{1, 1024, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1887539200.},
+    /* GFLOPS 1.888 x 1 = 1.888 */ {{3, 3}, {{1, 1024, 10, 10}}, 1024, 1024, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1887539200.},
+    /* GFLOPS 1.704 x 1 = 1.704 */ {{3, 3}, {{1, 256, 38, 38}}, 256, 256, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1703781376.},
+    /* GFLOPS 1.704 x 1 = 1.704 */ {{3, 3}, {{1, 256, 38, 38}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1703781376.},
+    /* GFLOPS 1.660 x 1 = 1.660 */ {{3, 3}, {{1, 128, 75, 75}}, 128, 128, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1659600000.},
+    /* GFLOPS 1.660 x 1 = 1.660 */ {{3, 3}, {{1, 128, 75, 75}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1659600000.},
+    /* GFLOPS 0.280 x 5 = 1.402 */ {{1, 1}, {{1, 576, 38, 50}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 280409600.},
+    /* GFLOPS 0.701 x 2 = 1.401 */ {{3, 3}, {{1, 128, 38, 50}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 700720000.},
+    /* GFLOPS 0.231 x 6 = 1.388 */ {{3, 3}, {{1, 128, 56, 56}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231311360.},
+    /* GFLOPS 0.231 x 6 = 1.388 */ {{3, 3}, {{1, 256, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231261184.},
+    /* GFLOPS 0.210 x 6 = 1.262 */ {{1, 1}, {{1, 576, 38, 50}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 210307200.},
+    /* GFLOPS 0.420 x 3 = 1.261 */ {{3, 3}, {{1, 96, 38, 50}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 420492800.},
+    /* GFLOPS 1.261 x 1 = 1.261 */ {{3, 3}, {{1, 192, 38, 50}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1261113600.},
+    /* GFLOPS 1.258 x 1 = 1.258 */ {{3, 3}, {{1, 1280, 10, 10}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1258038600.},
+    /* GFLOPS 1.245 x 1 = 1.245 */ {{3, 3}, {{1, 64, 75, 75}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1245240000.},
+    /* GFLOPS 0.561 x 2 = 1.121 */ {{3, 3}, {{1, 128, 38, 50}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 560576000.},
+    /* GFLOPS 1.051 x 1 = 1.051 */ {{3, 3}, {{1, 160, 38, 50}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1050988800.},
+    /* GFLOPS 1.006 x 1 = 1.006 */ {{3, 3}, {{1, 1024, 10, 10}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1006441800.},
+    /* GFLOPS 0.246 x 4 = 0.985 */ {{1, 1}, {{1, 256, 75, 100}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 246240000.},
+    /* GFLOPS 0.189 x 5 = 0.947 */ {{1, 1}, {{1, 512, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 189452800.},
+    /* GFLOPS 0.189 x 5 = 0.947 */ {{1, 1}, {{1, 512, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 189452800.},
+    /* GFLOPS 0.934 x 1 = 0.934 */ {{3, 3}, {{1, 96, 150, 150}}, 96, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 933660000.},
+    /* GFLOPS 0.231 x 4 = 0.925 */ {{3, 3}, {{1, 128, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231311360.},
+    /* GFLOPS 0.896 x 1 = 0.896 */ {{5, 5}, {{1, 96, 27, 27}}, 256, 2, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 895981824.},
+    /* GFLOPS 0.876 x 1 = 0.876 */ {{3, 3}, {{1, 160, 38, 50}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 875824000.},
+    /* GFLOPS 0.850 x 1 = 0.850 */ {{7, 7}, {{1, 3, 600, 800}}, 24, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 849600000.},
+    /* GFLOPS 0.841 x 1 = 0.841 */ {{3, 3}, {{1, 128, 38, 50}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 840864000.},
+    /* GFLOPS 0.415 x 2 = 0.831 */ {{3, 3}, {{1, 32, 150, 150}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 415440000.},
+    /* GFLOPS 0.351 x 2 = 0.701 */ {{1, 1}, {{1, 576, 38, 50}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 350512000.},
+    /* GFLOPS 0.701 x 1 = 0.701 */ {{3, 3}, {{1, 128, 75, 100}}, 160, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 700720000.},
+    /* GFLOPS 0.694 x 1 = 0.694 */ {{3, 3}, {{1, 64, 56, 56}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 694235136.},
+    /* GFLOPS 0.694 x 1 = 0.694 */ {{3, 3}, {{1, 64, 56, 56}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 694235136.},
+    /* GFLOPS 0.231 x 3 = 0.694 */ {{3, 3}, {{1, 64, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231411712.},
+    /* GFLOPS 0.058 x 12 = 0.694 */ {{3, 3}, {{1, 128, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 57827840.},
+    /* GFLOPS 0.231 x 3 = 0.694 */ {{3, 3}, {{1, 512, 7, 7}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231236096.},
+    /* GFLOPS 0.160 x 4 = 0.639 */ {{3, 3}, {{1, 64, 38, 38}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 159833472.},
+    /* GFLOPS 0.103 x 6 = 0.618 */ {{1, 1}, {{1, 256, 14, 14}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102961152.},
+    /* GFLOPS 0.615 x 1 = 0.615 */ {{1, 1}, {{1, 320, 75, 100}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 615360000.},
+    /* GFLOPS 0.597 x 1 = 0.597 */ {{3, 3}, {{1, 576, 19, 19}}, 576, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 597254400.},
+    /* GFLOPS 0.185 x 3 = 0.554 */ {{1, 1}, {{1, 192, 75, 100}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 184800000.},
+    /* GFLOPS 0.553 x 1 = 0.553 */ {{3, 3}, {{1, 64, 75, 100}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 553440000.},
+    /* GFLOPS 0.539 x 1 = 0.539 */ {{3, 3}, {{1, 144, 75, 75}}, 144, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 539178048.},
+    /* GFLOPS 0.103 x 5 = 0.514 */ {{1, 1}, {{1, 1024, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102810624.},
+    /* GFLOPS 0.491 x 1 = 0.491 */ {{1, 1}, {{1, 576, 38, 50}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 490716800.},
+    /* GFLOPS 0.240 x 2 = 0.479 */ {{3, 3}, {{1, 96, 38, 38}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 239680896.},
+    /* GFLOPS 0.237 x 2 = 0.474 */ {{7, 7}, {{1, 3, 224, 224}}, 64, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", true, 236830720.},
+    /* GFLOPS 0.472 x 1 = 0.472 */ {{3, 3}, {{1, 512, 19, 19}}, 512, 512, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 471910400.},
+    /* GFLOPS 0.472 x 1 = 0.472 */ {{3, 3}, {{1, 512, 19, 19}}, 512, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 471910400.},
+    /* GFLOPS 0.449 x 1 = 0.449 */ {{3, 3}, {{1, 384, 13, 13}}, 384, 2, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 448626048.},
+    /* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 128, 75, 75}}, 128, 128, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 426037760.},
+    /* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 128, 75, 75}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 426037760.},
+    /* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 128, 38, 38}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 426037760.},
+    /* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 256, 38, 38}}, 256, 256, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 425945344.},
+    /* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 256, 38, 38}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 425945344.},
+    /* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 256, 19, 19}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 425945344.},
+    /* GFLOPS 0.421 x 1 = 0.421 */ {{1, 1}, {{1, 576, 38, 50}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 420614400.},
+    /* GFLOPS 0.415 x 1 = 0.415 */ {{3, 3}, {{1, 32, 150, 150}}, 32, 32, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 415440000.},
+    /* GFLOPS 0.415 x 1 = 0.415 */ {{3, 3}, {{1, 64, 150, 150}}, 64, 64, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 415080000.},
+    /* GFLOPS 0.415 x 1 = 0.415 */ {{3, 3}, {{1, 64, 150, 150}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 415080000.},
+    /* GFLOPS 0.104 x 4 = 0.414 */ {{1, 1}, {{1, 64, 56, 56}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 103563264.},
+    /* GFLOPS 0.103 x 4 = 0.413 */ {{1, 1}, {{1, 128, 28, 28}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 103161856.},
+    /* GFLOPS 0.376 x 1 = 0.376 */ {{1, 1}, {{1, 24, 300, 400}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 376320000.},
+    /* GFLOPS 0.347 x 1 = 0.347 */ {{3, 3}, {{1, 128, 28, 28}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 346967040.},
+    /* GFLOPS 0.347 x 1 = 0.347 */ {{3, 3}, {{1, 128, 28, 28}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 346967040.},
+    /* GFLOPS 0.014 x 24 = 0.347 */ {{3, 3}, {{1, 128, 14, 14}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 14456960.},
+    /* GFLOPS 0.053 x 6 = 0.320 */ {{1, 1}, {{1, 576, 19, 19}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 53277824.},
+    /* GFLOPS 0.319 x 1 = 0.319 */ {{3, 3}, {{1, 192, 19, 19}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 319482112.},
+    /* GFLOPS 0.315 x 1 = 0.315 */ {{3, 3}, {{1, 96, 75, 100}}, 96, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 315369600.},
+    /* GFLOPS 0.103 x 3 = 0.309 */ {{1, 1}, {{1, 512, 7, 7}}, 2048, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102860800.},
+    /* GFLOPS 0.103 x 3 = 0.309 */ {{1, 1}, {{1, 512, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102860800.},
+    /* GFLOPS 0.308 x 1 = 0.308 */ {{1, 1}, {{1, 320, 75, 100}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 307680000.},
+    /* GFLOPS 0.299 x 1 = 0.299 */ {{3, 3}, {{1, 256, 13, 13}}, 384, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 299105664.},
+    /* GFLOPS 0.299 x 1 = 0.299 */ {{3, 3}, {{1, 384, 13, 13}}, 256, 2, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 299084032.},
+    /* GFLOPS 0.017 x 17 = 0.290 */ {{1, 1}, {{1, 32, 32, 64}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 17039360.},
+    /* GFLOPS 0.017 x 16 = 0.269 */ {{1, 1}, {{1, 128, 32, 64}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 16842752.},
+    /* GFLOPS 0.133 x 2 = 0.266 */ {{3, 3}, {{1, 128, 19, 19}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 133136800.},
+    /* GFLOPS 0.038 x 7 = 0.265 */ {{3, 3}, {{1, 16, 64, 128}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 37879808.},
+    /* GFLOPS 0.126 x 2 = 0.252 */ {{3, 3}, {{1, 512, 5, 5}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 125812050.},
+    /* GFLOPS 0.248 x 1 = 0.248 */ {{1, 1}, {{1, 64, 150, 200}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 247680000.},
+    /* GFLOPS 0.040 x 6 = 0.240 */ {{1, 1}, {{1, 576, 19, 19}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 39958368.},
+    /* GFLOPS 0.080 x 3 = 0.240 */ {{3, 3}, {{1, 96, 19, 19}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 79893632.},
+    /* GFLOPS 0.240 x 1 = 0.240 */ {{3, 3}, {{1, 192, 38, 38}}, 192, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 239611584.},
+    /* GFLOPS 0.240 x 1 = 0.240 */ {{3, 3}, {{1, 192, 19, 19}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 239611584.},
+    /* GFLOPS 0.237 x 1 = 0.237 */ {{7, 7}, {{1, 3, 224, 224}}, 64, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", false, 236830720.},
+    /* GFLOPS 0.237 x 1 = 0.237 */ {{7, 7}, {{1, 3, 224, 224}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 236830720.},
+    /* GFLOPS 0.111 x 2 = 0.221 */ {{3, 3}, {{1, 192, 10, 10}}, 320, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 110624000.},
+    /* GFLOPS 0.213 x 1 = 0.213 */ {{3, 3}, {{1, 128, 38, 38}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 213018880.},
+    /* GFLOPS 0.213 x 1 = 0.213 */ {{3, 3}, {{1, 128, 19, 19}}, 256, 1, {1, 1}, {2, 2}, {2, 2}, {0, 0}, "", false, 213018880.},
+    /* GFLOPS 0.107 x 2 = 0.213 */ {{3, 3}, {{1, 128, 19, 19}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 106509440.},
+    /* GFLOPS 0.213 x 1 = 0.213 */ {{3, 3}, {{1, 256, 19, 19}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 212972672.},
+    /* GFLOPS 0.212 x 1 = 0.212 */ {{7, 7}, {{1, 3, 300, 300}}, 32, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", true, 212400000.},
+    /* GFLOPS 0.211 x 1 = 0.211 */ {{11, 11}, {{1, 3, 227, 227}}, 96, 1, {4, 4}, {1, 1}, {0, 0}, {0, 0}, "", true, 211120800.},
+    /* GFLOPS 0.210 x 1 = 0.210 */ {{3, 3}, {{1, 64, 38, 50}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 210307200.},
+    /* GFLOPS 0.210 x 1 = 0.210 */ {{1, 1}, {{1, 1024, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 209817600.},
+    /* GFLOPS 0.210 x 1 = 0.210 */ {{1, 1}, {{1, 1024, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 209817600.},
+    /* GFLOPS 0.104 x 2 = 0.208 */ {{3, 3}, {{1, 32, 75, 75}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 103860000.},
+    /* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 256, 56, 56}}, 512, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 205922304.},
+    /* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 256, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 205922304.},
+    /* GFLOPS 0.103 x 2 = 0.206 */ {{1, 1}, {{1, 256, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102961152.},
+    /* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 512, 28, 28}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 205721600.},
+    /* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 512, 28, 28}}, 1024, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 205721600.},
+    /* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 1024, 14, 14}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 205621248.},
+    /* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 1024, 14, 14}}, 2048, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 205621248.},
+    /* GFLOPS 0.103 x 2 = 0.206 */ {{1, 1}, {{1, 2048, 7, 7}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102785536.},
+    /* GFLOPS 0.201 x 1 = 0.201 */ {{1, 1}, {{1, 512, 14, 14}}, 1000, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 200900000.},
+    /* GFLOPS 0.200 x 1 = 0.200 */ {{3, 3}, {{1, 160, 19, 19}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 199687872.},
+    /* GFLOPS 0.190 x 1 = 0.190 */ {{1, 1}, {{1, 256, 38, 38}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 189637632.},
+    /* GFLOPS 0.190 x 1 = 0.190 */ {{1, 1}, {{1, 256, 38, 38}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 189637632.},
+    /* GFLOPS 0.047 x 4 = 0.190 */ {{1, 1}, {{1, 256, 38, 38}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 47409408.},
+    /* GFLOPS 0.038 x 5 = 0.189 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 37814272.},
+    /* GFLOPS 0.185 x 1 = 0.185 */ {{1, 1}, {{1, 128, 75, 75}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 185040000.},
+    /* GFLOPS 0.185 x 1 = 0.185 */ {{1, 1}, {{1, 128, 75, 75}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 185040000.},
+    /* GFLOPS 0.181 x 1 = 0.181 */ {{3, 3}, {{1, 160, 14, 14}}, 320, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 180696320.},
+    /* GFLOPS 0.181 x 1 = 0.181 */ {{3, 3}, {{1, 160, 14, 14}}, 320, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 180696320.},
+    /* GFLOPS 0.090 x 2 = 0.181 */ {{3, 3}, {{1, 224, 10, 10}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 90339200.},
+    /* GFLOPS 0.180 x 1 = 0.180 */ {{1, 1}, {{1, 224, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 180232192.},
+    /* GFLOPS 0.174 x 1 = 0.174 */ {{3, 3}, {{1, 96, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 173508608.},
+    /* GFLOPS 0.174 x 1 = 0.174 */ {{3, 3}, {{1, 96, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 173508608.},
+    /* GFLOPS 0.166 x 1 = 0.166 */ {{3, 3}, {{1, 160, 19, 19}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 166406560.},
+    /* GFLOPS 0.080 x 2 = 0.160 */ {{1, 1}, {{1, 576, 19, 19}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 79916736.},
+    /* GFLOPS 0.160 x 1 = 0.160 */ {{3, 3}, {{1, 128, 19, 19}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 159764160.},
+    /* GFLOPS 0.159 x 1 = 0.159 */ {{7, 7}, {{1, 3, 300, 300}}, 24, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 159300000.},
+    /* GFLOPS 0.155 x 1 = 0.155 */ {{1, 1}, {{1, 192, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 154542080.},
+    /* GFLOPS 0.146 x 1 = 0.146 */ {{3, 3}, {{1, 144, 14, 14}}, 288, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 146369664.},
+    /* GFLOPS 0.146 x 1 = 0.146 */ {{3, 3}, {{1, 144, 14, 14}}, 288, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 146369664.},
+    /* GFLOPS 0.072 x 2 = 0.144 */ {{1, 1}, {{1, 1024, 10, 10}}, 352, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 72124800.},
+    /* GFLOPS 0.140 x 1 = 0.140 */ {{1, 1}, {{1, 576, 38, 50}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 140204800.},
+    /* GFLOPS 0.017 x 8 = 0.138 */ {{1, 1}, {{1, 16, 64, 128}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 17301504.},
+    /* GFLOPS 0.067 x 2 = 0.133 */ {{1, 1}, {{1, 576, 19, 19}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 66597280.},
+    /* GFLOPS 0.133 x 1 = 0.133 */ {{3, 3}, {{1, 128, 38, 38}}, 160, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 133136800.},
+    /* GFLOPS 0.129 x 1 = 0.129 */ {{1, 1}, {{1, 160, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 128851968.},
+    /* GFLOPS 0.128 x 1 = 0.128 */ {{3, 3}, {{1, 64, 24, 24}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 127512576.},
+    /* GFLOPS 0.120 x 1 = 0.120 */ {{5, 5}, {{1, 32, 28, 28}}, 96, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 120497664.},
+    /* GFLOPS 0.120 x 1 = 0.120 */ {{5, 5}, {{1, 32, 28, 28}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 120497664.},
+    /* GFLOPS 0.040 x 3 = 0.120 */ {{1, 1}, {{1, 96, 19, 19}}, 576, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 40131648.},
+    /* GFLOPS 0.118 x 1 = 0.118 */ {{1, 1}, {{1, 320, 38, 38}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 118477312.},
+    /* GFLOPS 0.017 x 7 = 0.118 */ {{1, 1}, {{1, 64, 64, 128}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 16908288.},
+    /* GFLOPS 0.039 x 3 = 0.118 */ {{1, 1}, {{1, 1024, 10, 10}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 39340800.},
+    /* GFLOPS 0.118 x 1 = 0.118 */ {{3, 3}, {{1, 256, 19, 19}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 117990400.},
+    /* GFLOPS 0.058 x 2 = 0.116 */ {{3, 3}, {{1, 16, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 58003456.},
+    /* GFLOPS 0.058 x 2 = 0.116 */ {{3, 3}, {{1, 32, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 57903104.},
+    /* GFLOPS 0.058 x 2 = 0.116 */ {{3, 3}, {{1, 64, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 57852928.},
+    /* GFLOPS 0.116 x 1 = 0.116 */ {{3, 3}, {{1, 128, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 115655680.},
+    /* GFLOPS 0.116 x 1 = 0.116 */ {{3, 3}, {{1, 128, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 115655680.},
+    /* GFLOPS 0.112 x 1 = 0.112 */ {{1, 1}, {{1, 1024, 10, 10}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 111875400.},
+    /* GFLOPS 0.036 x 3 = 0.107 */ {{1, 1}, {{1, 192, 38, 38}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 35580160.},
+    /* GFLOPS 0.107 x 1 = 0.107 */ {{3, 3}, {{1, 32, 75, 75}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 106648064.},
+    /* GFLOPS 0.107 x 1 = 0.107 */ {{3, 3}, {{1, 64, 38, 38}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 106555648.},
+    /* GFLOPS 0.105 x 1 = 0.105 */ {{1, 1}, {{1, 512, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 104960000.},
+    /* GFLOPS 0.105 x 1 = 0.105 */ {{1, 1}, {{1, 512, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 104960000.},
+    /* GFLOPS 0.103 x 1 = 0.103 */ {{1, 1}, {{1, 128, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 103161856.},
+    /* GFLOPS 0.051 x 2 = 0.103 */ {{1, 1}, {{1, 256, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 51480576.},
+    /* GFLOPS 0.051 x 2 = 0.103 */ {{1, 1}, {{1, 256, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 51480576.},
+    /* GFLOPS 0.101 x 1 = 0.101 */ {{1, 1}, {{1, 512, 19, 19}}, 273, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 101016825.},
+    /* GFLOPS 0.096 x 1 = 0.096 */ {{1, 1}, {{1, 480, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 96438272.},
+    /* GFLOPS 0.095 x 1 = 0.095 */ {{1, 1}, {{1, 128, 38, 38}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 95003648.},
+    /* GFLOPS 0.095 x 1 = 0.095 */ {{1, 1}, {{1, 128, 38, 38}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 95003648.},
+    /* GFLOPS 0.095 x 1 = 0.095 */ {{1, 1}, {{1, 256, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 94818816.},
+    /* GFLOPS 0.095 x 1 = 0.095 */ {{1, 1}, {{1, 256, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 94818816.},
+    /* GFLOPS 0.094 x 1 = 0.094 */ {{1, 1}, {{1, 32, 150, 150}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 93600000.},
+    /* GFLOPS 0.094 x 1 = 0.094 */ {{1, 1}, {{1, 32, 150, 150}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 93600000.},
+    /* GFLOPS 0.093 x 1 = 0.093 */ {{1, 1}, {{1, 512, 38, 50}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 93480000.},
+    /* GFLOPS 0.093 x 1 = 0.093 */ {{1, 1}, {{1, 576, 19, 19}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 93236192.},
+    /* GFLOPS 0.093 x 1 = 0.093 */ {{1, 1}, {{1, 64, 75, 75}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 92880000.},
+    /* GFLOPS 0.093 x 1 = 0.093 */ {{1, 1}, {{1, 64, 75, 75}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 92880000.},
+    /* GFLOPS 0.031 x 3 = 0.092 */ {{1, 1}, {{1, 160, 10, 10}}, 960, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 30816000.},
+    /* GFLOPS 0.092 x 1 = 0.092 */ {{1, 1}, {{1, 192, 75, 100}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 92400000.},
+    /* GFLOPS 0.090 x 1 = 0.090 */ {{1, 1}, {{1, 448, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 90015744.},
+    /* GFLOPS 0.045 x 2 = 0.090 */ {{3, 3}, {{1, 576, 19, 19}}, 12, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 44918508.},
+    /* GFLOPS 0.089 x 1 = 0.089 */ {{3, 3}, {{1, 112, 14, 14}}, 224, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 88554368.},
+    /* GFLOPS 0.089 x 1 = 0.089 */ {{3, 3}, {{1, 112, 14, 14}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 88554368.},
+    /* GFLOPS 0.021 x 4 = 0.084 */ {{5, 1}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {1, 1}, {2, 0}, {0, 0}, "", false, 21037056.},
+    /* GFLOPS 0.021 x 4 = 0.084 */ {{1, 5}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {1, 1}, {0, 2}, {0, 0}, "", true, 21037056.},
+    /* GFLOPS 0.084 x 1 = 0.084 */ {{1, 1}, {{1, 416, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 83593216.},
+    /* GFLOPS 0.082 x 1 = 0.082 */ {{1, 1}, {{1, 320, 10, 10}}, 1280, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 82048000.},
+    /* GFLOPS 0.040 x 2 = 0.080 */ {{1, 1}, {{1, 576, 19, 19}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 39958368.},
+    /* GFLOPS 0.040 x 2 = 0.079 */ {{1, 1}, {{1, 24, 75, 75}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 39690000.},
+    /* GFLOPS 0.040 x 2 = 0.079 */ {{3, 3}, {{1, 3, 300, 300}}, 32, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 39600000.},
+    /* GFLOPS 0.077 x 1 = 0.077 */ {{1, 1}, {{1, 96, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 77471744.},
+    /* GFLOPS 0.077 x 1 = 0.077 */ {{3, 3}, {{1, 192, 10, 10}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 77436800.},
+    /* GFLOPS 0.077 x 1 = 0.077 */ {{1, 1}, {{1, 384, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 77170688.},
+    /* GFLOPS 0.038 x 2 = 0.076 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {8, 8}, {8, 8}, {0, 0}, "", true, 37814272.},
+    /* GFLOPS 0.038 x 2 = 0.076 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {4, 4}, {4, 4}, {0, 0}, "", true, 37814272.},
+    /* GFLOPS 0.038 x 2 = 0.076 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {2, 2}, {2, 2}, {0, 0}, "", true, 37814272.},
+    /* GFLOPS 0.038 x 2 = 0.076 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {16, 16}, {16, 16}, {0, 0}, "", true, 37814272.},
+    /* GFLOPS 0.018 x 4 = 0.072 */ {{1, 1}, {{1, 64, 19, 19}}, 384, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 17882496.},
+    /* GFLOPS 0.071 x 1 = 0.071 */ {{1, 1}, {{1, 16, 150, 150}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 71280000.},
+    /* GFLOPS 0.071 x 1 = 0.071 */ {{1, 1}, {{1, 352, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 70748160.},
+    /* GFLOPS 0.071 x 1 = 0.071 */ {{1, 1}, {{1, 24, 150, 150}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 70560000.},
+    /* GFLOPS 0.070 x 1 = 0.070 */ {{3, 3}, {{1, 96, 14, 14}}, 208, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 70487872.},
+    /* GFLOPS 0.069 x 1 = 0.069 */ {{3, 3}, {{1, 96, 14, 14}}, 204, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 69132336.},
+    /* GFLOPS 0.066 x 1 = 0.066 */ {{1, 1}, {{1, 1280, 10, 10}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 65561600.},
+    /* GFLOPS 0.033 x 2 = 0.065 */ {{3, 3}, {{1, 48, 14, 14}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 32551680.},
+    /* GFLOPS 0.065 x 1 = 0.065 */ {{3, 3}, {{1, 192, 7, 7}}, 384, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 65046912.},
+    /* GFLOPS 0.065 x 1 = 0.065 */ {{3, 3}, {{1, 192, 7, 7}}, 384, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 65046912.},
+    /* GFLOPS 0.065 x 1 = 0.065 */ {{3, 3}, {{1, 160, 10, 10}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 64534400.},
+    /* GFLOPS 0.064 x 1 = 0.064 */ {{1, 1}, {{1, 320, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 64325632.},
+    /* GFLOPS 0.032 x 2 = 0.064 */ {{3, 3}, {{1, 96, 12, 12}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 31868928.},
+    /* GFLOPS 0.061 x 1 = 0.061 */ {{1, 1}, {{1, 960, 10, 10}}, 320, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 61472000.},
+    /* GFLOPS 0.031 x 2 = 0.061 */ {{1, 1}, {{1, 960, 10, 10}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 30736000.},
+    /* GFLOPS 0.060 x 1 = 0.060 */ {{3, 3}, {{1, 96, 38, 38}}, 96, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 59920224.},
+    /* GFLOPS 0.059 x 1 = 0.059 */ {{1, 1}, {{1, 320, 38, 38}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 59238656.},
+    /* GFLOPS 0.059 x 1 = 0.059 */ {{3, 3}, {{1, 128, 19, 19}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 59008000.},
+    /* GFLOPS 0.059 x 1 = 0.059 */ {{3, 3}, {{1, 256, 10, 10}}, 512, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 58995200.},
+    /* GFLOPS 0.059 x 1 = 0.059 */ {{3, 3}, {{1, 256, 10, 10}}, 512, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 58995200.},
+    /* GFLOPS 0.059 x 1 = 0.059 */ {{3, 3}, {{1, 256, 10, 10}}, 512, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 58995200.},
+    /* GFLOPS 0.058 x 1 = 0.058 */ {{1, 1}, {{1, 288, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 57903104.},
+    /* GFLOPS 0.004 x 16 = 0.058 */ {{3, 3}, {{1, 128, 7, 7}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 3614240.},
+    /* GFLOPS 0.055 x 1 = 0.055 */ {{3, 3}, {{1, 1280, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 55298400.},
+    /* GFLOPS 0.018 x 3 = 0.054 */ {{1, 1}, {{1, 32, 38, 38}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 18021120.},
+    /* GFLOPS 0.018 x 3 = 0.053 */ {{1, 1}, {{1, 384, 19, 19}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 17766976.},
+    /* GFLOPS 0.053 x 1 = 0.053 */ {{3, 3}, {{1, 128, 38, 38}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 53254720.},
+    /* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 528, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 53036032.},
+    /* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 528, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 53036032.},
+    /* GFLOPS 0.052 x 1 = 0.052 */ {{1, 1}, {{1, 1024, 10, 10}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 52454400.},
+    /* GFLOPS 0.052 x 1 = 0.052 */ {{1, 1}, {{1, 1024, 10, 10}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 52454400.},
+    /* GFLOPS 0.052 x 1 = 0.052 */ {{1, 1}, {{1, 1024, 10, 10}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 52454400.},
+    /* GFLOPS 0.026 x 2 = 0.052 */ {{1, 1}, {{1, 1024, 10, 10}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 26227200.},
+    /* GFLOPS 0.052 x 1 = 0.052 */ {{1, 1}, {{1, 64, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 51781632.},
+    /* GFLOPS 0.051 x 1 = 0.051 */ {{1, 1}, {{1, 256, 56, 56}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 51480576.},
+    /* GFLOPS 0.051 x 1 = 0.051 */ {{1, 1}, {{1, 256, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 51480576.},
+    /* GFLOPS 0.051 x 1 = 0.051 */ {{1, 1}, {{1, 512, 28, 28}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 51430400.},
+    /* GFLOPS 0.026 x 2 = 0.051 */ {{1, 1}, {{1, 512, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 25715200.},
+    /* GFLOPS 0.026 x 2 = 0.051 */ {{1, 1}, {{1, 512, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 25715200.},
+    /* GFLOPS 0.013 x 4 = 0.051 */ {{1, 1}, {{1, 512, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 12857600.},
+    /* GFLOPS 0.051 x 1 = 0.051 */ {{1, 1}, {{1, 1024, 14, 14}}, 512, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 51405312.},
+    /* GFLOPS 0.050 x 1 = 0.050 */ {{1, 1}, {{1, 992, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 49799680.},
+    /* GFLOPS 0.048 x 1 = 0.048 */ {{1, 1}, {{1, 960, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 48194048.},
+    /* GFLOPS 0.047 x 1 = 0.047 */ {{1, 1}, {{1, 256, 19, 19}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 47409408.},
+    /* GFLOPS 0.047 x 1 = 0.047 */ {{1, 1}, {{1, 512, 38, 50}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 46740000.},
+    /* GFLOPS 0.047 x 1 = 0.047 */ {{1, 1}, {{1, 928, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 46588416.},
+    /* GFLOPS 0.046 x 1 = 0.046 */ {{1, 1}, {{1, 64, 75, 75}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 46440000.},
+    /* GFLOPS 0.023 x 2 = 0.045 */ {{3, 3}, {{1, 256, 3, 3}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 22648626.},
+    /* GFLOPS 0.045 x 1 = 0.045 */ {{3, 3}, {{1, 160, 7, 7}}, 320, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 45174080.},
+    /* GFLOPS 0.045 x 1 = 0.045 */ {{3, 3}, {{1, 160, 7, 7}}, 320, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 45174080.},
+    /* GFLOPS 0.045 x 1 = 0.045 */ {{1, 1}, {{1, 224, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 45058048.},
+    /* GFLOPS 0.023 x 2 = 0.045 */ {{1, 1}, {{1, 512, 14, 14}}, 112, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 22500800.},
+    /* GFLOPS 0.045 x 1 = 0.045 */ {{1, 1}, {{1, 896, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 44982784.},
+    /* GFLOPS 0.045 x 1 = 0.045 */ {{3, 3}, {{1, 3, 227, 227}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", true, 44946880.},
+    /* GFLOPS 0.044 x 1 = 0.044 */ {{3, 3}, {{1, 128, 19, 19}}, 192, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 44256000.},
+    /* GFLOPS 0.044 x 1 = 0.044 */ {{3, 3}, {{1, 1024, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 44239200.},
+    /* GFLOPS 0.043 x 1 = 0.043 */ {{7, 7}, {{1, 3, 96, 96}}, 64, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", true, 43499520.},
+    /* GFLOPS 0.043 x 1 = 0.043 */ {{1, 1}, {{1, 864, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 43377152.},
+    /* GFLOPS 0.042 x 1 = 0.042 */ {{1, 1}, {{1, 832, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 41771520.},
+    /* GFLOPS 0.040 x 1 = 0.040 */ {{5, 5}, {{1, 32, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 40165888.},
+    /* GFLOPS 0.040 x 1 = 0.040 */ {{5, 5}, {{1, 32, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 40165888.},
+    /* GFLOPS 0.040 x 1 = 0.040 */ {{1, 1}, {{1, 800, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 40165888.},
+    /* GFLOPS 0.040 x 1 = 0.040 */ {{3, 3}, {{1, 64, 19, 19}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 39958368.},
+    /* GFLOPS 0.040 x 1 = 0.040 */ {{3, 3}, {{1, 256, 19, 19}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 39932376.},
+    /* GFLOPS 0.040 x 1 = 0.040 */ {{3, 3}, {{1, 3, 300, 300}}, 32, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 39600000.},
+    /* GFLOPS 0.039 x 1 = 0.039 */ {{1, 1}, {{1, 144, 75, 75}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 39015000.},
+    /* GFLOPS 0.039 x 1 = 0.039 */ {{1, 1}, {{1, 192, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 38635520.},
+    /* GFLOPS 0.039 x 1 = 0.039 */ {{1, 1}, {{1, 768, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 38560256.},
+    /* GFLOPS 0.037 x 1 = 0.037 */ {{1, 1}, {{1, 736, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 36954624.},
+    /* GFLOPS 0.036 x 1 = 0.036 */ {{1, 1}, {{1, 480, 14, 14}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 36164352.},
+    /* GFLOPS 0.036 x 1 = 0.036 */ {{1, 1}, {{1, 480, 14, 14}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 36164352.},
+    /* GFLOPS 0.018 x 2 = 0.036 */ {{1, 1}, {{1, 192, 38, 38}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 17790080.},
+    /* GFLOPS 0.035 x 1 = 0.035 */ {{1, 1}, {{1, 704, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 35348992.},
+    /* GFLOPS 0.034 x 1 = 0.034 */ {{1, 1}, {{1, 672, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 33743360.},
+    /* GFLOPS 0.034 x 1 = 0.034 */ {{1, 1}, {{1, 128, 32, 64}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 33685504.},
+    /* GFLOPS 0.034 x 1 = 0.034 */ {{2, 2}, {{1, 64, 64, 128}}, 32, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 33619968.},
+    /* GFLOPS 0.033 x 1 = 0.033 */ {{1, 1}, {{1, 528, 14, 14}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 33147520.},
+    /* GFLOPS 0.033 x 1 = 0.033 */ {{1, 1}, {{1, 528, 14, 14}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 33147520.},
+    /* GFLOPS 0.033 x 1 = 0.033 */ {{1, 1}, {{1, 1024, 10, 10}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 32784000.},
+    /* GFLOPS 0.032 x 1 = 0.032 */ {{1, 1}, {{1, 160, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 32212992.},
+    /* GFLOPS 0.032 x 1 = 0.032 */ {{1, 1}, {{1, 512, 14, 14}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 32144000.},
+    /* GFLOPS 0.032 x 1 = 0.032 */ {{1, 1}, {{1, 640, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 32137728.},
+    /* GFLOPS 0.032 x 1 = 0.032 */ {{1, 1}, {{1, 508, 14, 14}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 31893120.},
+    /* GFLOPS 0.031 x 1 = 0.031 */ {{1, 1}, {{1, 832, 7, 7}}, 384, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 31328640.},
+    /* GFLOPS 0.031 x 1 = 0.031 */ {{1, 1}, {{1, 832, 7, 7}}, 384, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 31328640.},
+    /* GFLOPS 0.031 x 1 = 0.031 */ {{1, 1}, {{1, 608, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 30532096.},
+    /* GFLOPS 0.015 x 2 = 0.030 */ {{5, 5}, {{1, 24, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 15065344.},
+    /* GFLOPS 0.015 x 2 = 0.030 */ {{5, 5}, {{1, 24, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 15065344.},
+    /* GFLOPS 0.015 x 2 = 0.030 */ {{5, 5}, {{1, 48, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 15059072.},
+    /* GFLOPS 0.029 x 1 = 0.029 */ {{3, 3}, {{1, 256, 10, 10}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 29497600.},
+    /* GFLOPS 0.029 x 1 = 0.029 */ {{1, 1}, {{1, 192, 28, 28}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 28976640.},
+    /* GFLOPS 0.029 x 1 = 0.029 */ {{1, 1}, {{1, 192, 28, 28}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 28976640.},
+    /* GFLOPS 0.029 x 1 = 0.029 */ {{1, 1}, {{1, 512, 14, 14}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 28929600.},
+    /* GFLOPS 0.029 x 1 = 0.029 */ {{1, 1}, {{1, 512, 14, 14}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 28929600.},
+    /* GFLOPS 0.029 x 1 = 0.029 */ {{1, 1}, {{1, 576, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 28926464.},
+    /* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 544, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 27320832.},
+    /* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 384, 19, 19}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 26650464.},
+    /* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 576, 19, 19}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 26638912.},
+    /* GFLOPS 0.027 x 1 = 0.027 */ {{3, 3}, {{1, 128, 38, 38}}, 8, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 26627360.},
+    /* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 528, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 26518016.},
+    /* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 528, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 26518016.},
+    /* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 96, 75, 75}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 26055000.},
+    /* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 64, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 25890816.},
+    /* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 64, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 25890816.},
+    /* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 64, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 25890816.},
+    /* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 1024, 10, 10}}, 126, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 25817400.},
+    /* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 128, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 25790464.},
+    /* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 256, 28, 28}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 25740288.},
+    /* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 256, 28, 28}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 25740288.},
+    /* GFLOPS 0.013 x 2 = 0.026 */ {{1, 1}, {{1, 256, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 12870144.},
+    /* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 512, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 25715200.},
+    /* GFLOPS 0.013 x 2 = 0.026 */ {{1, 1}, {{1, 512, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 12857600.},
+    /* GFLOPS 0.024 x 1 = 0.024 */ {{1, 1}, {{1, 480, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 24109568.},
+    /* GFLOPS 0.024 x 1 = 0.024 */ {{1, 1}, {{1, 128, 38, 38}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 23750912.},
+    /* GFLOPS 0.024 x 1 = 0.024 */ {{1, 1}, {{1, 256, 19, 19}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 23704704.},
+    /* GFLOPS 0.023 x 1 = 0.023 */ {{3, 3}, {{1, 3, 256, 512}}, 13, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 23429120.},
+    /* GFLOPS 0.023 x 1 = 0.023 */ {{1, 1}, {{1, 32, 150, 150}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 23400000.},
+    /* GFLOPS 0.023 x 1 = 0.023 */ {{1, 1}, {{1, 512, 19, 19}}, 63, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 23311575.},
+    /* GFLOPS 0.023 x 1 = 0.023 */ {{1, 1}, {{1, 448, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 22503936.},
+    /* GFLOPS 0.023 x 1 = 0.023 */ {{1, 1}, {{1, 512, 14, 14}}, 112, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 22500800.},
+    /* GFLOPS 0.022 x 1 = 0.022 */ {{1, 1}, {{1, 508, 14, 14}}, 112, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 22325184.},
+    /* GFLOPS 0.021 x 1 = 0.021 */ {{3, 3}, {{1, 128, 12, 12}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 21242880.},
+    /* GFLOPS 0.021 x 1 = 0.021 */ {{1, 1}, {{1, 416, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 20898304.},
+    /* GFLOPS 0.021 x 1 = 0.021 */ {{1, 1}, {{1, 832, 7, 7}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 20885760.},
+    /* GFLOPS 0.021 x 1 = 0.021 */ {{1, 1}, {{1, 832, 7, 7}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 20885760.},
+    /* GFLOPS 0.010 x 2 = 0.021 */ {{1, 1}, {{1, 832, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 10442880.},
+    /* GFLOPS 0.010 x 2 = 0.021 */ {{1, 1}, {{1, 832, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 10442880.},
+    /* GFLOPS 0.010 x 2 = 0.020 */ {{3, 3}, {{1, 256, 2, 2}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 10066056.},
+    /* GFLOPS 0.020 x 1 = 0.020 */ {{5, 5}, {{1, 16, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 20095488.},
+    /* GFLOPS 0.020 x 1 = 0.020 */ {{5, 5}, {{1, 16, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 20095488.},
+    /* GFLOPS 0.020 x 1 = 0.020 */ {{5, 5}, {{1, 32, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 20082944.},
+    /* GFLOPS 0.020 x 1 = 0.020 */ {{5, 5}, {{1, 32, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 20082944.},
+    /* GFLOPS 0.020 x 1 = 0.020 */ {{3, 3}, {{1, 256, 19, 19}}, 12, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 19966188.},
+    /* GFLOPS 0.019 x 1 = 0.019 */ {{1, 1}, {{1, 192, 28, 28}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 19317760.},
+    /* GFLOPS 0.019 x 1 = 0.019 */ {{1, 1}, {{1, 192, 28, 28}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 19317760.},
+    /* GFLOPS 0.019 x 1 = 0.019 */ {{1, 1}, {{1, 384, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 19292672.},
+    /* GFLOPS 0.018 x 1 = 0.018 */ {{1, 1}, {{1, 576, 10, 10}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 18448000.},
+    /* GFLOPS 0.018 x 1 = 0.018 */ {{1, 1}, {{1, 480, 14, 14}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 18082176.},
+    /* GFLOPS 0.018 x 1 = 0.018 */ {{1, 1}, {{1, 480, 14, 14}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 18082176.},
+    /* GFLOPS 0.018 x 1 = 0.018 */ {{1, 1}, {{1, 192, 38, 38}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 17790080.},
+    /* GFLOPS 0.018 x 1 = 0.018 */ {{1, 1}, {{1, 352, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 17687040.},
+    /* GFLOPS 0.017 x 1 = 0.017 */ {{2, 2}, {{1, 16, 128, 256}}, 16, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 16908288.},
+    /* GFLOPS 0.016 x 1 = 0.016 */ {{1, 1}, {{1, 320, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 16081408.},
+    /* GFLOPS 0.016 x 1 = 0.016 */ {{1, 1}, {{1, 832, 7, 7}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 15664320.},
+    /* GFLOPS 0.016 x 1 = 0.016 */ {{1, 1}, {{1, 832, 7, 7}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 15664320.},
+    /* GFLOPS 0.015 x 1 = 0.015 */ {{5, 5}, {{1, 48, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 15059072.},
+    /* GFLOPS 0.015 x 1 = 0.015 */ {{5, 5}, {{1, 32, 12, 12}}, 64, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 14754816.},
+    /* GFLOPS 0.014 x 1 = 0.014 */ {{1, 1}, {{1, 288, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 14475776.},
+    /* GFLOPS 0.014 x 1 = 0.014 */ {{1, 1}, {{1, 512, 5, 5}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 13991250.},
+    /* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 144, 38, 38}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 13354112.},
+    /* GFLOPS 0.007 x 2 = 0.013 */ {{1, 1}, {{1, 16, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6623232.},
+    /* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 832, 7, 7}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 13053600.},
+    /* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 832, 7, 7}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 13053600.},
+    /* GFLOPS 0.007 x 2 = 0.013 */ {{1, 1}, {{1, 32, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6522880.},
+    /* GFLOPS 0.006 x 2 = 0.013 */ {{1, 1}, {{1, 64, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6472704.},
+    /* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 128, 56, 56}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 12895232.},
+    /* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 256, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 12870144.},
+    /* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 256, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 12870144.},
+    /* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 508, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 12757248.},
+    /* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 992, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 12449920.},
+    /* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 480, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 12054784.},
+    /* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 480, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 12054784.},
+    /* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 960, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 12048512.},
+    /* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 32, 75, 75}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 12014080.},
+    /* GFLOPS 0.012 x 1 = 0.012 */ {{3, 3}, {{1, 96, 6, 6}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 11950848.},
+    /* GFLOPS 0.006 x 2 = 0.012 */ {{3, 3}, {{1, 96, 3, 3}}, 384, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 5975424.},
+    /* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 320, 12, 12}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 11814912.},
+    /* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 640, 6, 6}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 11805696.},
+    /* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 928, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 11647104.},
+    /* GFLOPS 0.011 x 1 = 0.011 */ {{1, 1}, {{1, 896, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 11245696.},
+    /* GFLOPS 0.011 x 1 = 0.011 */ {{3, 3}, {{1, 256, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 11061600.},
+    /* GFLOPS 0.006 x 2 = 0.011 */ {{3, 3}, {{1, 512, 5, 5}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 5530200.},
+    /* GFLOPS 0.011 x 1 = 0.011 */ {{1, 1}, {{1, 864, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 10844288.},
+    /* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 832, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 10442880.},
+    /* GFLOPS 0.010 x 1 = 0.010 */ {{5, 5}, {{1, 32, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 10041472.},
+    /* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 800, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 10041472.},
+    /* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 192, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 9658880.},
+    /* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 192, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 9658880.},
+    /* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 384, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 9646336.},
+    /* GFLOPS 0.005 x 2 = 0.010 */ {{1, 1}, {{1, 512, 14, 14}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4821600.},
+    /* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 768, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 9640064.},
+    /* GFLOPS 0.010 x 1 = 0.010 */ {{3, 3}, {{1, 4, 128, 256}}, 4, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 9568256.},
+    /* GFLOPS 0.005 x 2 = 0.009 */ {{1, 1}, {{1, 4, 128, 256}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 4718592.},
+    /* GFLOPS 0.009 x 1 = 0.009 */ {{1, 1}, {{1, 736, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 9238656.},
+    /* GFLOPS 0.009 x 1 = 0.009 */ {{1, 1}, {{1, 192, 19, 19}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 8895040.},
+    /* GFLOPS 0.009 x 1 = 0.009 */ {{1, 1}, {{1, 704, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 8837248.},
+    /* GFLOPS 0.008 x 1 = 0.008 */ {{1, 1}, {{1, 672, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 8435840.},
+    /* GFLOPS 0.008 x 1 = 0.008 */ {{1, 1}, {{1, 128, 32, 64}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 8421376.},
+    /* GFLOPS 0.008 x 1 = 0.008 */ {{1, 1}, {{1, 640, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 8034432.},
+    /* GFLOPS 0.004 x 2 = 0.008 */ {{1, 1}, {{1, 832, 7, 7}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 3916080.},
+    /* GFLOPS 0.008 x 1 = 0.008 */ {{1, 1}, {{1, 608, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 7633024.},
+    /* GFLOPS 0.008 x 1 = 0.008 */ {{5, 5}, {{1, 16, 14, 14}}, 48, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 7535808.},
+    /* GFLOPS 0.008 x 1 = 0.008 */ {{5, 5}, {{1, 16, 14, 14}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 7535808.},
+    /* GFLOPS 0.004 x 2 = 0.007 */ {{3, 3}, {{1, 64, 5, 5}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 3689600.},
+    /* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 640, 6, 6}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 7378560.},
+    /* GFLOPS 0.004 x 2 = 0.007 */ {{1, 1}, {{1, 48, 14, 14}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3650304.},
+    /* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 384, 14, 14}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 7234752.},
+    /* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 576, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 7231616.},
+    /* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 256, 12, 12}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 7091712.},
+    /* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 544, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 6830208.},
+    /* GFLOPS 0.007 x 1 = 0.007 */ {{3, 3}, {{1, 160, 6, 6}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 6637824.},
+    /* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 528, 14, 14}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6629504.},
+    /* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 528, 14, 14}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 6629504.},
+    /* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 256, 5, 5}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 6566400.},
+    /* GFLOPS 0.003 x 2 = 0.007 */ {{1, 1}, {{1, 512, 5, 5}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 3280000.},
+    /* GFLOPS 0.006 x 1 = 0.006 */ {{1, 1}, {{1, 64, 56, 56}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6472704.},
+    /* GFLOPS 0.006 x 1 = 0.006 */ {{1, 1}, {{1, 128, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6447616.},
+    /* GFLOPS 0.006 x 1 = 0.006 */ {{1, 1}, {{1, 512, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 6428800.},
+    /* GFLOPS 0.006 x 1 = 0.006 */ {{1, 1}, {{1, 512, 14, 14}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6428800.},
+    /* GFLOPS 0.006 x 1 = 0.006 */ {{1, 1}, {{1, 512, 14, 14}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 6428800.},
+    /* GFLOPS 0.006 x 1 = 0.006 */ {{3, 3}, {{1, 256, 10, 10}}, 12, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 5530800.},
+    /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 192, 12, 12}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 5322240.},
+    /* GFLOPS 0.005 x 1 = 0.005 */ {{3, 3}, {{1, 128, 5, 5}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 5310720.},
+    /* GFLOPS 0.005 x 1 = 0.005 */ {{3, 3}, {{1, 128, 5, 5}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 5310720.},
+    /* GFLOPS 0.005 x 1 = 0.005 */ {{3, 3}, {{1, 128, 5, 5}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 5310720.},
+    /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 1024, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4917600.},
+    /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 1024, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4917600.},
+    /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 192, 28, 28}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4829440.},
+    /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 192, 28, 28}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4829440.},
+    /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 256, 14, 14}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4826304.},
+    /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 512, 14, 14}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4821600.},
+    /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 508, 14, 14}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4783968.},
+    /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 64, 24, 24}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4755456.},
+    /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 256, 12, 12}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4727808.},
+    /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 1024, 3, 3}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4720896.},
+    /* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 512, 19, 19}}, 12, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4440300.},
+    /* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 512, 19, 19}}, 12, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4440300.},
+    /* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 640, 6, 6}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4427136.},
+    /* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 16, 128, 256}}, 4, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 4325376.},
+    /* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 64, 64, 128}}, 4, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 4227072.},
+    /* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 832, 7, 7}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3916080.},
+    /* GFLOPS 0.004 x 1 = 0.004 */ {{5, 5}, {{1, 16, 12, 12}}, 32, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 3691008.},
+    /* GFLOPS 0.004 x 1 = 0.004 */ {{3, 3}, {{1, 64, 10, 10}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 3689600.},
+    /* GFLOPS 0.004 x 1 = 0.004 */ {{5, 5}, {{1, 32, 6, 6}}, 64, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 3688704.},
+    /* GFLOPS 0.004 x 1 = 0.004 */ {{5, 5}, {{1, 32, 12, 12}}, 64, 1, {2, 2}, {1, 1}, {2, 2}, {0, 0}, "", true, 3688704.},
+    /* GFLOPS 0.004 x 1 = 0.004 */ {{5, 5}, {{1, 64, 6, 6}}, 128, 1, {2, 2}, {1, 1}, {2, 2}, {0, 0}, "", true, 3687552.},
+    /* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 192, 12, 12}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3548160.},
+    /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 736, 3, 3}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3393792.},
+    /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 256, 10, 10}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3283200.},
+    /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 512, 5, 5}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3280000.},
+    /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 512, 5, 5}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 3280000.},
+    /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 512, 5, 5}}, 126, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3228750.},
+    /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 480, 14, 14}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3013696.},
+    /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 480, 14, 14}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 3013696.},
+    /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 320, 12, 12}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2953728.},
+    /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 640, 6, 6}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2951424.},
+    /* GFLOPS 0.003 x 1 = 0.003 */ {{3, 3}, {{1, 128, 5, 5}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 2655360.},
+    /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 832, 7, 7}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2610720.},
+    /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 256, 3, 3}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 2520882.},
+    /* GFLOPS 0.001 x 2 = 0.003 */ {{3, 3}, {{1, 128, 1, 1}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1258530.},
+    /* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 256, 12, 12}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2363904.},
+    /* GFLOPS 0.002 x 1 = 0.002 */ {{3, 3}, {{1, 128, 3, 3}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 2360320.},
+    /* GFLOPS 0.002 x 1 = 0.002 */ {{3, 3}, {{1, 128, 3, 3}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 2360320.},
+    /* GFLOPS 0.002 x 1 = 0.002 */ {{3, 3}, {{1, 128, 3, 3}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 2360320.},
+    /* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 528, 4, 4}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 2164736.},
+    /* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 508, 4, 4}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 2082816.},
+    /* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 1024, 1, 1}}, 1000, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2049000.},
+    /* GFLOPS 0.001 x 2 = 0.002 */ {{3, 3}, {{1, 256, 3, 3}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 995544.},
+    /* GFLOPS 0.001 x 2 = 0.002 */ {{3, 3}, {{1, 128, 5, 5}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 922000.},
+    /* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 1024, 3, 3}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1770336.},
+    /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 640, 6, 6}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1475712.},
+    /* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 128, 5, 5}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1383000.},
+    /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 736, 3, 3}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1272672.},
+    /* GFLOPS 0.001 x 2 = 0.001 */ {{1, 1}, {{1, 256, 3, 3}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 590976.},
+    /* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 128, 3, 3}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1180160.},
+    /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 256, 2, 2}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1120392.},
+    /* GFLOPS 0.000 x 2 = 0.001 */ {{3, 3}, {{1, 128, 5, 5}}, 8, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 461000.},
+    /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 192, 12, 12}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 887040.},
+    /* GFLOPS 0.000 x 2 = 0.001 */ {{3, 3}, {{1, 256, 2, 2}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 442464.},
+    /* GFLOPS 0.000 x 2 = 0.001 */ {{1, 1}, {{1, 128, 5, 5}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 411200.},
+    /* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 128, 5, 5}}, 12, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 691500.},
+    /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 640, 2, 2}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 655872.},
+    /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 512, 5, 5}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 615000.},
+    /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 512, 5, 5}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 615000.},
+    /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 128, 3, 3}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 592128.},
+    /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 256, 3, 3}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 590976.},
+    /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 256, 3, 3}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 590976.},
+    /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 256, 3, 3}}, 126, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 581742.},
+    /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 256, 4, 4}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 525312.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 192, 5, 5}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 308000.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 2, 2}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 263168.},
+    /* GFLOPS 0.000 x 2 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 131328.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 126, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 258552.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 1024, 1, 1}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 196704.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 2, 2}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 147584.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 2, 2}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 147584.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 2, 2}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 147584.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 736, 1, 1}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 141408.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 1, 1}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 140322.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 131328.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 131328.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 3, 3}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 110808.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 3, 3}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 110808.},
+    /* GFLOPS 0.000 x 2 = 0.000 */ {{3, 3}, {{1, 128, 1, 1}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 55320.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 2, 2}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 73792.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 49248.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 49248.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 1, 1}}, 126, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 32382.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 64, 1, 1}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 16512.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 1, 1}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6168.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 1, 1}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 6168.}
+};
+struct ConvParamID
 {
-    int data[] = {count, nplanes, height, width};
-    return MatShape(data, data+4);
+    enum {
+        CONV_0 = 0,
+        CONV_100 = 100,
+        CONV_LAST = sizeof(testConvolutionConfigs) / sizeof(testConvolutionConfigs[0])
+    };
+    int val_;                                                                  \
+    ConvParamID(int val = 0) : val_(val) {}
+    operator int() const { return val_; }
+    static ::testing::internal::ParamGenerator<ConvParamID> all()
+    {
+#if 0
+        enum { NUM = (int)CONV_LAST };
+#else
+        enum { NUM = (int)CONV_100 };
+#endif
+        ConvParamID v_[NUM]; for (int i = 0; i < NUM; ++i) { v_[i] = ConvParamID(i); } // reduce generated code size
+        return ::testing::ValuesIn(v_, v_ + NUM);
+    }
+};                                                                                  \
+static inline void PrintTo(const ConvParamID& v, std::ostream* os)
+{
+    CV_Assert((int)v >= 0); CV_Assert((int)v < ConvParamID::CONV_LAST);
+    const ConvParam_t& p = testConvolutionConfigs[(int)v];
+
+    *os << "GFLOPS=" << cv::format("%.3f", p.declared_flops * 1e-9)
+        << ", K=" << (Size)p.kernel
+        << ", IN={" << p.shapeIn.dims[0] << ", " << p.shapeIn.dims[1] << ", " << p.shapeIn.dims[2] << ", " << p.shapeIn.dims[3] << "}"
+        << ", OCN=" << p.outCN;
+    if (p.groups > 1)
+       *os << ", G=" << p.groups;
+    if (((Size)p.stride).area() != 1)
+        *os << ", S=" << ((Size)p.stride);
+    if (((Size)p.dilation).area() != 1)
+        *os << ", D=" << ((Size)p.dilation);
+    if (((Size)p.pad).area() != 0)
+        *os << ", P=" << ((Size)p.pad);
+    if (((Size)p.padAdjust).area() != 0)
+        *os << ", PAdj=" << ((Size)p.padAdjust);
+    if (!((std::string)p.padMode).empty())
+        *os << ", PM=" << ((std::string)p.padMode);
+    if (p.hasBias)
+        *os << ", BIAS";
 }

-PERF_TEST_P( ConvolutionPerfTest, perf, Combine(
-    Values(Size(1, 1), Size(3, 3), Size(5, 5), Size(11, 11)),
-    Values(make_pair(blobShape(1,   4, 224, 224),  64),
-           make_pair(blobShape(1,  64, 112, 122), 128),
-           make_pair(blobShape(1, 256,  28,  28), 512)),
-    GroupSize::all(),
-    StrideSize::all())
-)
+
+
+typedef tuple<ConvParamID, tuple<Backend, Target> > ConvTestParam_t;
+typedef TestBaseWithParam<ConvTestParam_t> Conv;
+
+PERF_TEST_P_(Conv, conv)
 {
-    RNG rng(0);
+    int test_id = (int)get<0>(GetParam());
+    ASSERT_GE(test_id, 0); ASSERT_LT(test_id, ConvParamID::CONV_LAST);
+    const ConvParam_t& params = testConvolutionConfigs[test_id];
+    double declared_flops = params.declared_flops;
+    Size kernel = params.kernel;
+    MatShape inputShape = MatShape(params.shapeIn.dims, params.shapeIn.dims + 4);
+    int outChannels = params.outCN;
+    int groups = params.groups;
+    Size stride = params.stride;
+    Size dilation = params.dilation;
+    Size pad = params.pad;
+    Size padAdjust = params.padAdjust;
+    std::string padMode(params.padMode);
+    bool hasBias = params.hasBias;
+    Backend backendId = get<0>(get<1>(GetParam()));
+    Target targetId = get<1>(get<1>(GetParam()));

-    ConvParam params = GetParam();
-    int ksz     = get<0>(params).width;
-    MatShape inpShape = get<1>(params).first;
-    int outCn   = get<1>(params).second;
-    int groups  = get<2>(params);
-    int stride  = (ksz >= 11) ? 4 : (int)get<3>(params);
+    int inChannels = inputShape[1];
+    Size inSize(inputShape[3], inputShape[2]);

-    int inpCn = inpShape[1];
-    int wgtSize[] = { outCn, inpCn/groups, ksz, ksz };
-    int biasSize[] = { outCn, 1, 1, 1 };
-    const int wtype = CV_32F;
-    Mat wgtBlob(4, wgtSize, wtype), biasBlob(4, biasSize, wtype);
-    Mat inpBlob(4, &inpShape[0], wtype);
-    rng.fill(biasBlob, RNG::UNIFORM, -1, +1);
-    rng.fill(wgtBlob, RNG::UNIFORM, -1, +1);
-    rng.fill(inpBlob, RNG::UNIFORM, -1, +1);
+    int sz[] = {outChannels, inChannels / groups, kernel.height, kernel.width};
+    Mat weights(4, &sz[0], CV_32F);
+    randu(weights, -1.0f, 1.0f);

    LayerParams lp;
-    lp.set("num_output", outCn);
+    lp.set("kernel_w", kernel.width);
+    lp.set("kernel_h", kernel.height);
+    lp.set("pad_w", pad.width);
+    lp.set("pad_h", pad.height);
+    if (padAdjust.width > 0 || padAdjust.height > 0)
+    {
+        lp.set("adj_w", padAdjust.width);
+        lp.set("adj_h", padAdjust.height);
+    }
+    if (!padMode.empty())
+        lp.set("pad_mode", padMode);
+    lp.set("stride_w", stride.width);
+    lp.set("stride_h", stride.height);
+    lp.set("dilation_w", dilation.width);
+    lp.set("dilation_h", dilation.height);
+    lp.set("num_output", outChannels);
    lp.set("group", groups);
-    lp.set("stride", stride);
-    lp.set("kernel_size", ksz);
-    lp.blobs.reserve(2);
-    lp.blobs.push_back(wgtBlob);
-    lp.blobs.push_back(biasBlob);
-
-    std::vector<Mat*> inpBlobs(1, &inpBlob);
-    std::vector<Mat> outBlobs, internalBlobs;
-
-    Ptr<Layer> layer = cv::dnn::LayerFactory::createLayerInstance("Convolution", lp);
-    std::vector<MatShape> inputShapes(1, shape(inpBlob)), outShapes, internals;
-    layer->getMemoryShapes(inputShapes, 0, outShapes, internals);
-    for (size_t i = 0; i < outShapes.size(); i++)
+    lp.set("bias_term", hasBias);
+    lp.type = "Convolution";
+    lp.name = "testLayer";
+    lp.blobs.push_back(weights);
+    if (hasBias)
    {
-        outBlobs.push_back(Mat(outShapes[i], CV_32F));
+        Mat bias(1, outChannels, CV_32F);
+        randu(bias, -1.0f, 1.0f);
+        lp.blobs.push_back(bias);
    }
-    for (size_t i = 0; i < internals.size(); i++)
+    int inpSz[] = {1, inChannels, inSize.height, inSize.width};
+    Mat input(4, &inpSz[0], CV_32F);
+    randu(input, -1.0f, 1.0f);
+
+    Net net;
+    net.addLayerToPrev(lp.name, lp.type, lp);
+
+    net.setInput(input);
+    net.setPreferableBackend(backendId);
+    net.setPreferableTarget(targetId);
+
+    // warmup
+    Mat output = net.forward();
+
+    MatShape netInputShape = shape(input);
+    size_t weightsMemory = 0, blobsMemory = 0;
+    net.getMemoryConsumption(netInputShape, weightsMemory, blobsMemory);
+    int64 flops = net.getFLOPS(netInputShape);
+    CV_Assert(flops > 0);
+
+    std::cout
+        << "IN=" << divUp(input.total() * input.elemSize(), 1u<<10) << " Kb " << netInputShape
+        << "    OUT=" << divUp(output.total() * output.elemSize(), 1u<<10) << " Kb " << shape(output)
+        << "    Weights(parameters): " << divUp(weightsMemory, 1u<<10) << " Kb"
+        << "    MFLOPS=" << flops * 1e-6 << std::endl;
+
+    TEST_CYCLE()
    {
-        internalBlobs.push_back(Mat());
-        if (total(internals[i]))
-            internalBlobs.back().create(internals[i], CV_32F);
+        Mat res = net.forward();
    }

-    layer->finalize(inpBlobs, outBlobs);
-
-    Mat inpBlob2D = inpBlob.reshape(1, outCn);
-    Mat wgtBlob2D = wgtBlob.reshape(1, outCn*(inpCn/groups));
-    Mat outBlob2D = outBlobs[0].reshape(1, outBlobs[0].size[0]);
-    declare.in(inpBlob2D, wgtBlob2D, WARMUP_RNG).out(outBlob2D);
-
-    layer->forward(inpBlobs, outBlobs, internalBlobs); /// warmup
-
-    PERF_SAMPLE_BEGIN()
-        layer->forward(inpBlobs, outBlobs, internalBlobs);
-    PERF_SAMPLE_END()
-
+    EXPECT_NEAR(flops, declared_flops, declared_flops * 1e-6);
    SANITY_CHECK_NOTHING();
 }

+INSTANTIATE_TEST_CASE_P(/**/, Conv, Combine(
+    ConvParamID::all(),
+    dnnBackendsAndTargets(false, false)  // defined in ../test/test_common.hpp
+));
+
 } // namespace
--- a/modules/dnn/perf/perf_net.cpp
+++ b/modules/dnn/perf/perf_net.cpp
@ -14,10 +14,7 @@

 namespace opencv_test {

-CV_ENUM(DNNBackend, DNN_BACKEND_DEFAULT, DNN_BACKEND_HALIDE, DNN_BACKEND_INFERENCE_ENGINE, DNN_BACKEND_OPENCV)
-CV_ENUM(DNNTarget, DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16, DNN_TARGET_MYRIAD)
-
-class DNNTestNetwork : public ::perf::TestBaseWithParam< tuple<DNNBackend, DNNTarget> >
+class DNNTestNetwork : public ::perf::TestBaseWithParam< tuple<Backend, Target> >
 {
 public:
    dnn::Backend backend;
@ -269,22 +266,6 @@ PERF_TEST_P_(DNNTestNetwork, Inception_v2_Faster_RCNN)
               Mat(cv::Size(800, 600), CV_32FC3));
 }

-const tuple<DNNBackend, DNNTarget> testCases[] = {
-#ifdef HAVE_HALIDE
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_HALIDE, DNN_TARGET_CPU),
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_HALIDE, DNN_TARGET_OPENCL),
-#endif
-#ifdef HAVE_INF_ENGINE
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU),
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD),
-#endif
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_CPU),
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL),
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16)
-};
-
-INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, testing::ValuesIn(testCases));
+INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, dnnBackendsAndTargets());

 } // namespace
--- a/modules/dnn/perf/perf_precomp.hpp
+++ b/modules/dnn/perf/perf_precomp.hpp
@ -4,6 +4,8 @@
 #include <opencv2/ts.hpp>
 #include <opencv2/dnn.hpp>

+#include "../test/test_common.hpp"
+
 namespace opencv_test {
 using namespace perf;
 using namespace cv::dnn;
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@ -1676,14 +1676,6 @@ struct Net::Impl
            // with the current layer if they follow it. Normally, the are fused with the convolution layer,
            // but some of them (like activation) may be fused with fully-connected, elemwise (+) and
            // some other layers.
-
-            // TODO: OpenCL target support more fusion styles.
-            if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) &&
-                 (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
-                 ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling" &&
-                 ld.layerInstance->type != "Concat")) )
-                continue;
-
            Ptr<Layer>& currLayer = ld.layerInstance;
            if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 )
            {
@ -1717,6 +1709,13 @@ struct Net::Impl
                if (preferableBackend != DNN_BACKEND_OPENCV)
                    continue;  // Go to the next layer.

+                // TODO: OpenCL target support more fusion styles.
+                if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) &&
+                     (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
+                     ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling" &&
+                     ld.layerInstance->type != "Concat")) )
+                    continue;
+
                while (nextData)
                {
                    // For now, OpenCL target support fusion with activation of ReLU/ChannelsPReLU/Power/Tanh
@ -2693,8 +2692,7 @@ void Net::setInput(InputArray blob, const String& name, double scalefactor, cons
 Mat Net::getParam(LayerId layer, int numParam)
 {
    LayerData &ld = impl->getLayerData(layer);
-
-    std::vector<Mat> &layerBlobs = ld.layerInstance->blobs;
+    std::vector<Mat> &layerBlobs = ld.getLayerInstance()->blobs;
    CV_Assert(numParam < (int)layerBlobs.size());
    return layerBlobs[numParam];
 }
@ -2703,7 +2701,7 @@ void Net::setParam(LayerId layer, int numParam, const Mat &blob)
 {
    LayerData &ld = impl->getLayerData(layer);

-    std::vector<Mat> &layerBlobs = ld.layerInstance->blobs;
+    std::vector<Mat> &layerBlobs = ld.getLayerInstance()->blobs;
    CV_Assert(numParam < (int)layerBlobs.size());
    //we don't make strong checks, use this function carefully
    layerBlobs[numParam] = blob;
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@ -350,12 +350,14 @@ public:
        return false;
    }

-    void fuseWeights(const Mat& w, const Mat& b)
+    void fuseWeights(const Mat& w_, const Mat& b_)
    {
        // Convolution weights have OIHW data layout. Parameters fusion in case of
        // (conv(I) + b1 ) * w + b2
        // means to replace convolution's weights to [w*conv(I)] and bias to [b1 * w + b2]
        const int outCn = weightsMat.size[0];
+        Mat w = w_.total() == 1 ? Mat(1, outCn, CV_32F, Scalar(w_.at<float>(0))) : w_;
+        Mat b = b_.total() == 1 ? Mat(1, outCn, CV_32F, Scalar(b_.at<float>(0))) : b_;
        CV_Assert_N(!weightsMat.empty(), biasvec.size() == outCn + 2,
                    w.empty() || outCn == w.total(), b.empty() || outCn == b.total());

--- a/modules/dnn/src/layers/crop_layer.cpp
+++ b/modules/dnn/src/layers/crop_layer.cpp
@ -41,6 +41,7 @@
 //M*/

 #include "../precomp.hpp"
+#include "../op_inf_engine.hpp"
 #include "layers_common.hpp"

 namespace cv
@ -64,6 +65,12 @@ public:
        }
    }

+    virtual bool supportBackend(int backendId) CV_OVERRIDE
+    {
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE && crop_ranges.size() == 4;
+    }
+
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
@ -109,7 +116,11 @@ public:
                offset_final[i] = offset[i - start_axis];
        }

-        crop_ranges.resize(dims, Range::all());
+        crop_ranges.resize(dims);
+        for (int i = 0; i < start_axis; i++)
+        {
+            crop_ranges[i] = Range(0, inpBlob.size[i]);
+        }
        for (int i = start_axis; i < dims; i++)
        {
            if (offset_final[i] < 0 || offset_final[i] + inpSzBlob.size[i] > inpBlob.size[i])
@ -138,6 +149,38 @@ public:
        input(&crop_ranges[0]).copyTo(output);
    }

+    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
+    {
+#ifdef HAVE_INF_ENGINE
+        InferenceEngine::LayerParams lp;
+        lp.name = name;
+        lp.type = "Crop";
+        lp.precision = InferenceEngine::Precision::FP32;
+        std::shared_ptr<InferenceEngine::CropLayer> ieLayer(new InferenceEngine::CropLayer(lp));
+
+        CV_Assert(crop_ranges.size() == 4);
+
+        ieLayer->axis.push_back(0);  // batch
+        ieLayer->offset.push_back(crop_ranges[0].start);
+        ieLayer->dim.push_back(crop_ranges[0].end - crop_ranges[0].start);
+
+        ieLayer->axis.push_back(1);  // channels
+        ieLayer->offset.push_back(crop_ranges[1].start);
+        ieLayer->dim.push_back(crop_ranges[1].end - crop_ranges[1].start);
+
+        ieLayer->axis.push_back(3);  // height
+        ieLayer->offset.push_back(crop_ranges[2].start);
+        ieLayer->dim.push_back(crop_ranges[2].end - crop_ranges[2].start);
+
+        ieLayer->axis.push_back(2);  // width
+        ieLayer->offset.push_back(crop_ranges[3].start);
+        ieLayer->dim.push_back(crop_ranges[3].end - crop_ranges[3].start);
+
+        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif  // HAVE_INF_ENGINE
+        return Ptr<BackendNode>();
+    }
+
    std::vector<Range> crop_ranges;
 };

--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
@ -161,6 +161,16 @@ public:
        return Ptr<BackendNode>();
    }

+    virtual bool tryFuse(Ptr<dnn::Layer>& top) CV_OVERRIDE
+    {
+        return func.tryFuse(top);
+    }
+
+    void getScaleShift(Mat& scale_, Mat& shift_) const CV_OVERRIDE
+    {
+        func.getScaleShift(scale_, shift_);
+    }
+
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
@ -343,6 +353,10 @@ struct ReLUFunctor
    }
 #endif  // HAVE_INF_ENGINE

+    bool tryFuse(Ptr<dnn::Layer>&) { return false; }
+
+    void getScaleShift(Mat&, Mat&) const {}
+
    int64 getFLOPSPerElement() const { return 1; }
 };

@ -448,6 +462,10 @@ struct ReLU6Functor
    }
 #endif  // HAVE_INF_ENGINE

+    bool tryFuse(Ptr<dnn::Layer>&) { return false; }
+
+    void getScaleShift(Mat&, Mat&) const {}
+
    int64 getFLOPSPerElement() const { return 2; }
 };

@ -518,6 +536,10 @@ struct TanHFunctor
    }
 #endif  // HAVE_INF_ENGINE

+    bool tryFuse(Ptr<dnn::Layer>&) { return false; }
+
+    void getScaleShift(Mat&, Mat&) const {}
+
    int64 getFLOPSPerElement() const { return 1; }
 };

@ -588,6 +610,10 @@ struct SigmoidFunctor
    }
 #endif  // HAVE_INF_ENGINE

+    bool tryFuse(Ptr<dnn::Layer>&) { return false; }
+
+    void getScaleShift(Mat&, Mat&) const {}
+
    int64 getFLOPSPerElement() const { return 3; }
 };

@ -659,6 +685,10 @@ struct ELUFunctor
    }
 #endif  // HAVE_INF_ENGINE

+    bool tryFuse(Ptr<dnn::Layer>&) { return false; }
+
+    void getScaleShift(Mat&, Mat&) const {}
+
    int64 getFLOPSPerElement() const { return 2; }
 };

@ -727,6 +757,10 @@ struct AbsValFunctor
    }
 #endif  // HAVE_INF_ENGINE

+    bool tryFuse(Ptr<dnn::Layer>&) { return false; }
+
+    void getScaleShift(Mat&, Mat&) const {}
+
    int64 getFLOPSPerElement() const { return 1; }
 };

@ -775,6 +809,10 @@ struct BNLLFunctor
    }
 #endif  // HAVE_INF_ENGINE

+    bool tryFuse(Ptr<dnn::Layer>&) { return false; }
+
+    void getScaleShift(Mat&, Mat&) const {}
+
    int64 getFLOPSPerElement() const { return 5; }
 };

@ -875,15 +913,51 @@ struct PowerFunctor
 #ifdef HAVE_INF_ENGINE
    InferenceEngine::CNNLayerPtr initInfEngine(InferenceEngine::LayerParams& lp)
    {
-        lp.type = "Power";
-        std::shared_ptr<InferenceEngine::PowerLayer> ieLayer(new InferenceEngine::PowerLayer(lp));
-        ieLayer->power = power;
-        ieLayer->scale = scale;
-        ieLayer->offset = shift;
-        return ieLayer;
+        if (power == 1.0f && scale == 1.0f && shift == 0.0f)
+        {
+            // It looks like there is a bug in Inference Engine for DNN_TARGET_OPENCL and DNN_TARGET_OPENCL_FP16
+            // if power layer do nothing so we replace it to Identity.
+            lp.type = "Split";
+            return std::shared_ptr<InferenceEngine::SplitLayer>(new InferenceEngine::SplitLayer(lp));
+        }
+        else
+        {
+            lp.type = "Power";
+            std::shared_ptr<InferenceEngine::PowerLayer> ieLayer(new InferenceEngine::PowerLayer(lp));
+            ieLayer->power = power;
+            ieLayer->scale = scale;
+            ieLayer->offset = shift;
+            return ieLayer;
+        }
    }
 #endif  // HAVE_INF_ENGINE

+    bool tryFuse(Ptr<dnn::Layer>& top)
+    {
+        if (power != 1.0f && shift != 0.0f)
+            return false;
+
+        Mat w, b;
+        top->getScaleShift(w, b);
+        if ((w.empty() && b.empty()) || w.total() > 1 || b.total() > 1)
+            return false;
+
+        float nextScale = w.empty() ? 1.0f : w.at<float>(0);
+        float nextShift = b.empty() ? 0.0f : b.at<float>(0);
+        scale = std::pow(scale, power) * nextScale;
+        shift = nextScale * shift + nextShift;
+        return true;
+    }
+
+    void getScaleShift(Mat& _scale, Mat& _shift) const
+    {
+        if (power == 1.0f)
+        {
+            _scale = Mat(1, 1, CV_32F, Scalar(scale));
+            _shift = Mat(1, 1, CV_32F, Scalar(shift));
+        }
+    }
+
    int64 getFLOPSPerElement() const { return power == 1 ? 2 : 10; }
 };

@ -989,6 +1063,10 @@ struct ChannelsPReLUFunctor
    }
 #endif  // HAVE_INF_ENGINE

+    bool tryFuse(Ptr<dnn::Layer>&) { return false; }
+
+    void getScaleShift(Mat&, Mat&) const {}
+
    int64 getFLOPSPerElement() const { return 1; }
 };

--- a/modules/dnn/src/layers/flatten_layer.cpp
+++ b/modules/dnn/src/layers/flatten_layer.cpp
@ -83,12 +83,6 @@ public:
        int startAxis = clamp(_startAxis, numAxes);
        int endAxis = clamp(_endAxis, numAxes);

-        for (size_t i = 1; i < inputs.size(); i++)
-        {
-            CV_Assert(inputs[i] == inputs[0]);
-        }
-
-
        CV_Assert(startAxis >= 0);
        CV_Assert(endAxis >= startAxis && endAxis < (int)numAxes);

--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@ -350,17 +350,33 @@ public:
            inshape = shape(outerSize, innerSize);
            outshape = shape(outerSize, numOutput);

-            UMat srcMat, dstMat;
+            UMat srcMat, dstMat, srcMat_fp32, dstMat_fp32;
            srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]);
            dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);

-            cv::gemm(srcMat, weights, 1, noArray(), 0, dstMat, GEMM_2_T);
+            if (use_half)
+            {
+                convertFp16(srcMat, srcMat_fp32);
+                convertFp16(dstMat, dstMat_fp32);
+            }
+            else
+            {
+                srcMat_fp32 = srcMat;
+                dstMat_fp32 = dstMat;
+            }
+
+            cv::gemm(srcMat_fp32, weights, 1, noArray(), 0, dstMat_fp32, GEMM_2_T);

            if (bias)
            {
                UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
                UMat& biases = umat_blobs[1];
-                cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
+                cv::gemm(biasOnesMat, biases, 1, dstMat_fp32, 1, dstMat_fp32, 0);
+            }
+            if (use_half)
+            {
+                convertFp16(srcMat_fp32, srcMat);
+                convertFp16(dstMat_fp32, dstMat);
            }
        }

--- a/modules/dnn/src/layers/prior_box_layer.cpp
+++ b/modules/dnn/src/layers/prior_box_layer.cpp
@ -453,8 +453,8 @@ public:
        outputPtr = outputs[0].ptr<float>(0, 1);
        if(_variance.size() == 1)
        {
-            Mat secondChannel(outputs[0].size[2], outputs[0].size[3], CV_32F, outputPtr);
-            secondChannel.setTo(Scalar(_variance[0]));
+            Mat secondChannel(1, outputs[0].size[2], CV_32F, outputPtr);
+            secondChannel.setTo(Scalar::all(_variance[0]));
        }
        else
        {
--- a/modules/dnn/src/op_inf_engine.cpp
+++ b/modules/dnn/src/op_inf_engine.cpp
@ -161,6 +161,7 @@ InfEngineBackendNet::InfEngineBackendNet(InferenceEngine::CNNNetwork& net)
    inputs = net.getInputsInfo();
    outputs = net.getOutputsInfo();
    layers.resize(net.layerCount());  // A hack to execute InfEngineBackendNet::layerCount correctly.
+    netOwner = net;
 }

 void InfEngineBackendNet::Release() noexcept
--- a/modules/dnn/src/op_inf_engine.hpp
+++ b/modules/dnn/src/op_inf_engine.hpp
@ -131,6 +131,8 @@ private:
    InferenceEngine::InferencePlugin plugin;
    InferenceEngine::ExecutableNetwork netExec;
    InferenceEngine::InferRequest infRequest;
+    // In case of models from Model Optimizer we need to manage their lifetime.
+    InferenceEngine::CNNNetwork netOwner;

    std::string name;

--- a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp
+++ b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp
@ -782,6 +782,108 @@ void releaseTensor(tensorflow::TensorProto* tensor)
    }
 }

+static void permute(google::protobuf::RepeatedPtrField<tensorflow::NodeDef>* data,
+                    const std::vector<int>& indices)
+{
+    const int num = data->size();
+    CV_Assert(num == indices.size());
+
+    std::vector<int> elemIdToPos(num);
+    std::vector<int> posToElemId(num);
+    for (int i = 0; i < num; ++i)
+    {
+        elemIdToPos[i] = i;
+        posToElemId[i] = i;
+    }
+    for (int i = 0; i < num; ++i)
+    {
+        int elemId = indices[i];
+        int pos = elemIdToPos[elemId];
+        if (pos != i)
+        {
+            data->SwapElements(i, pos);
+            const int swappedElemId = posToElemId[i];
+            elemIdToPos[elemId] = i;
+            elemIdToPos[swappedElemId] = pos;
+
+            posToElemId[i] = elemId;
+            posToElemId[pos] = swappedElemId;
+        }
+    }
+}
+
+// Is based on tensorflow::graph_transforms::SortByExecutionOrder
+void sortByExecutionOrder(tensorflow::GraphDef& net)
+{
+    // Maps node's name to index at net.node() list.
+    std::map<std::string, int> nodesMap;
+    std::map<std::string, int>::iterator nodesMapIt;
+    for (int i = 0; i < net.node_size(); ++i)
+    {
+        const tensorflow::NodeDef& node = net.node(i);
+        nodesMap.insert(std::make_pair(node.name(), i));
+    }
+
+    // Indices of nodes which use specific node as input.
+    std::vector<std::vector<int> > edges(nodesMap.size());
+    std::vector<int> numRefsToAdd(nodesMap.size(), 0);
+    std::vector<int> nodesToAdd;
+    for (int i = 0; i < net.node_size(); ++i)
+    {
+        const tensorflow::NodeDef& node = net.node(i);
+        for (int j = 0; j < node.input_size(); ++j)
+        {
+            std::string inpName = node.input(j);
+            inpName = inpName.substr(0, inpName.rfind(':'));
+            inpName = inpName.substr(inpName.find('^') + 1);
+
+            nodesMapIt = nodesMap.find(inpName);
+            CV_Assert(nodesMapIt != nodesMap.end());
+            edges[nodesMapIt->second].push_back(i);
+        }
+        if (node.input_size() == 0)
+            nodesToAdd.push_back(i);
+        else
+        {
+            if (node.op() == "Merge" || node.op() == "RefMerge")
+            {
+                int numControlEdges = 0;
+                for (int j = 0; j < node.input_size(); ++j)
+                    numControlEdges += node.input(j)[0] == '^';
+                numRefsToAdd[i] = numControlEdges + 1;
+            }
+            else
+                numRefsToAdd[i] = node.input_size();
+        }
+    }
+
+    std::vector<int> permIds;
+    permIds.reserve(net.node_size());
+    while (!nodesToAdd.empty())
+    {
+        int nodeToAdd = nodesToAdd.back();
+        nodesToAdd.pop_back();
+
+        permIds.push_back(nodeToAdd);
+        // std::cout << net.node(nodeToAdd).name() << '\n';
+
+        for (int i = 0; i < edges[nodeToAdd].size(); ++i)
+        {
+            int consumerId = edges[nodeToAdd][i];
+            if (numRefsToAdd[consumerId] > 0)
+            {
+                if (numRefsToAdd[consumerId] == 1)
+                    nodesToAdd.push_back(consumerId);
+                else
+                    CV_Assert(numRefsToAdd[consumerId] >= 0);
+                numRefsToAdd[consumerId] -= 1;
+            }
+        }
+    }
+    CV_Assert(permIds.size() == net.node_size());
+    permute(net.mutable_node(), permIds);
+}
+
 CV__DNN_INLINE_NS_END
 }}  // namespace dnn, namespace cv

--- a/modules/dnn/src/tensorflow/tf_graph_simplifier.hpp
+++ b/modules/dnn/src/tensorflow/tf_graph_simplifier.hpp
@ -25,6 +25,8 @@ Mat getTensorContent(const tensorflow::TensorProto &tensor);

 void releaseTensor(tensorflow::TensorProto* tensor);

+void sortByExecutionOrder(tensorflow::GraphDef& net);
+
 CV__DNN_INLINE_NS_END
 }}  // namespace dnn, namespace cv

--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@ -1950,5 +1950,34 @@ Net readNetFromTensorflow(const std::vector<uchar>& bufferModel, const std::vect
                                 bufferConfigPtr, bufferConfig.size());
 }

+void writeTextGraph(const String& _model, const String& output)
+{
+    String model = _model;
+    const std::string modelExt = model.substr(model.rfind('.') + 1);
+    if (modelExt != "pb")
+        CV_Error(Error::StsNotImplemented, "Only TensorFlow models support export to text file");
+
+    tensorflow::GraphDef net;
+    ReadTFNetParamsFromBinaryFileOrDie(model.c_str(), &net);
+
+    sortByExecutionOrder(net);
+
+    RepeatedPtrField<tensorflow::NodeDef>::iterator it;
+    for (it = net.mutable_node()->begin(); it != net.mutable_node()->end(); ++it)
+    {
+        if (it->op() == "Const")
+        {
+            it->mutable_attr()->at("value").mutable_tensor()->clear_tensor_content();
+        }
+    }
+
+    std::string content;
+    google::protobuf::TextFormat::PrintToString(net, &content);
+
+    std::ofstream ofs(output.c_str());
+    ofs << content;
+    ofs.close();
+}
+
 CV__DNN_INLINE_NS_END
 }} // namespace
--- a/modules/dnn/test/test_backends.cpp
+++ b/modules/dnn/test/test_backends.cpp
@ -161,7 +161,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow)
    if (backend == DNN_BACKEND_HALIDE)
        throw SkipTestException("");
    Mat sample = imread(findDataFile("dnn/street.png", false));
-    Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
+    Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false);
    float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.011 : 0.0;
    float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.06 : 0.0;
    processNet("dnn/ssd_mobilenet_v1_coco_2017_11_17.pb", "dnn/ssd_mobilenet_v1_coco_2017_11_17.pbtxt",
@ -173,7 +173,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow)
    if (backend == DNN_BACKEND_HALIDE)
        throw SkipTestException("");
    Mat sample = imread(findDataFile("dnn/street.png", false));
-    Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
+    Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false);
    float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.011 : 0.0;
    float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.062 : 0.0;
    processNet("dnn/ssd_mobilenet_v2_coco_2018_03_29.pb", "dnn/ssd_mobilenet_v2_coco_2018_03_29.pbtxt",
@ -247,8 +247,8 @@ TEST_P(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
    if (backend == DNN_BACKEND_HALIDE)
        throw SkipTestException("");
    Mat sample = imread(findDataFile("dnn/street.png", false));
-    Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
-    float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.008 : 0.0;
+    Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false);
+    float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.015 : 0.0;
    float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.0731 : 0.0;
    processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "dnn/ssd_inception_v2_coco_2017_11_17.pbtxt",
               inp, "detection_out", "", l1, lInf);
@ -285,21 +285,6 @@ TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16)
    processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", inp, "", "", l1, lInf);
 }

-const tuple<Backend, Target> testCases[] = {
-#ifdef HAVE_HALIDE
-    tuple<Backend, Target>(DNN_BACKEND_HALIDE, DNN_TARGET_CPU),
-    tuple<Backend, Target>(DNN_BACKEND_HALIDE, DNN_TARGET_OPENCL),
-#endif
-#ifdef HAVE_INF_ENGINE
-    tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU),
-    tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
-    tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
-    tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD),
-#endif
-    tuple<Backend, Target>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL),
-    tuple<Backend, Target>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16)
-};
-
-INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, testing::ValuesIn(testCases));
+INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, dnnBackendsAndTargets(true, true, false));

 }} // namespace
--- a/modules/dnn/test/test_caffe_importer.cpp
+++ b/modules/dnn/test/test_caffe_importer.cpp
@ -417,7 +417,7 @@ TEST_P(Test_Caffe_nets, DenseNet_121)
    float l1 = default_l1, lInf = default_lInf;
    if (target == DNN_TARGET_OPENCL_FP16)
    {
-        l1 = 0.017; lInf = 0.067;
+        l1 = 0.017; lInf = 0.0795;
    }
    else if (target == DNN_TARGET_MYRIAD)
    {
@ -490,8 +490,7 @@ INSTANTIATE_TEST_CASE_P(Test_Caffe, opencv_face_detector,

 TEST_P(Test_Caffe_nets, FasterRCNN_vgg16)
 {
-    if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) ||
-        (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
        throw SkipTestException("");
    static Mat ref = (Mat_<float>(3, 7) << 0, 2, 0.949398, 99.2454, 210.141, 601.205, 462.849,
                                           0, 7, 0.997022, 481.841, 92.3218, 722.685, 175.953,
@ -502,8 +501,7 @@ TEST_P(Test_Caffe_nets, FasterRCNN_vgg16)
 TEST_P(Test_Caffe_nets, FasterRCNN_zf)
 {
    if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) ||
-        (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) ||
-        (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+        (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD))
        throw SkipTestException("");
    static Mat ref = (Mat_<float>(3, 7) << 0, 2, 0.90121, 120.407, 115.83, 570.586, 528.395,
                                           0, 7, 0.988779, 469.849, 75.1756, 718.64, 186.762,
@ -514,12 +512,13 @@ TEST_P(Test_Caffe_nets, FasterRCNN_zf)
 TEST_P(Test_Caffe_nets, RFCN)
 {
    if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) ||
-        (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) ||
-        (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+        (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD))
        throw SkipTestException("");
+    double scoreDiff = (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) ? 4e-3 : default_l1;
+    double iouDiff = (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) ? 8e-2 : default_lInf;
    static Mat ref = (Mat_<float>(2, 7) << 0, 7, 0.991359, 491.822, 81.1668, 702.573, 178.234,
                                           0, 12, 0.94786, 132.093, 223.903, 338.077, 566.16);
-    testFaster("rfcn_pascal_voc_resnet50.prototxt", "resnet50_rfcn_final.caffemodel", ref);
+    testFaster("rfcn_pascal_voc_resnet50.prototxt", "resnet50_rfcn_final.caffemodel", ref, scoreDiff, iouDiff);
 }

 INSTANTIATE_TEST_CASE_P(/**/, Test_Caffe_nets, dnnBackendsAndTargets());
--- a/modules/dnn/test/test_common.hpp
+++ b/modules/dnn/test/test_common.hpp
@ -42,6 +42,47 @@
 #ifndef __OPENCV_TEST_COMMON_HPP__
 #define __OPENCV_TEST_COMMON_HPP__

+#ifdef HAVE_OPENCL
+#include "opencv2/core/ocl.hpp"
+#endif
+
+namespace cv { namespace dnn {
+CV__DNN_INLINE_NS_BEGIN
+static inline void PrintTo(const cv::dnn::Backend& v, std::ostream* os)
+{
+    switch (v) {
+    case DNN_BACKEND_DEFAULT: *os << "DEFAULT"; return;
+    case DNN_BACKEND_HALIDE: *os << "HALIDE"; return;
+    case DNN_BACKEND_INFERENCE_ENGINE: *os << "DLIE"; return;
+    case DNN_BACKEND_OPENCV: *os << "OCV"; return;
+    } // don't use "default:" to emit compiler warnings
+    *os << "DNN_BACKEND_UNKNOWN(" << v << ")";
+}
+
+static inline void PrintTo(const cv::dnn::Target& v, std::ostream* os)
+{
+    switch (v) {
+    case DNN_TARGET_CPU: *os << "CPU"; return;
+    case DNN_TARGET_OPENCL: *os << "OCL"; return;
+    case DNN_TARGET_OPENCL_FP16: *os << "OCL_FP16"; return;
+    case DNN_TARGET_MYRIAD: *os << "MYRIAD"; return;
+    } // don't use "default:" to emit compiler warnings
+    *os << "DNN_TARGET_UNKNOWN(" << v << ")";
+}
+
+using opencv_test::tuple;
+using opencv_test::get;
+static inline void PrintTo(const tuple<cv::dnn::Backend, cv::dnn::Target> v, std::ostream* os)
+{
+    PrintTo(get<0>(v), os);
+    *os << "/";
+    PrintTo(get<1>(v), os);
+}
+
+CV__DNN_INLINE_NS_END
+}} // namespace
+
+
 static inline const std::string &getOpenCVExtraDir()
 {
    return cvtest::TS::ptr()->get_data_path();
@ -190,4 +231,54 @@ static inline bool readFileInMemory(const std::string& filename, std::string& co
    return true;
 }

+namespace opencv_test {
+
+using namespace cv::dnn;
+
+static testing::internal::ParamGenerator<tuple<Backend, Target> > dnnBackendsAndTargets(
+        bool withInferenceEngine = true,
+        bool withHalide = false,
+        bool withCpuOCV = true
+)
+{
+    std::vector<tuple<Backend, Target> > targets;
+#ifdef HAVE_HALIDE
+    if (withHalide)
+    {
+        targets.push_back(make_tuple(DNN_BACKEND_HALIDE, DNN_TARGET_CPU));
+#ifdef HAVE_OPENCL
+        if (cv::ocl::useOpenCL())
+            targets.push_back(make_tuple(DNN_BACKEND_HALIDE, DNN_TARGET_OPENCL));
+#endif
+    }
+#endif
+#ifdef HAVE_INF_ENGINE
+    if (withInferenceEngine)
+    {
+        targets.push_back(make_tuple(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU));
+#ifdef HAVE_OPENCL
+        if (cv::ocl::useOpenCL())
+        {
+            targets.push_back(make_tuple(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL));
+            targets.push_back(make_tuple(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16));
+        }
+#endif
+        if (checkMyriadTarget())
+            targets.push_back(make_tuple(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD));
+    }
+#endif
+    if (withCpuOCV)
+        targets.push_back(make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU));
+#ifdef HAVE_OPENCL
+    if (cv::ocl::useOpenCL())
+    {
+        targets.push_back(make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL));
+        targets.push_back(make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16));
+    }
+#endif
+    return testing::ValuesIn(targets);
+}
+
+} // namespace
+
 #endif
--- a/modules/dnn/test/test_halide_layers.cpp
+++ b/modules/dnn/test/test_halide_layers.cpp
@ -44,23 +44,9 @@ static void test(LayerParams& params, Mat& input, Backend backendId, Target targ
    test(input, net, backendId, targetId, skipCheck);
 }

-static testing::internal::ParamGenerator<tuple<Backend, Target> > dnnBackendsAndTargetsWithHalide()
+static inline testing::internal::ParamGenerator<tuple<Backend, Target> > dnnBackendsAndTargetsWithHalide()
 {
-    static const tuple<Backend, Target> testCases[] = {
-#ifdef HAVE_HALIDE
-        tuple<Backend, Target>(DNN_BACKEND_HALIDE, DNN_TARGET_CPU),
-        tuple<Backend, Target>(DNN_BACKEND_HALIDE, DNN_TARGET_OPENCL),
-#endif
-#ifdef HAVE_INF_ENGINE
-        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU),
-        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
-        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
-        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD),
-#endif
-        tuple<Backend, Target>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL),
-        tuple<Backend, Target>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16)
-    };
-    return testing::ValuesIn(testCases);
+    return dnnBackendsAndTargets(true, true, false); // OpenCV/CPU is used as reference
 }

 class Test_Halide_layers : public DNNTestLayer {};
--- a/modules/dnn/test/test_ie_models.cpp
+++ b/modules/dnn/test/test_ie_models.cpp
@ -177,10 +177,6 @@ TEST_P(DNNTestOpenVINO, models)
    Target target = (dnn::Target)(int)get<0>(GetParam());
    std::string modelName = get<1>(GetParam());

-    if ((modelName == "semantic-segmentation-adas-0001" && target == DNN_TARGET_OPENCL_FP16) ||
-        (modelName == "vehicle-license-plate-detection-barrier-0106"))
-        throw SkipTestException("");
-
    std::string precision = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? "FP16" : "FP32";
    std::string prefix = utils::fs::join("intel_models",
                         utils::fs::join(modelName,
--- a/modules/dnn/test/test_precomp.hpp
+++ b/modules/dnn/test/test_precomp.hpp
@ -49,35 +49,6 @@
 #include "opencv2/dnn.hpp"
 #include "test_common.hpp"

-namespace cv {
-namespace dnn {
-CV__DNN_INLINE_NS_BEGIN
-
-static inline void PrintTo(const cv::dnn::Backend& v, std::ostream* os)
-{
-    switch (v) {
-    case DNN_BACKEND_DEFAULT: *os << "DNN_BACKEND_DEFAULT"; return;
-    case DNN_BACKEND_HALIDE: *os << "DNN_BACKEND_HALIDE"; return;
-    case DNN_BACKEND_INFERENCE_ENGINE: *os << "DNN_BACKEND_INFERENCE_ENGINE"; return;
-    case DNN_BACKEND_OPENCV: *os << "DNN_BACKEND_OPENCV"; return;
-    } // don't use "default:" to emit compiler warnings
-    *os << "DNN_BACKEND_UNKNOWN(" << v << ")";
-}
-
-static inline void PrintTo(const cv::dnn::Target& v, std::ostream* os)
-{
-    switch (v) {
-    case DNN_TARGET_CPU: *os << "DNN_TARGET_CPU"; return;
-    case DNN_TARGET_OPENCL: *os << "DNN_TARGET_OPENCL"; return;
-    case DNN_TARGET_OPENCL_FP16: *os << "DNN_TARGET_OPENCL_FP16"; return;
-    case DNN_TARGET_MYRIAD: *os << "DNN_TARGET_MYRIAD"; return;
-    } // don't use "default:" to emit compiler warnings
-    *os << "DNN_TARGET_UNKNOWN(" << v << ")";
-}
-
-CV__DNN_INLINE_NS_END
-}} // namespace
-
 namespace opencv_test {
 using namespace cv::dnn;

@ -95,22 +66,6 @@ static testing::internal::ParamGenerator<Target> availableDnnTargets()
    return testing::ValuesIn(targets);
 }

-static testing::internal::ParamGenerator<tuple<Backend, Target> > dnnBackendsAndTargets()
-{
-    static const tuple<Backend, Target> testCases[] = {
-    #ifdef HAVE_INF_ENGINE
-        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU),
-        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
-        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
-        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD),
-    #endif
-        tuple<Backend, Target>(DNN_BACKEND_OPENCV, DNN_TARGET_CPU),
-        tuple<Backend, Target>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL),
-        tuple<Backend, Target>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16)
-    };
-    return testing::ValuesIn(testCases);
-}
-
 class DNNTestLayer : public TestWithParam<tuple<Backend, Target> >
 {
 public:
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@ -296,7 +296,7 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)

    Net net = readNetFromTensorflow(model, proto);
    Mat img = imread(findDataFile("dnn/street.png", false));
-    Mat blob = blobFromImage(img, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), true, false);
+    Mat blob = blobFromImage(img, 1.0f, Size(300, 300), Scalar(), true, false);

    net.setPreferableBackend(backend);
    net.setPreferableTarget(target);
@ -310,32 +310,61 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)
                                    0, 3, 0.75838411, 0.44668293, 0.45907149, 0.49459291, 0.52197015,
                                    0, 10, 0.95932811, 0.38349164, 0.32528657, 0.40387636, 0.39165527,
                                    0, 10, 0.93973452, 0.66561931, 0.37841269, 0.68074018, 0.42907384);
-    double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 5e-3 : default_l1;
+    double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.0097 : default_l1;
    double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.09 : default_lInf;
    normAssertDetections(ref, out, "", 0.5, scoreDiff, iouDiff);
 }

-TEST_P(Test_TensorFlow_nets, Inception_v2_Faster_RCNN)
+TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD)
 {
    checkBackend();
+
+    std::string model = findDataFile("dnn/ssd_mobilenet_v1_coco_2017_11_17.pb", false);
+    std::string proto = findDataFile("dnn/ssd_mobilenet_v1_coco_2017_11_17.pbtxt", false);
+
+    Net net = readNetFromTensorflow(model, proto);
+    Mat img = imread(findDataFile("dnn/dog416.png", false));
+    Mat blob = blobFromImage(img, 1.0f, Size(300, 300), Scalar(), true, false);
+
+    net.setPreferableBackend(backend);
+    net.setPreferableTarget(target);
+
+    net.setInput(blob);
+    Mat out = net.forward();
+
+    Mat ref = blobFromNPY(findDataFile("dnn/tensorflow/ssd_mobilenet_v1_coco_2017_11_17.detection_out.npy"));
+    float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 7e-3 : 1e-5;
+    float iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.0098 : 1e-3;
+    normAssertDetections(ref, out, "", 0.3, scoreDiff, iouDiff);
+}
+
+TEST_P(Test_TensorFlow_nets, Faster_RCNN)
+{
+    static std::string names[] = {"faster_rcnn_inception_v2_coco_2018_01_28",
+                                  "faster_rcnn_resnet50_coco_2018_01_28"};
+
+    checkBackend();
    if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) ||
        (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
        throw SkipTestException("");

-    std::string proto = findDataFile("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt", false);
-    std::string model = findDataFile("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pb", false);
+    for (int i = 1; i < 2; ++i)
+    {
+        std::string proto = findDataFile("dnn/" + names[i] + ".pbtxt", false);
+        std::string model = findDataFile("dnn/" + names[i] + ".pb", false);

-    Net net = readNetFromTensorflow(model, proto);
-    net.setPreferableBackend(backend);
-    net.setPreferableTarget(target);
-    Mat img = imread(findDataFile("dnn/dog416.png", false));
-    Mat blob = blobFromImage(img, 1.0f / 127.5, Size(800, 600), Scalar(127.5, 127.5, 127.5), true, false);
+        Net net = readNetFromTensorflow(model, proto);
+        net.setPreferableBackend(backend);
+        net.setPreferableTarget(target);
+        Mat img = imread(findDataFile("dnn/dog416.png", false));
+        Mat blob = blobFromImage(img, 1.0f, Size(800, 600), Scalar(), true, false);

-    net.setInput(blob);
-    Mat out = net.forward();
+        net.setInput(blob);
+        Mat out = net.forward();

-    Mat ref = blobFromNPY(findDataFile("dnn/tensorflow/faster_rcnn_inception_v2_coco_2018_01_28.detection_out.npy"));
-    normAssertDetections(ref, out, "", 0.3);
+        Mat ref = blobFromNPY(findDataFile("dnn/tensorflow/" + names[i] + ".detection_out.npy"));
+        normAssertDetections(ref, out, names[i].c_str(), 0.3);
+    }
 }

 TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD_PPN)
@ -347,15 +376,17 @@ TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD_PPN)
    Net net = readNetFromTensorflow(model, proto);
    Mat img = imread(findDataFile("dnn/dog416.png", false));
    Mat ref = blobFromNPY(findDataFile("dnn/tensorflow/ssd_mobilenet_v1_ppn_coco.detection_out.npy", false));
-    Mat blob = blobFromImage(img, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), true, false);
+    Mat blob = blobFromImage(img, 1.0f, Size(300, 300), Scalar(), true, false);

    net.setPreferableBackend(backend);
    net.setPreferableTarget(target);

    net.setInput(blob);
    Mat out = net.forward();
-    double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.006 : default_l1;
-    normAssertDetections(ref, out, "", 0.4, scoreDiff, default_lInf);
+
+    double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.011 : default_l1;
+    double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.021 : default_lInf;
+    normAssertDetections(ref, out, "", 0.4, scoreDiff, iouDiff);
 }

 TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8)
--- a/modules/dnn/test/test_torch_importer.cpp
+++ b/modules/dnn/test/test_torch_importer.cpp
@ -301,14 +301,14 @@ TEST_P(Test_Torch_nets, ENet_accuracy)
    // Due to numerical instability in Pooling-Unpooling layers (indexes jittering)
    // thresholds for ENet must be changed. Accuracy of results was checked on
    // Cityscapes dataset and difference in mIOU with Torch is 10E-4%
-    normAssert(ref, out, "", 0.00044, 0.44);
+    normAssert(ref, out, "", 0.00044, target == DNN_TARGET_CPU ? 0.453 : 0.44);

    const int N = 3;
    for (int i = 0; i < N; i++)
    {
        net.setInput(inputBlob, "");
        Mat out = net.forward();
-        normAssert(ref, out, "", 0.00044, 0.44);
+        normAssert(ref, out, "", 0.00044, target == DNN_TARGET_CPU ? 0.453 : 0.44);
    }
 }

--- a/modules/imgcodecs/src/grfmt_webp.cpp
+++ b/modules/imgcodecs/src/grfmt_webp.cpp
@ -54,15 +54,21 @@

 #include "opencv2/imgproc.hpp"

-const size_t WEBP_HEADER_SIZE = 32;
+#include <opencv2/core/utils/configuration.private.hpp>

 namespace cv
 {

+// 64Mb limit to avoid memory DDOS
+static size_t param_maxFileSize = utils::getConfigurationParameterSizeT("OPENCV_IMGCODECS_WEBP_MAX_FILE_SIZE", 64*1024*1024);
+
+static const size_t WEBP_HEADER_SIZE = 32;
+
 WebPDecoder::WebPDecoder()
 {
    m_buf_supported = true;
    channels = 0;
+    fs_size = 0;
 }

 WebPDecoder::~WebPDecoder() {}
@ -96,48 +102,29 @@ ImageDecoder WebPDecoder::newDecoder() const

 bool WebPDecoder::readHeader()
 {
+    uint8_t header[WEBP_HEADER_SIZE] = { 0 };
    if (m_buf.empty())
    {
-        FILE * wfile = NULL;
+        fs.open(m_filename.c_str(), std::ios::binary);
+        fs.seekg(0, std::ios::end);
+        fs_size = safeCastToSizeT(fs.tellg(), "File is too large");
+        fs.seekg(0, std::ios::beg);
+        CV_Assert(fs && "File stream error");
+        CV_CheckGE(fs_size, WEBP_HEADER_SIZE, "File is too small");
+        CV_CheckLE(fs_size, param_maxFileSize, "File is too large. Increase OPENCV_IMGCODECS_WEBP_MAX_FILE_SIZE parameter if you want to process large files");

-        wfile = fopen(m_filename.c_str(), "rb");
-
-        if(wfile == NULL)
-        {
-            return false;
-        }
-
-        fseek(wfile, 0, SEEK_END);
-        long int wfile_size = ftell(wfile);
-        fseek(wfile, 0, SEEK_SET);
-
-        if(wfile_size > static_cast<long int>(INT_MAX))
-        {
-            fclose(wfile);
-            return false;
-        }
-
-        data.create(1, (int)wfile_size, CV_8U);
-
-        size_t data_size = fread(data.ptr(), 1, wfile_size, wfile);
-
-        if(wfile)
-        {
-            fclose(wfile);
-        }
-
-        if(static_cast<long int>(data_size) != wfile_size)
-        {
-            return false;
-        }
+        fs.read((char*)header, sizeof(header));
+        CV_Assert(fs && "Can't read WEBP_HEADER_SIZE bytes");
    }
    else
    {
+        CV_CheckGE(m_buf.total(), WEBP_HEADER_SIZE, "Buffer is too small");
+        memcpy(header, m_buf.ptr(), sizeof(header));
        data = m_buf;
    }

    WebPBitstreamFeatures features;
-    if(VP8_STATUS_OK == WebPGetFeatures(data.ptr(), WEBP_HEADER_SIZE, &features))
+    if (VP8_STATUS_OK == WebPGetFeatures(header, sizeof(header), &features))
    {
        m_width  = features.width;
        m_height = features.height;
@ -161,41 +148,75 @@ bool WebPDecoder::readHeader()

 bool WebPDecoder::readData(Mat &img)
 {
-    if( m_width > 0 && m_height > 0 )
-    {
-        bool convert_grayscale = (img.type() == CV_8UC1); // IMREAD_GRAYSCALE requested
+    CV_CheckGE(m_width, 0, ""); CV_CheckGE(m_height, 0, "");

-        if (img.cols != m_width || img.rows != m_height || img.type() != m_type)
+    CV_CheckEQ(img.cols, m_width, "");
+    CV_CheckEQ(img.rows, m_height, "");
+
+    if (m_buf.empty())
+    {
+        fs.seekg(0, std::ios::beg); CV_Assert(fs && "File stream error");
+        data.create(1, validateToInt(fs_size), CV_8UC1);
+        fs.read((char*)data.ptr(), fs_size);
+        CV_Assert(fs && "Can't read file data");
+        fs.close();
+    }
+    CV_Assert(data.type() == CV_8UC1); CV_Assert(data.rows == 1);
+
+    {
+        Mat read_img;
+        CV_CheckType(img.type(), img.type() == CV_8UC1 || img.type() == CV_8UC3 || img.type() == CV_8UC4, "");
+        if (img.type() != m_type)
        {
-            img.create(m_height, m_width, m_type);
+            read_img.create(m_height, m_width, m_type);
+        }
+        else
+        {
+            read_img = img;  // copy header
        }

-        uchar* out_data = img.ptr();
-        size_t out_data_size = img.cols * img.rows * img.elemSize();
+        uchar* out_data = read_img.ptr();
+        size_t out_data_size = read_img.dataend - out_data;

-        uchar *res_ptr = 0;
+        uchar *res_ptr = NULL;
        if (channels == 3)
        {
+            CV_CheckTypeEQ(read_img.type(), CV_8UC3, "");
            res_ptr = WebPDecodeBGRInto(data.ptr(), data.total(), out_data,
-                                        (int)out_data_size, (int)img.step);
+                                        (int)out_data_size, (int)read_img.step);
        }
        else if (channels == 4)
        {
+            CV_CheckTypeEQ(read_img.type(), CV_8UC4, "");
            res_ptr = WebPDecodeBGRAInto(data.ptr(), data.total(), out_data,
-                                         (int)out_data_size, (int)img.step);
+                                         (int)out_data_size, (int)read_img.step);
        }

-        if(res_ptr == out_data)
+        if (res_ptr != out_data)
+            return false;
+
+        if (read_img.data == img.data && img.type() == m_type)
        {
-            if (convert_grayscale)
-            {
-                cvtColor(img, img, COLOR_BGR2GRAY);
-            }
-            return true;
+            // nothing
+        }
+        else if (img.type() == CV_8UC1)
+        {
+            cvtColor(read_img, img, COLOR_BGR2GRAY);
+        }
+        else if (img.type() == CV_8UC3 && m_type == CV_8UC4)
+        {
+            cvtColor(read_img, img, COLOR_BGRA2BGR);
+        }
+        else if (img.type() == CV_8UC3 && m_type == CV_8UC4)
+        {
+            cvtColor(read_img, img, COLOR_BGRA2BGR);
+        }
+        else
+        {
+            CV_Error(Error::StsInternal, "");
        }
    }
-
-    return false;
+    return true;
 }

 WebPEncoder::WebPEncoder()
@ -213,12 +234,9 @@ ImageEncoder WebPEncoder::newEncoder() const

 bool WebPEncoder::write(const Mat& img, const std::vector<int>& params)
 {
-    int channels = img.channels(), depth = img.depth();
-    int width = img.cols, height = img.rows;
+    CV_CheckDepthEQ(img.depth(), CV_8U, "WebP codec supports 8U images only");

-    const Mat *image = &img;
-    Mat temp;
-    size_t size = 0;
+    const int width = img.cols, height = img.rows;

    bool comp_lossless = true;
    float quality = 100.0f;
@ -240,69 +258,64 @@ bool WebPEncoder::write(const Mat& img, const std::vector<int>& params)
        }
    }

-    uint8_t *out = NULL;
+    int channels = img.channels();
+    CV_Check(channels, channels == 1 || channels == 3 || channels == 4, "");

-    if(depth != CV_8U)
-    {
-        return false;
-    }
+    const Mat *image = &img;
+    Mat temp;

-    if(channels == 1)
+    if (channels == 1)
    {
        cvtColor(*image, temp, CV_GRAY2BGR);
        image = &temp;
        channels = 3;
    }
-    else if (channels == 2)
-    {
-        return false;
-    }

+    uint8_t *out = NULL;
+    size_t size = 0;
    if (comp_lossless)
    {
-        if(channels == 3)
+        if (channels == 3)
        {
            size = WebPEncodeLosslessBGR(image->ptr(), width, height, (int)image->step, &out);
        }
-        else if(channels == 4)
+        else if (channels == 4)
        {
            size = WebPEncodeLosslessBGRA(image->ptr(), width, height, (int)image->step, &out);
        }
    }
    else
    {
-        if(channels == 3)
+        if (channels == 3)
        {
            size = WebPEncodeBGR(image->ptr(), width, height, (int)image->step, quality, &out);
        }
-        else if(channels == 4)
+        else if (channels == 4)
        {
            size = WebPEncodeBGRA(image->ptr(), width, height, (int)image->step, quality, &out);
        }
    }
+#if WEBP_DECODER_ABI_VERSION >= 0x0206
+    Ptr<uint8_t> out_cleaner(out, WebPFree);
+#else
+    Ptr<uint8_t> out_cleaner(out, free);
+#endif

-    if(size > 0)
+    CV_Assert(size > 0);
+
+    if (m_buf)
    {
-        if(m_buf)
-        {
-            m_buf->resize(size);
-            memcpy(&(*m_buf)[0], out, size);
-        }
-        else
-        {
-            FILE *fd = fopen(m_filename.c_str(), "wb");
-            if(fd != NULL)
-            {
-                fwrite(out, size, sizeof(uint8_t), fd);
-                fclose(fd); fd = NULL;
-            }
-        }
+        m_buf->resize(size);
+        memcpy(&(*m_buf)[0], out, size);
    }
-
-    if (out != NULL)
+    else
    {
-        free(out);
-        out = NULL;
+        FILE *fd = fopen(m_filename.c_str(), "wb");
+        if (fd != NULL)
+        {
+            fwrite(out, size, sizeof(uint8_t), fd);
+            fclose(fd); fd = NULL;
+        }
    }

    return size > 0;
--- a/modules/imgcodecs/src/grfmt_webp.hpp
+++ b/modules/imgcodecs/src/grfmt_webp.hpp
@ -47,7 +47,7 @@

 #ifdef HAVE_WEBP

-
+#include <fstream>

 namespace cv
 {
@ -61,7 +61,6 @@ public:

    bool readData( Mat& img ) CV_OVERRIDE;
    bool readHeader() CV_OVERRIDE;
-    void close();

    size_t signatureLength() const CV_OVERRIDE;
    bool checkSignature( const String& signature) const CV_OVERRIDE;
@ -69,6 +68,8 @@ public:
    ImageDecoder newDecoder() const CV_OVERRIDE;

 protected:
+    std::ifstream fs;
+    size_t fs_size;
    Mat data;
    int channels;
 };
--- a/modules/imgcodecs/src/loadsave.cpp
+++ b/modules/imgcodecs/src/loadsave.cpp
@ -400,6 +400,8 @@ static void ApplyExifOrientation(const Mat& buf, Mat& img)
 static void*
 imread_( const String& filename, int flags, int hdrtype, Mat* mat=0 )
 {
+    CV_Assert(mat || hdrtype != LOAD_MAT); // mat is required in LOAD_MAT case
+
    IplImage* image = 0;
    CvMat *matrix = 0;
    Mat temp, *data = &temp;
@ -711,11 +713,22 @@ static bool imwrite_( const String& filename, const std::vector<Mat>& img_vec,

    encoder->setDestination( filename );
    CV_Assert(params.size() <= CV_IO_MAX_IMAGE_PARAMS*2);
-    bool code;
-    if (!isMultiImg)
-        code = encoder->write( write_vec[0], params );
-    else
-        code = encoder->writemulti( write_vec, params ); //to be implemented
+    bool code = false;
+    try
+    {
+        if (!isMultiImg)
+            code = encoder->write( write_vec[0], params );
+        else
+            code = encoder->writemulti( write_vec, params ); //to be implemented
+    }
+    catch (const cv::Exception& e)
+    {
+        std::cerr << "imwrite_('" << filename << "'): can't write data: " << e.what() << std::endl << std::flush;
+    }
+    catch (...)
+    {
+        std::cerr << "imwrite_('" << filename << "'): can't write data: unknown exception" << std::endl << std::flush;
+    }

    //    CV_Assert( code );
    return code;
--- a/modules/imgcodecs/src/utils.hpp
+++ b/modules/imgcodecs/src/utils.hpp
@ -44,6 +44,15 @@

 int validateToInt(size_t step);

+template <typename _Tp> static inline
+size_t safeCastToSizeT(const _Tp v_origin, const char* msg)
+{
+    const size_t value_cast = (size_t)v_origin;
+    if ((_Tp)value_cast != v_origin)
+        CV_Error(cv::Error::StsError, msg ? msg : "Can't cast value into size_t");
+    return value_cast;
+}
+
 struct PaletteEntry
 {
    unsigned char b, g, r, a;
--- a/modules/imgcodecs/test/test_webp.cpp
+++ b/modules/imgcodecs/test/test_webp.cpp
@ -96,12 +96,17 @@ TEST(Imgcodecs_WebP, encode_decode_with_alpha_webp)
    string output = cv::tempfile(".webp");

    EXPECT_NO_THROW(cv::imwrite(output, img));
-    cv::Mat img_webp = cv::imread(output);
+    cv::Mat img_webp = cv::imread(output, IMREAD_UNCHANGED);
+    cv::Mat img_webp_bgr = cv::imread(output); // IMREAD_COLOR by default
    EXPECT_EQ(0, remove(output.c_str()));
    EXPECT_FALSE(img_webp.empty());
    EXPECT_EQ(4,   img_webp.channels());
    EXPECT_EQ(512, img_webp.cols);
    EXPECT_EQ(512, img_webp.rows);
+    EXPECT_FALSE(img_webp_bgr.empty());
+    EXPECT_EQ(3,   img_webp_bgr.channels());
+    EXPECT_EQ(512, img_webp_bgr.cols);
+    EXPECT_EQ(512, img_webp_bgr.rows);
 }

 #endif // HAVE_WEBP
--- a/modules/imgproc/misc/java/test/Subdiv2DTest.java
+++ b/modules/imgproc/misc/java/test/Subdiv2DTest.java
@ -52,7 +52,7 @@ public class Subdiv2DTest extends OpenCVTestCase {
        s2d.insert( new Point(10, 20) );
        MatOfFloat6 triangles = new MatOfFloat6();
        s2d.getTriangleList(triangles);
-        assertEquals(10, triangles.rows());
+        assertEquals(2, triangles.rows());
        /*
        int cnt = triangles.rows();
        float buff[] = new float[cnt*6];
--- a/modules/imgproc/src/accum.cpp
+++ b/modules/imgproc/src/accum.cpp
@ -332,7 +332,7 @@ void cv::accumulate( InputArray _src, InputOutputArray _dst, InputArray _mask )
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &dst, &mask, 0};
-    uchar* ptrs[3]{};
+    uchar* ptrs[3] = {};
    NAryMatIterator it(arrays, ptrs);
    int len = (int)it.size;

@ -430,7 +430,7 @@ void cv::accumulateSquare( InputArray _src, InputOutputArray _dst, InputArray _m
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &dst, &mask, 0};
-    uchar* ptrs[3]{};
+    uchar* ptrs[3] = {};
    NAryMatIterator it(arrays, ptrs);
    int len = (int)it.size;

@ -533,7 +533,7 @@ void cv::accumulateProduct( InputArray _src1, InputArray _src2,
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src1, &src2, &dst, &mask, 0};
-    uchar* ptrs[4]{};
+    uchar* ptrs[4] = {};
    NAryMatIterator it(arrays, ptrs);
    int len = (int)it.size;

@ -635,7 +635,7 @@ void cv::accumulateWeighted( InputArray _src, InputOutputArray _dst,
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &dst, &mask, 0};
-    uchar* ptrs[3]{};
+    uchar* ptrs[3] = {};
    NAryMatIterator it(arrays, ptrs);
    int len = (int)it.size;

--- a/modules/imgproc/src/contours.cpp
+++ b/modules/imgproc/src/contours.cpp
@ -1123,7 +1123,6 @@ cvFindNextContour( CvContourScanner scanner )
 #endif
            {
                _CvContourInfo *par_info = 0;
-                _CvContourInfo *l_cinfo = 0;
                CvSeq *seq = 0;
                int is_hole = 0;
                CvPoint origin;
@ -1215,6 +1214,7 @@ cvFindNextContour( CvContourScanner scanner )
                seq->flags |= is_hole ? CV_SEQ_FLAG_HOLE : 0;

                /* initialize header */
+                _CvContourInfo *l_cinfo = 0;
                if( mode <= 1 )
                {
                    l_cinfo = &(scanner->cinfo_temp);
@ -1225,10 +1225,8 @@ cvFindNextContour( CvContourScanner scanner )
                }
                else
                {
-                    union { _CvContourInfo* ci; CvSetElem* se; } v;
-                    v.ci = l_cinfo;
-                    cvSetAdd( scanner->cinfo_set, 0, &v.se );
-                    l_cinfo = v.ci;
+                    cvSetAdd(scanner->cinfo_set, 0, (CvSetElem**)&l_cinfo);
+                    CV_Assert(l_cinfo);
                    int lval;

                    if( img_i )
@ -1298,16 +1296,16 @@ cvFindNextContour( CvContourScanner scanner )
                scanner->img = (schar *) img;
                scanner->nbd = nbd;
                return l_cinfo->contour;
-
-            resume_scan:
-
+            }
+        resume_scan:
+            {
                prev = p;
                /* update lnbd */
                if( prev & -2 )
                {
                    lnbd.x = x;
                }
-            }                   /* end of prev != p */
+            }
        }                       /* end of loop on x */

        lnbd.x = 0;
--- a/modules/imgproc/src/distransform.cpp
+++ b/modules/imgproc/src/distransform.cpp
@ -45,7 +45,8 @@ namespace cv
 {

 static const int DIST_SHIFT = 16;
-static const int INIT_DIST0 = (INT_MAX >> 2);
+static const int INIT_DIST0 = INT_MAX;
+static const int DIST_MAX   = (INT_MAX >> 2);
 #define  CV_FLT_TO_FIX(x,n)  cvRound((x)*(1<<(n)))

 static void
@ -71,8 +72,8 @@ distanceTransform_3x3( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
 {
    const int BORDER = 1;
    int i, j;
-    const int HV_DIST = CV_FLT_TO_FIX( metrics[0], DIST_SHIFT );
-    const int DIAG_DIST = CV_FLT_TO_FIX( metrics[1], DIST_SHIFT );
+    const unsigned int HV_DIST = CV_FLT_TO_FIX( metrics[0], DIST_SHIFT );
+    const unsigned int DIAG_DIST = CV_FLT_TO_FIX( metrics[1], DIST_SHIFT );
    const float scale = 1.f/(1 << DIST_SHIFT);

    const uchar* src = _src.ptr();
@ -89,7 +90,7 @@ distanceTransform_3x3( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
    for( i = 0; i < size.height; i++ )
    {
        const uchar* s = src + i*srcstep;
-        int* tmp = (int*)(temp + (i+BORDER)*step) + BORDER;
+        unsigned int* tmp = (unsigned int*)(temp + (i+BORDER)*step) + BORDER;

        for( j = 0; j < BORDER; j++ )
            tmp[-j-1] = tmp[size.width + j] = INIT_DIST0;
@ -100,8 +101,8 @@ distanceTransform_3x3( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
                tmp[j] = 0;
            else
            {
-                int t0 = tmp[j-step-1] + DIAG_DIST;
-                int t = tmp[j-step] + HV_DIST;
+                unsigned int t0 = tmp[j-step-1] + DIAG_DIST;
+                unsigned int t = tmp[j-step] + HV_DIST;
                if( t0 > t ) t0 = t;
                t = tmp[j-step+1] + DIAG_DIST;
                if( t0 > t ) t0 = t;
@ -116,14 +117,14 @@ distanceTransform_3x3( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
    for( i = size.height - 1; i >= 0; i-- )
    {
        float* d = (float*)(dist + i*dststep);
-        int* tmp = (int*)(temp + (i+BORDER)*step) + BORDER;
+        unsigned int* tmp = (unsigned int*)(temp + (i+BORDER)*step) + BORDER;

        for( j = size.width - 1; j >= 0; j-- )
        {
-            int t0 = tmp[j];
+            unsigned int t0 = tmp[j];
            if( t0 > HV_DIST )
            {
-                int t = tmp[j+step+1] + DIAG_DIST;
+                unsigned int t = tmp[j+step+1] + DIAG_DIST;
                if( t0 > t ) t0 = t;
                t = tmp[j+step] + HV_DIST;
                if( t0 > t ) t0 = t;
@ -133,6 +134,7 @@ distanceTransform_3x3( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
                if( t0 > t ) t0 = t;
                tmp[j] = t0;
            }
+            t0 = (t0 > DIST_MAX) ? DIST_MAX : t0;
            d[j] = (float)(t0 * scale);
        }
    }
@ -144,9 +146,9 @@ distanceTransform_5x5( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
 {
    const int BORDER = 2;
    int i, j;
-    const int HV_DIST = CV_FLT_TO_FIX( metrics[0], DIST_SHIFT );
-    const int DIAG_DIST = CV_FLT_TO_FIX( metrics[1], DIST_SHIFT );
-    const int LONG_DIST = CV_FLT_TO_FIX( metrics[2], DIST_SHIFT );
+    const unsigned int HV_DIST = CV_FLT_TO_FIX( metrics[0], DIST_SHIFT );
+    const unsigned int DIAG_DIST = CV_FLT_TO_FIX( metrics[1], DIST_SHIFT );
+    const unsigned int LONG_DIST = CV_FLT_TO_FIX( metrics[2], DIST_SHIFT );
    const float scale = 1.f/(1 << DIST_SHIFT);

    const uchar* src = _src.ptr();
@ -163,7 +165,7 @@ distanceTransform_5x5( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
    for( i = 0; i < size.height; i++ )
    {
        const uchar* s = src + i*srcstep;
-        int* tmp = (int*)(temp + (i+BORDER)*step) + BORDER;
+        unsigned int* tmp = (unsigned int*)(temp + (i+BORDER)*step) + BORDER;

        for( j = 0; j < BORDER; j++ )
            tmp[-j-1] = tmp[size.width + j] = INIT_DIST0;
@ -174,8 +176,8 @@ distanceTransform_5x5( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
                tmp[j] = 0;
            else
            {
-                int t0 = tmp[j-step*2-1] + LONG_DIST;
-                int t = tmp[j-step*2+1] + LONG_DIST;
+                unsigned int t0 = tmp[j-step*2-1] + LONG_DIST;
+                unsigned int t = tmp[j-step*2+1] + LONG_DIST;
                if( t0 > t ) t0 = t;
                t = tmp[j-step-2] + LONG_DIST;
                if( t0 > t ) t0 = t;
@ -198,14 +200,14 @@ distanceTransform_5x5( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
    for( i = size.height - 1; i >= 0; i-- )
    {
        float* d = (float*)(dist + i*dststep);
-        int* tmp = (int*)(temp + (i+BORDER)*step) + BORDER;
+        unsigned int* tmp = (unsigned int*)(temp + (i+BORDER)*step) + BORDER;

        for( j = size.width - 1; j >= 0; j-- )
        {
-            int t0 = tmp[j];
+            unsigned int t0 = tmp[j];
            if( t0 > HV_DIST )
            {
-                int t = tmp[j+step*2+1] + LONG_DIST;
+                unsigned int t = tmp[j+step*2+1] + LONG_DIST;
                if( t0 > t ) t0 = t;
                t = tmp[j+step*2-1] + LONG_DIST;
                if( t0 > t ) t0 = t;
@ -223,6 +225,7 @@ distanceTransform_5x5( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
                if( t0 > t ) t0 = t;
                tmp[j] = t0;
            }
+            t0 = (t0 > DIST_MAX) ? DIST_MAX : t0;
            d[j] = (float)(t0 * scale);
        }
    }
@ -235,9 +238,9 @@ distanceTransformEx_5x5( const Mat& _src, Mat& _temp, Mat& _dist, Mat& _labels,
    const int BORDER = 2;

    int i, j;
-    const int HV_DIST = CV_FLT_TO_FIX( metrics[0], DIST_SHIFT );
-    const int DIAG_DIST = CV_FLT_TO_FIX( metrics[1], DIST_SHIFT );
-    const int LONG_DIST = CV_FLT_TO_FIX( metrics[2], DIST_SHIFT );
+    const unsigned int HV_DIST = CV_FLT_TO_FIX( metrics[0], DIST_SHIFT );
+    const unsigned int DIAG_DIST = CV_FLT_TO_FIX( metrics[1], DIST_SHIFT );
+    const unsigned int LONG_DIST = CV_FLT_TO_FIX( metrics[2], DIST_SHIFT );
    const float scale = 1.f/(1 << DIST_SHIFT);

    const uchar* src = _src.ptr();
@ -247,7 +250,7 @@ distanceTransformEx_5x5( const Mat& _src, Mat& _temp, Mat& _dist, Mat& _labels,
    int srcstep = (int)(_src.step/sizeof(src[0]));
    int step = (int)(_temp.step/sizeof(temp[0]));
    int dststep = (int)(_dist.step/sizeof(dist[0]));
-    int lstep = (int)(_labels.step/sizeof(dist[0]));
+    int lstep = (int)(_labels.step/sizeof(labels[0]));
    Size size = _src.size();

    initTopBottom( _temp, BORDER );
@ -256,7 +259,7 @@ distanceTransformEx_5x5( const Mat& _src, Mat& _temp, Mat& _dist, Mat& _labels,
    for( i = 0; i < size.height; i++ )
    {
        const uchar* s = src + i*srcstep;
-        int* tmp = (int*)(temp + (i+BORDER)*step) + BORDER;
+        unsigned int* tmp = (unsigned int*)(temp + (i+BORDER)*step) + BORDER;
        int* lls = (int*)(labels + i*lstep);

        for( j = 0; j < BORDER; j++ )
@ -271,7 +274,7 @@ distanceTransformEx_5x5( const Mat& _src, Mat& _temp, Mat& _dist, Mat& _labels,
            }
            else
            {
-                int t0 = INIT_DIST0, t;
+                unsigned int t0 = INIT_DIST0, t;
                int l0 = 0;

                t = tmp[j-step*2-1] + LONG_DIST;
@ -333,16 +336,16 @@ distanceTransformEx_5x5( const Mat& _src, Mat& _temp, Mat& _dist, Mat& _labels,
    for( i = size.height - 1; i >= 0; i-- )
    {
        float* d = (float*)(dist + i*dststep);
-        int* tmp = (int*)(temp + (i+BORDER)*step) + BORDER;
+        unsigned int* tmp = (unsigned int*)(temp + (i+BORDER)*step) + BORDER;
        int* lls = (int*)(labels + i*lstep);

        for( j = size.width - 1; j >= 0; j-- )
        {
-            int t0 = tmp[j];
+            unsigned int t0 = tmp[j];
            int l0 = lls[j];
            if( t0 > HV_DIST )
            {
-                int t = tmp[j+step*2+1] + LONG_DIST;
+                unsigned int t = tmp[j+step*2+1] + LONG_DIST;
                if( t0 > t )
                {
                    t0 = t;
@ -393,6 +396,7 @@ distanceTransformEx_5x5( const Mat& _src, Mat& _temp, Mat& _dist, Mat& _labels,
                tmp[j] = t0;
                lls[j] = l0;
            }
+            t0 = (t0 > DIST_MAX) ? DIST_MAX : t0;
            d[j] = (float)(t0 * scale);
        }
    }
--- a/modules/imgproc/src/resize.cpp
+++ b/modules/imgproc/src/resize.cpp
@ -340,51 +340,199 @@ static void hlineResizeCn(ET* src, int cn, int *ofst, FT* m, FT* dst, int dst_mi
    hline<ET, FT, n, mulall, cncnt>::ResizeCn(src, cn, ofst, m, dst, dst_min, dst_max, dst_width);
 };

+#if CV_SIMD512
+inline void v_load_indexed1(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1)
+{
+    v_expand(v_reinterpret_as_u8(v_uint16(
+                 *((uint16_t*)(src + ofst[ 0])), *((uint16_t*)(src + ofst[ 1])), *((uint16_t*)(src + ofst[ 2])), *((uint16_t*)(src + ofst[ 3])),
+                 *((uint16_t*)(src + ofst[ 4])), *((uint16_t*)(src + ofst[ 5])), *((uint16_t*)(src + ofst[ 6])), *((uint16_t*)(src + ofst[ 7])),
+                 *((uint16_t*)(src + ofst[ 8])), *((uint16_t*)(src + ofst[ 9])), *((uint16_t*)(src + ofst[10])), *((uint16_t*)(src + ofst[11])),
+                 *((uint16_t*)(src + ofst[12])), *((uint16_t*)(src + ofst[13])), *((uint16_t*)(src + ofst[14])), *((uint16_t*)(src + ofst[15])),
+                 *((uint16_t*)(src + ofst[16])), *((uint16_t*)(src + ofst[17])), *((uint16_t*)(src + ofst[14])), *((uint16_t*)(src + ofst[15])),
+                 *((uint16_t*)(src + ofst[20])), *((uint16_t*)(src + ofst[21])), *((uint16_t*)(src + ofst[14])), *((uint16_t*)(src + ofst[15])),
+                 *((uint16_t*)(src + ofst[24])), *((uint16_t*)(src + ofst[25])), *((uint16_t*)(src + ofst[14])), *((uint16_t*)(src + ofst[15])),
+                 *((uint16_t*)(src + ofst[28])), *((uint16_t*)(src + ofst[29])), *((uint16_t*)(src + ofst[14])), *((uint16_t*)(src + ofst[15])))),
+             v_src0, v_src1);
+}
+inline void v_load_indexed2(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1)
+{
+    v_expand(v_reinterpret_as_u8(v_uint32(
+                 *((uint32_t*)(src + 2 * ofst[ 0])), *((uint32_t*)(src + 2 * ofst[ 1])), *((uint32_t*)(src + 2 * ofst[ 2])), *((uint32_t*)(src + 2 * ofst[ 3])),
+                 *((uint32_t*)(src + 2 * ofst[ 4])), *((uint32_t*)(src + 2 * ofst[ 5])), *((uint32_t*)(src + 2 * ofst[ 6])), *((uint32_t*)(src + 2 * ofst[ 7])),
+                 *((uint32_t*)(src + 2 * ofst[ 8])), *((uint32_t*)(src + 2 * ofst[ 9])), *((uint32_t*)(src + 2 * ofst[10])), *((uint32_t*)(src + 2 * ofst[11])),
+                 *((uint32_t*)(src + 2 * ofst[12])), *((uint32_t*)(src + 2 * ofst[13])), *((uint32_t*)(src + 2 * ofst[14])), *((uint32_t*)(src + 2 * ofst[15])))),
+             v_src0, v_src1);
+    v_uint32 v_tmp0, v_tmp1, v_tmp2, v_tmp3;
+    v_zip(v_reinterpret_as_u32(v_src0), v_reinterpret_as_u32(v_src1), v_tmp2, v_tmp3);
+    v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1);
+    v_zip(v_tmp0, v_tmp1, v_tmp2, v_tmp3);
+    v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1);
+    v_zip(v_reinterpret_as_u16(v_tmp0), v_reinterpret_as_u16(v_tmp1), v_src0, v_src1);
+}
+inline void v_load_indexed4(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1)
+{
+    v_expand(v_reinterpret_as_u8(v_uint64(
+                 *((uint64_t*)(src + 4 * ofst[0])), *((uint64_t*)(src + 4 * ofst[1])), *((uint64_t*)(src + 4 * ofst[2])), *((uint64_t*)(src + 4 * ofst[3])),
+                 *((uint64_t*)(src + 4 * ofst[4])), *((uint64_t*)(src + 4 * ofst[5])), *((uint64_t*)(src + 4 * ofst[6])), *((uint64_t*)(src + 4 * ofst[7])))),
+             v_src0, v_src1);
+    v_uint64 v_tmp0, v_tmp1, v_tmp2, v_tmp3;
+    v_zip(v_reinterpret_as_u64(v_src0), v_reinterpret_as_u64(v_src1), v_tmp2, v_tmp3);
+    v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1);
+    v_zip(v_tmp0, v_tmp1, v_tmp2, v_tmp3);
+    v_zip(v_reinterpret_as_u16(v_tmp2), v_reinterpret_as_u16(v_tmp3), v_src0, v_src1);
+}
+inline void v_load_indexed_deinterleave(uint16_t* src, int *ofst, v_uint32 &v_src0, v_uint32 &v_src1)
+{
+    v_expand(v_reinterpret_as_u16(v_uint32(
+                 *((uint32_t*)(src + ofst[ 0])), *((uint32_t*)(src + ofst[ 1])), *((uint32_t*)(src + ofst[ 2])), *((uint32_t*)(src + ofst[ 3])),
+                 *((uint32_t*)(src + ofst[ 4])), *((uint32_t*)(src + ofst[ 5])), *((uint32_t*)(src + ofst[ 6])), *((uint32_t*)(src + ofst[ 7])),
+                 *((uint32_t*)(src + ofst[ 8])), *((uint32_t*)(src + ofst[ 9])), *((uint32_t*)(src + ofst[10])), *((uint32_t*)(src + ofst[11])),
+                 *((uint32_t*)(src + ofst[12])), *((uint32_t*)(src + ofst[13])), *((uint32_t*)(src + ofst[14])), *((uint32_t*)(src + ofst[15])))),
+             v_src0, v_src1);
+    v_uint32 v_tmp0, v_tmp1;
+    v_zip(v_src0, v_src1, v_tmp0, v_tmp1);
+    v_zip(v_tmp0, v_tmp1, v_src0, v_src1);
+    v_zip(v_src0, v_src1, v_tmp0, v_tmp1);
+    v_zip(v_tmp0, v_tmp1, v_src0, v_src1);
+}
+#elif CV_SIMD256
+inline void v_load_indexed1(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1)
+{
+    v_expand(v_reinterpret_as_u8(v_uint16(
+                 *((uint16_t*)(src + ofst[ 0])), *((uint16_t*)(src + ofst[ 1])), *((uint16_t*)(src + ofst[ 2])), *((uint16_t*)(src + ofst[ 3])),
+                 *((uint16_t*)(src + ofst[ 4])), *((uint16_t*)(src + ofst[ 5])), *((uint16_t*)(src + ofst[ 6])), *((uint16_t*)(src + ofst[ 7])),
+                 *((uint16_t*)(src + ofst[ 8])), *((uint16_t*)(src + ofst[ 9])), *((uint16_t*)(src + ofst[10])), *((uint16_t*)(src + ofst[11])),
+                 *((uint16_t*)(src + ofst[12])), *((uint16_t*)(src + ofst[13])), *((uint16_t*)(src + ofst[14])), *((uint16_t*)(src + ofst[15])))),
+             v_src0, v_src1);
+}
+inline void v_load_indexed2(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1)
+{
+    v_expand(v_reinterpret_as_u8(v_uint32(
+                 *((uint32_t*)(src + 2 * ofst[0])), *((uint32_t*)(src + 2 * ofst[1])), *((uint32_t*)(src + 2 * ofst[2])), *((uint32_t*)(src + 2 * ofst[3])),
+                 *((uint32_t*)(src + 2 * ofst[4])), *((uint32_t*)(src + 2 * ofst[5])), *((uint32_t*)(src + 2 * ofst[6])), *((uint32_t*)(src + 2 * ofst[7])))),
+             v_src0, v_src1);
+    v_uint32 v_tmp0, v_tmp1, v_tmp2, v_tmp3;
+    v_zip(v_reinterpret_as_u32(v_src0), v_reinterpret_as_u32(v_src1), v_tmp2, v_tmp3);
+    v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1);
+    v_zip(v_tmp0, v_tmp1, v_tmp2, v_tmp3);
+    v_zip(v_reinterpret_as_u16(v_tmp2), v_reinterpret_as_u16(v_tmp3), v_src0, v_src1);
+}
+inline void v_load_indexed4(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1)
+{
+    v_expand(v_reinterpret_as_u8(v_uint64(
+                 *((uint64_t*)(src + 4 * ofst[0])), *((uint64_t*)(src + 4 * ofst[1])), *((uint64_t*)(src + 4 * ofst[2])), *((uint64_t*)(src + 4 * ofst[3])))),
+             v_src0, v_src1);
+    v_uint64 v_tmp0, v_tmp1, v_tmp2, v_tmp3;
+    v_zip(v_reinterpret_as_u64(v_src0), v_reinterpret_as_u64(v_src1), v_tmp2, v_tmp3);
+    v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1);
+    v_zip(v_reinterpret_as_u16(v_tmp0), v_reinterpret_as_u16(v_tmp1), v_src0, v_src1);
+}
+inline void v_load_indexed_deinterleave(uint16_t* src, int *ofst, v_uint32 &v_src0, v_uint32 &v_src1)
+{
+    v_uint32 v_tmp0, v_tmp1;
+    v_expand(v_reinterpret_as_u16(v_uint32(
+                 *((uint32_t*)(src + ofst[0])), *((uint32_t*)(src + ofst[1])), *((uint32_t*)(src + ofst[2])), *((uint32_t*)(src + ofst[3])),
+                 *((uint32_t*)(src + ofst[4])), *((uint32_t*)(src + ofst[5])), *((uint32_t*)(src + ofst[6])), *((uint32_t*)(src + ofst[7])))),
+             v_tmp0, v_tmp1);
+    v_zip(v_tmp0, v_tmp1, v_src0, v_src1);
+    v_zip(v_src0, v_src1, v_tmp0, v_tmp1);
+    v_zip(v_tmp0, v_tmp1, v_src0, v_src1);
+}
+#elif CV_SIMD128
+inline void v_load_indexed1(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1)
+{
+    uint16_t buf[8];
+    buf[0] = *((uint16_t*)(src + ofst[0]));
+    buf[1] = *((uint16_t*)(src + ofst[1]));
+    buf[2] = *((uint16_t*)(src + ofst[2]));
+    buf[3] = *((uint16_t*)(src + ofst[3]));
+    buf[4] = *((uint16_t*)(src + ofst[4]));
+    buf[5] = *((uint16_t*)(src + ofst[5]));
+    buf[6] = *((uint16_t*)(src + ofst[6]));
+    buf[7] = *((uint16_t*)(src + ofst[7]));
+    v_src0 = vx_load_expand((uint8_t*)buf);
+    v_src1 = vx_load_expand((uint8_t*)buf + 8);
+}
+inline void v_load_indexed2(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1)
+{
+    uint32_t buf[4];
+    buf[0] = *((uint32_t*)(src + 2 * ofst[0]));
+    buf[1] = *((uint32_t*)(src + 2 * ofst[1]));
+    buf[2] = *((uint32_t*)(src + 2 * ofst[2]));
+    buf[3] = *((uint32_t*)(src + 2 * ofst[3]));
+    v_uint32 v_tmp0, v_tmp1, v_tmp2, v_tmp3;
+    v_tmp0 = v_reinterpret_as_u32(vx_load_expand((uint8_t*)buf));
+    v_tmp1 = v_reinterpret_as_u32(vx_load_expand((uint8_t*)buf + 8));
+    v_zip(v_tmp0, v_tmp1, v_tmp2, v_tmp3);
+    v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1);
+    v_zip(v_reinterpret_as_u16(v_tmp0), v_reinterpret_as_u16(v_tmp1), v_src0, v_src1);
+}
+inline void v_load_indexed4(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1)
+{
+    v_uint16 v_tmp0, v_tmp1;
+    v_src0 = vx_load_expand(src + 4 * ofst[0]);
+    v_src1 = vx_load_expand(src + 4 * ofst[1]);
+    v_recombine(v_src0, v_src1, v_tmp0, v_tmp1);
+    v_zip(v_tmp0, v_tmp1, v_src0, v_src1);
+}
+inline void v_load_indexed_deinterleave(uint16_t* src, int *ofst, v_uint32 &v_src0, v_uint32 &v_src1)
+{
+    uint32_t buf[4];
+    buf[0] = *((uint32_t*)(src + ofst[0]));
+    buf[1] = *((uint32_t*)(src + ofst[1]));
+    buf[2] = *((uint32_t*)(src + ofst[2]));
+    buf[3] = *((uint32_t*)(src + ofst[3]));
+    v_src0 = vx_load_expand((uint16_t*)buf);
+    v_src1 = vx_load_expand((uint16_t*)buf + 4);
+    v_uint32 v_tmp0, v_tmp1;
+    v_zip(v_src0, v_src1, v_tmp0, v_tmp1);
+    v_zip(v_tmp0, v_tmp1, v_src0, v_src1);
+}
+#endif
 template <>
 void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 1>(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width)
 {
    int i = 0;
    ufixedpoint16 src_0(src[0]);
-    v_uint16x8 v_src_0 = v_setall_u16(*((uint16_t*)&src_0));
-    for (; i < dst_min - 7; i += 8, m += 16, dst += 8) // Points that fall left from src image so became equal to leftmost src point
+#if CV_SIMD
+    const int VECSZ = v_uint16::nlanes;
+    v_uint16 v_src_0 = vx_setall_u16(*((uint16_t*)&src_0));
+    for (; i <= dst_min - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point
    {
        v_store((uint16_t*)dst, v_src_0);
    }
+#endif
    for (; i < dst_min; i++, m += 2)
    {
        *(dst++) = src_0;
    }
-    for (; i < dst_max - 7 && ofst[i + 7] + 15 <= ofst[dst_width - 1]; i += 8, m += 16, dst += 8)
+#if CV_SIMD
+    for (; i <= dst_max - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ)
    {
-        v_uint32x4 v_src01 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + ofst[i    ])), v_reinterpret_as_u32(v_load_expand(src + ofst[i + 1])));
-        v_uint32x4 v_src23 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + ofst[i + 2])), v_reinterpret_as_u32(v_load_expand(src + ofst[i + 3])));
-        v_uint32x4 v_src45 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + ofst[i + 4])), v_reinterpret_as_u32(v_load_expand(src + ofst[i + 5])));
-        v_uint32x4 v_src67 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + ofst[i + 6])), v_reinterpret_as_u32(v_load_expand(src + ofst[i + 7])));
+        v_uint16 v_src0, v_src1;
+        v_load_indexed1(src, ofst + i, v_src0, v_src1);

-        v_uint32x4 v_zip02, v_zip13, v_zip46, v_zip57;
-        v_zip(v_src01, v_src23, v_zip02, v_zip13);
-        v_zip(v_src45, v_src67, v_zip46, v_zip57);
-
-        v_uint32x4 v_src0, v_src1;
-        v_zip(v_combine_low(v_zip02, v_zip46), v_combine_low(v_zip13, v_zip57), v_src0, v_src1);
-
-        v_int16x8 v_mul0 = v_load((int16_t*)m);
-        v_int16x8 v_mul1 = v_load((int16_t*)m + 8);
-        v_uint32x4 v_res0 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), v_mul0));
-        v_uint32x4 v_res1 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), v_mul1));
+        v_int16 v_mul0 = vx_load((int16_t*)m);
+        v_int16 v_mul1 = vx_load((int16_t*)m + VECSZ);
+        v_uint32 v_res0 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), v_mul0));
+        v_uint32 v_res1 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), v_mul1));
        v_store((uint16_t*)dst, v_pack(v_res0, v_res1));
    }
+#endif
    for (; i < dst_max; i += 1, m += 2)
    {
        uint8_t* px = src + ofst[i];
        *(dst++) = m[0] * px[0] + m[1] * px[1];
    }
    src_0 = (src + ofst[dst_width - 1])[0];
-    v_src_0 = v_setall_u16(*((uint16_t*)&src_0));
-    for (; i < dst_width - 7; i += 8, dst += 8) // Points that fall left from src image so became equal to leftmost src point
+#if CV_SIMD
+    v_src_0 = vx_setall_u16(*((uint16_t*)&src_0));
+    for (; i <= dst_width - VECSZ; i += VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point
    {
        v_store((uint16_t*)dst, v_src_0);
    }
+    vx_cleanup();
+#endif
    for (; i < dst_width; i++)
    {
        *(dst++) = src_0;
@ -394,87 +542,109 @@ template <>
 void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 2>(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width)
 {
    int i = 0;
-    ufixedpoint16 srccn[8] = { src[0], src[1], src[0], src[1], src[0], src[1], src[0], src[1] };
-    v_uint16x8 v_srccn = v_load((uint16_t*)srccn);
-    for (; i < dst_min - 3; i += 4, m += 8, dst += 8) // Points that fall left from src image so became equal to leftmost src point
+    union {
+        uint32_t d;
+        uint16_t w[2];
+    } srccn;
+    ((ufixedpoint16*)(srccn.w))[0] = src[0];
+    ((ufixedpoint16*)(srccn.w))[1] = src[1];
+#if CV_SIMD
+    const int VECSZ = v_uint16::nlanes;
+    v_uint16 v_srccn = v_reinterpret_as_u16(vx_setall_u32(srccn.d));
+    for (; i <= dst_min - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point
    {
        v_store((uint16_t*)dst, v_srccn);
    }
+#endif
    for (; i < dst_min; i++, m += 2)
    {
-        *(dst++) = srccn[0];
-        *(dst++) = srccn[1];
+        *(dst++) = ((ufixedpoint16*)(srccn.w))[0];
+        *(dst++) = ((ufixedpoint16*)(srccn.w))[1];
    }
-    for (; i < dst_max - 3 && ofst[i + 3] + 7 <= ofst[dst_width - 1]; i += 4, m += 8, dst += 8)
+#if CV_SIMD
+    for (; i <= dst_max - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += VECSZ)
    {
-        v_uint32x4 v_src0 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + 2 * ofst[i    ])), v_reinterpret_as_u32(v_load_expand(src + 2 * ofst[i + 1])));
-        v_uint32x4 v_src1 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + 2 * ofst[i + 2])), v_reinterpret_as_u32(v_load_expand(src + 2 * ofst[i + 3])));
+        v_uint16 v_src0, v_src1;
+        v_load_indexed2(src, ofst + i, v_src0, v_src1);

-        v_uint32x4 v_zip0, v_zip1;
-        v_zip(v_src0, v_src1, v_zip0, v_zip1);
-        v_zip(v_zip0, v_zip1, v_src0, v_src1);
-
-        v_int16x8 v_src0123, v_src4567;
-        v_zip(v_reinterpret_as_s16(v_src0), v_reinterpret_as_s16(v_src1), v_src0123, v_src4567);
-
-        v_uint32x4 v_mul = v_load((uint32_t*)m);//AaBbCcDd
+        v_uint32 v_mul = vx_load((uint32_t*)m);//AaBbCcDd
+        v_uint32 v_zip0, v_zip1;
        v_zip(v_mul, v_mul, v_zip0, v_zip1);//AaAaBbBb CcCcDdDd
-        v_uint32x4 v_res0 = v_reinterpret_as_u32(v_dotprod(v_src0123, v_reinterpret_as_s16(v_zip0)));
-        v_uint32x4 v_res1 = v_reinterpret_as_u32(v_dotprod(v_src4567, v_reinterpret_as_s16(v_zip1)));
+        v_uint32 v_res0 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), v_reinterpret_as_s16(v_zip0)));
+        v_uint32 v_res1 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), v_reinterpret_as_s16(v_zip1)));
        v_store((uint16_t*)dst, v_pack(v_res0, v_res1));//AB1AB2CD1CD2
    }
+#endif
    for (; i < dst_max; i += 1, m += 2)
    {
        uint8_t* px = src + 2 * ofst[i];
        *(dst++) = m[0] * px[0] + m[1] * px[2];
        *(dst++) = m[0] * px[1] + m[1] * px[3];
    }
-    srccn[0] = (src + 2 * ofst[dst_width - 1])[0]; srccn[1] = (src + 2 * ofst[dst_width - 1])[1]; srccn[2] = (src + 2 * ofst[dst_width - 1])[0]; srccn[3] = (src + 2 * ofst[dst_width - 1])[1];
-    srccn[4] = (src + 2 * ofst[dst_width - 1])[0]; srccn[5] = (src + 2 * ofst[dst_width - 1])[1]; srccn[6] = (src + 2 * ofst[dst_width - 1])[0]; srccn[7] = (src + 2 * ofst[dst_width - 1])[1];
-    v_srccn = v_load((uint16_t*)srccn);
-    for (; i < dst_width - 3; i += 4, dst += 8) // Points that fall left from src image so became equal to leftmost src point
+    ((ufixedpoint16*)(srccn.w))[0] = (src + 2 * ofst[dst_width - 1])[0]; ((ufixedpoint16*)(srccn.w))[1] = (src + 2 * ofst[dst_width - 1])[1];
+#if CV_SIMD
+    v_srccn = v_reinterpret_as_u16(vx_setall_u32(srccn.d));
+    for (; i <= dst_width - VECSZ/2; i += VECSZ/2, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point
    {
        v_store((uint16_t*)dst, v_srccn);
    }
+    vx_cleanup();
+#endif
    for (; i < dst_width; i++)
    {
-        *(dst++) = srccn[0];
-        *(dst++) = srccn[1];
+        *(dst++) = ((ufixedpoint16*)(srccn.w))[0];
+        *(dst++) = ((ufixedpoint16*)(srccn.w))[1];
    }
 }
 template <>
 void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 4>(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width)
 {
    int i = 0;
-    ufixedpoint16 srccn[8] = { src[0], src[1], src[2], src[3], src[0], src[1], src[2], src[3] };
-    v_uint16x8 v_srccn = v_load((uint16_t*)srccn);
-    for (; i < dst_min - 1; i += 2, m += 4, dst += 8) // Points that fall left from src image so became equal to leftmost src point
+    union {
+        uint64_t q;
+        uint16_t w[4];
+    } srccn;
+    ((ufixedpoint16*)(srccn.w))[0] = src[0];
+    ((ufixedpoint16*)(srccn.w))[1] = src[1];
+    ((ufixedpoint16*)(srccn.w))[2] = src[2];
+    ((ufixedpoint16*)(srccn.w))[3] = src[3];
+#if CV_SIMD
+    const int VECSZ = v_uint16::nlanes;
+    v_uint16 v_srccn = v_reinterpret_as_u16(vx_setall_u64(srccn.q));
+    for (; i <= dst_min - VECSZ/4; i += VECSZ/4, m += VECSZ/2, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point
    {
        v_store((uint16_t*)dst, v_srccn);
    }
+#endif
    if (i < dst_min) // Points that fall left from src image so became equal to leftmost src point
    {
-        *(dst++) = srccn[0];
-        *(dst++) = srccn[1];
-        *(dst++) = srccn[2];
-        *(dst++) = srccn[3];
+        *(dst++) = ((ufixedpoint16*)(srccn.w))[0];
+        *(dst++) = ((ufixedpoint16*)(srccn.w))[1];
+        *(dst++) = ((ufixedpoint16*)(srccn.w))[2];
+        *(dst++) = ((ufixedpoint16*)(srccn.w))[3];
        i++; m += 2;
    }
-    for (; i < dst_max - 1 && ofst[i + 1] + 3 <= ofst[dst_width - 1]; i += 2, m += 4, dst += 8)
+#if CV_SIMD
+    for (; i <= dst_max - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += 2*VECSZ)
    {
-        v_int16x8 v_src01 = v_reinterpret_as_s16(v_load_expand(src + 4 * ofst[i    ]));
-        v_int16x8 v_src23 = v_reinterpret_as_s16(v_load_expand(src + 4 * ofst[i + 1]));
+        v_uint16 v_src0, v_src1, v_src2, v_src3;
+        v_load_indexed4(src, ofst + i, v_src0, v_src1);
+        v_load_indexed4(src, ofst + i + VECSZ/4, v_src2, v_src3);

-        v_int16x8 v_tmp0, v_tmp1;
-        v_recombine(v_src01, v_src23, v_tmp0, v_tmp1);
-        v_zip(v_tmp0, v_tmp1, v_src01, v_src23);
+        v_uint32 v_mul0, v_mul1, v_mul2, v_mul3, v_tmp;
+        v_mul0 = vx_load((uint32_t*)m);//AaBbCcDd
+        v_zip(v_mul0, v_mul0, v_mul3, v_tmp );//AaAaBbBb CcCcDdDd
+        v_zip(v_mul3, v_mul3, v_mul0, v_mul1);//AaAaAaAa BbBbBbBb
+        v_zip(v_tmp , v_tmp , v_mul2, v_mul3);//CcCcCcCc DdDdDdDd

-        v_int16x8 v_mul01 = v_reinterpret_as_s16(v_setall_u32(((uint32_t*)m)[0]));//AaAaAaAa
-        v_int16x8 v_mul23 = v_reinterpret_as_s16(v_setall_u32(((uint32_t*)m)[1]));//BbBbBbBb
-        v_uint32x4 v_res0 = v_reinterpret_as_u32(v_dotprod(v_src01, v_mul01));
-        v_uint32x4 v_res1 = v_reinterpret_as_u32(v_dotprod(v_src23, v_mul23));
-        v_store((uint16_t*)dst, v_pack(v_res0, v_res1));//AB1AB2CD1CD2
+        v_uint32 v_res0 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), v_reinterpret_as_s16(v_mul0)));
+        v_uint32 v_res1 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), v_reinterpret_as_s16(v_mul1)));
+        v_uint32 v_res2 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src2), v_reinterpret_as_s16(v_mul2)));
+        v_uint32 v_res3 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src3), v_reinterpret_as_s16(v_mul3)));
+        v_store((uint16_t*)dst        , v_pack(v_res0, v_res1));
+        v_store((uint16_t*)dst + VECSZ, v_pack(v_res2, v_res3));
    }
+#endif
    for (; i < dst_max; i += 1, m += 2)
    {
        uint8_t* px = src + 4 * ofst[i];
@ -483,19 +653,22 @@ void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 4>(uint8_t* src, int, int *o
        *(dst++) = m[0] * px[2] + m[1] * px[6];
        *(dst++) = m[0] * px[3] + m[1] * px[7];
    }
-    srccn[0] = (src + 4 * ofst[dst_width - 1])[0]; srccn[1] = (src + 4 * ofst[dst_width - 1])[1]; srccn[2] = (src + 4 * ofst[dst_width - 1])[2]; srccn[3] = (src + 4 * ofst[dst_width - 1])[3];
-    srccn[4] = (src + 4 * ofst[dst_width - 1])[0]; srccn[5] = (src + 4 * ofst[dst_width - 1])[1]; srccn[6] = (src + 4 * ofst[dst_width - 1])[2]; srccn[7] = (src + 4 * ofst[dst_width - 1])[3];
-    v_srccn = v_load((uint16_t*)srccn);
-    for (; i < dst_width - 1; i += 2, dst += 8) // Points that fall right from src image so became equal to rightmost src point
+    ((ufixedpoint16*)(srccn.w))[0] = (src + 4 * ofst[dst_width - 1])[0]; ((ufixedpoint16*)(srccn.w))[1] = (src + 4 * ofst[dst_width - 1])[1];
+    ((ufixedpoint16*)(srccn.w))[2] = (src + 4 * ofst[dst_width - 1])[2]; ((ufixedpoint16*)(srccn.w))[3] = (src + 4 * ofst[dst_width - 1])[3];
+#if CV_SIMD
+    v_srccn = v_reinterpret_as_u16(vx_setall_u64(srccn.q));
+    for (; i <= dst_width - VECSZ/4; i += VECSZ/4, dst += VECSZ) // Points that fall right from src image so became equal to rightmost src point
    {
        v_store((uint16_t*)dst, v_srccn);
    }
+    vx_cleanup();
+#endif
    if (i < dst_width)
    {
-        *(dst++) = srccn[0];
-        *(dst++) = srccn[1];
-        *(dst++) = srccn[2];
-        *(dst++) = srccn[3];
+        *(dst++) = ((ufixedpoint16*)(srccn.w))[0];
+        *(dst++) = ((ufixedpoint16*)(srccn.w))[1];
+        *(dst++) = ((ufixedpoint16*)(srccn.w))[2];
+        *(dst++) = ((ufixedpoint16*)(srccn.w))[3];
    }
 }
 template <>
@ -503,40 +676,42 @@ void hlineResizeCn<uint16_t, ufixedpoint32, 2, true, 1>(uint16_t* src, int, int
 {
    int i = 0;
    ufixedpoint32 src_0(src[0]);
-    v_uint32x4 v_src_0 = v_setall_u32(*((uint32_t*)&src_0));
-    for (; i < dst_min - 3; i += 4, m += 8, dst += 4) // Points that fall left from src image so became equal to leftmost src point
+#if CV_SIMD
+    const int VECSZ = v_uint32::nlanes;
+    v_uint32 v_src_0 = vx_setall_u32(*((uint32_t*)&src_0));
+    for (; i <= dst_min - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point
    {
        v_store((uint32_t*)dst, v_src_0);
    }
+#endif
    for (; i < dst_min; i++, m += 2)
    {
        *(dst++) = src_0;
    }
-    for (; i < dst_max - 3 && ofst[i + 3] + 8 <= ofst[dst_width - 1]; i += 4, m += 8, dst += 4)
+#if CV_SIMD
+    for (; i <= dst_max - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ)
    {
-        v_uint32x4 v_src0 = v_combine_low(v_load_expand(src + ofst[i]), v_load_expand(src + ofst[i + 1]));
-        v_uint32x4 v_mul0 = v_load((uint32_t*)m);
-        v_uint32x4 v_src1 = v_combine_low(v_load_expand(src + ofst[i + 2]), v_load_expand(src + ofst[i + 3]));
-        v_uint32x4 v_mul1 = v_load((uint32_t*)m + 4);
-        v_uint32x4 v_res0 = v_src0 * v_mul0;//a1a2b1b2
-        v_uint32x4 v_res1 = v_src1 * v_mul1;//c1c2d1d2
-        v_uint32x4 v_tmp0, v_tmp1;
-        v_recombine(v_res0, v_res1, v_tmp0, v_tmp1);//a1a2c1c2 b1b2d1d2
-        v_zip(v_tmp0, v_tmp1, v_res0, v_res1);//a1b1a2b2 c1d1c2d2
-        v_recombine(v_res0, v_res1, v_tmp0, v_tmp1);//a1b1c1d1 a2b2c2d2
-        v_store((uint32_t*)dst, v_tmp0 + v_tmp1);//abcd
+        v_uint32 v_src0, v_src1;
+        v_load_indexed_deinterleave(src, ofst + i, v_src0, v_src1);
+        v_uint32 v_mul0, v_mul1;
+        v_load_deinterleave((uint32_t*)m, v_mul0, v_mul1);
+        v_store((uint32_t*)dst, v_src0 * v_mul0 + v_src1 * v_mul1);//abcd
    }
+#endif
    for (; i < dst_max; i += 1, m += 2)
    {
        uint16_t* px = src + ofst[i];
        *(dst++) = m[0] * px[0] + m[1] * px[1];
    }
    src_0 = (src + ofst[dst_width - 1])[0];
-    v_src_0 = v_setall_u32(*((uint32_t*)&src_0));
-    for (; i < dst_width - 3; i += 4, dst += 4)
+#if CV_SIMD
+    v_src_0 = vx_setall_u32(*((uint32_t*)&src_0));
+    for (; i <= dst_width - VECSZ; i += VECSZ, dst += VECSZ)
    {
        v_store((uint32_t*)dst, v_src_0);
    }
+    vx_cleanup();
+#endif
    for (; i < dst_width; i++)
    {
        *(dst++) = src_0;
@ -552,18 +727,22 @@ void vlineSet(FT* src, ET* dst, int dst_width)
 template <>
 void vlineSet<uint8_t, ufixedpoint16>(ufixedpoint16* src, uint8_t* dst, int dst_width)
 {
-    static const v_uint16x8 v_fixedRound = v_setall_u16((uint16_t)((1U << 8) >> 1));
    int i = 0;
-    for (; i < dst_width - 15; i += 16, src += 16, dst += 16)
+#if CV_SIMD
+    const int VECSZ = v_uint8::nlanes;
+    static const v_uint16 v_fixedRound = vx_setall_u16((uint16_t)((1U << 8) >> 1));
+    for (; i <= dst_width - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
    {
-        v_uint16x8 v_src0 = v_load((uint16_t*)src);
-        v_uint16x8 v_src1 = v_load((uint16_t*)src + 8);
+        v_uint16 v_src0 = vx_load((uint16_t*)src);
+        v_uint16 v_src1 = vx_load((uint16_t*)src + VECSZ/2);

-        v_uint16x8 v_res0 = (v_src0 + v_fixedRound) >> 8;
-        v_uint16x8 v_res1 = (v_src1 + v_fixedRound) >> 8;
+        v_uint16 v_res0 = (v_src0 + v_fixedRound) >> 8;
+        v_uint16 v_res1 = (v_src1 + v_fixedRound) >> 8;

        v_store(dst, v_pack(v_res0, v_res1));
    }
+    vx_cleanup();
+#endif
    for (; i < dst_width; i++)
        *(dst++) = *(src++);
 }
@ -582,36 +761,40 @@ void vlineResize(FT* src, size_t src_step, FT* m, ET* dst, int dst_width)
 template <>
 void vlineResize<uint8_t, ufixedpoint16, 2>(ufixedpoint16* src, size_t src_step, ufixedpoint16* m, uint8_t* dst, int dst_width)
 {
-    static const v_int32x4 v_fixedRound = v_setall_s32((int32_t)((1 << 16) >> 1));
-    static const v_int16x8 v_128    = v_reinterpret_as_s16(v_setall_u16((uint16_t)1<<15));
-    static const v_int8x16 v_128_16 = v_reinterpret_as_s8 (v_setall_u8 ((uint8_t) 1<<7));
-
    int i = 0;
    ufixedpoint16* src1 = src + src_step;
-    v_int16x8 v_mul = v_reinterpret_as_s16(v_setall_u32(((uint32_t*)m)[0]));
-    for (; i < dst_width - 15; i += 16, src += 16, src1 += 16, dst += 16)
+#if CV_SIMD
+    const int VECSZ = v_uint8::nlanes;
+    static const v_int32 v_fixedRound = vx_setall_s32((int32_t)((1 << 16) >> 1));
+    static const v_int16 v_128    = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1<<15));
+    static const v_int8  v_128_16 = v_reinterpret_as_s8 (vx_setall_u8 ((uint8_t) 1<<7));
+
+    v_int16 v_mul = v_reinterpret_as_s16(vx_setall_u32(((uint32_t*)m)[0]));
+    for (; i <= dst_width - VECSZ; i += VECSZ, src += VECSZ, src1 += VECSZ, dst += VECSZ)
    {
-        v_int16x8 v_src00 = v_load((int16_t*)src);
-        v_int16x8 v_src10 = v_load((int16_t*)src1);
-        v_int16x8 v_tmp0, v_tmp1;
+        v_int16 v_src00 = vx_load((int16_t*)src);
+        v_int16 v_src10 = vx_load((int16_t*)src1);
+        v_int16 v_tmp0, v_tmp1;
        v_zip(v_add_wrap(v_src00,v_128), v_add_wrap(v_src10,v_128), v_tmp0, v_tmp1);

-        v_int32x4 v_res0 = v_dotprod(v_tmp0, v_mul);
-        v_int32x4 v_res1 = v_dotprod(v_tmp1, v_mul);
+        v_int32 v_res0 = v_dotprod(v_tmp0, v_mul);
+        v_int32 v_res1 = v_dotprod(v_tmp1, v_mul);

-        v_int16x8 v_src01 = v_load((int16_t*)src + 8);
-        v_int16x8 v_src11 = v_load((int16_t*)src1 + 8);
+        v_int16 v_src01 = vx_load((int16_t*)src + VECSZ/2);
+        v_int16 v_src11 = vx_load((int16_t*)src1 + VECSZ/2);
        v_zip(v_add_wrap(v_src01,v_128), v_add_wrap(v_src11,v_128), v_tmp0, v_tmp1);
-        v_int32x4 v_res2 = v_dotprod(v_tmp0, v_mul);
-        v_int32x4 v_res3 = v_dotprod(v_tmp1, v_mul);
+        v_int32 v_res2 = v_dotprod(v_tmp0, v_mul);
+        v_int32 v_res3 = v_dotprod(v_tmp1, v_mul);

-        v_int8x16 v_res = v_pack(v_pack((v_res0 + v_fixedRound) >> 16,
-                                        (v_res1 + v_fixedRound) >> 16),
-                                 v_pack((v_res2 + v_fixedRound) >> 16,
-                                        (v_res3 + v_fixedRound) >> 16));
+        v_int8 v_res = v_pack(v_pack((v_res0 + v_fixedRound) >> 16,
+                                     (v_res1 + v_fixedRound) >> 16),
+                              v_pack((v_res2 + v_fixedRound) >> 16,
+                                     (v_res3 + v_fixedRound) >> 16));

        v_store(dst, v_reinterpret_as_u8(v_sub_wrap(v_res, v_128_16)));
    }
+    vx_cleanup();
+#endif
    for (; i < dst_width; i++)
    {
        *(dst++) = (uint8_t)(*(src++) * m[0] + *(src1++) * m[1]);
--- a/modules/imgproc/src/segmentation.cpp
+++ b/modules/imgproc/src/segmentation.cpp
@ -407,27 +407,25 @@ void cv::pyrMeanShiftFiltering( InputArray _src, OutputArray _dst,
        cv::Size size = src.size();
        const uchar* sptr = src.ptr();
        int sstep = (int)src.step;
-        uchar* mask = 0;
-        int mstep = 0;
        uchar* dptr;
        int dstep;
        float sp = (float)(sp0 / (1 << level));
        sp = MAX( sp, 1 );

+        cv::Mat m;
        if( level < max_level )
        {
            cv::Size size1 = dst_pyramid[level+1].size();
-            cv::Mat m( size.height, size.width, CV_8UC1, mask0.ptr() );
+            m = cv::Mat(size.height, size.width, CV_8UC1, mask0.ptr());
            dstep = (int)dst_pyramid[level+1].step;
            dptr = dst_pyramid[level+1].ptr() + dstep + cn;
-            mstep = (int)m.step;
-            mask = m.ptr() + mstep;
            //cvResize( dst_pyramid[level+1], dst_pyramid[level], CV_INTER_CUBIC );
            cv::pyrUp( dst_pyramid[level+1], dst_pyramid[level], dst_pyramid[level].size() );
            m.setTo(cv::Scalar::all(0));

-            for( i = 1; i < size1.height-1; i++, dptr += dstep - (size1.width-2)*3, mask += mstep*2 )
+            for( i = 1; i < size1.height-1; i++, dptr += dstep - (size1.width-2)*3)
            {
+                uchar* mask = m.ptr(1 + i * 2);
                for( j = 1; j < size1.width-1; j++, dptr += cn )
                {
                    int c0 = dptr[0], c1 = dptr[1], c2 = dptr[2];
@ -437,16 +435,16 @@ void cv::pyrMeanShiftFiltering( InputArray _src, OutputArray _dst,
            }

            cv::dilate( m, m, cv::Mat() );
-            mask = m.ptr();
        }

        dptr = dst_pyramid[level].ptr();
        dstep = (int)dst_pyramid[level].step;

        for( i = 0; i < size.height; i++, sptr += sstep - size.width*3,
-                                          dptr += dstep - size.width*3,
-                                          mask += mstep )
+                                          dptr += dstep - size.width*3
+        )
        {
+            uchar* mask = m.empty() ? NULL : m.ptr(i);
            for( j = 0; j < size.width; j++, sptr += 3, dptr += 3 )
            {
                int x0 = j, y0 = i, x1, y1, iter;
--- a/modules/imgproc/src/smooth.cpp
+++ b/modules/imgproc/src/smooth.cpp
@ -1820,22 +1820,13 @@ template <>
 void hlineSmooth1N<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufixedpoint16* m, int, ufixedpoint16* dst, int len, int)
 {
    int lencn = len*cn;
-    v_uint16x8 v_mul = v_setall_u16(*((uint16_t*)m));
    int i = 0;
-    for (; i <= lencn - 16; i += 16)
-    {
-        v_uint8x16 v_src = v_load(src + i);
-        v_uint16x8 v_tmp0, v_tmp1;
-        v_expand(v_src, v_tmp0, v_tmp1);
-        v_store((uint16_t*)dst + i, v_mul*v_tmp0);
-        v_store((uint16_t*)dst + i + 8, v_mul*v_tmp1);
-    }
-    if (i <= lencn - 8)
-    {
-        v_uint16x8 v_src = v_load_expand(src + i);
-        v_store((uint16_t*)dst + i, v_mul*v_src);
-        i += 8;
-    }
+#if CV_SIMD
+    const int VECSZ = v_uint16::nlanes;
+    v_uint16 v_mul = vx_setall_u16(*((uint16_t*)m));
+    for (; i <= lencn - VECSZ; i += VECSZ)
+        v_store((uint16_t*)dst + i, v_mul*vx_load_expand(src + i));
+#endif
    for (; i < lencn; i++)
        dst[i] = m[0] * src[i];
 }
@ -1850,20 +1841,11 @@ void hlineSmooth1N1<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const uf
 {
    int lencn = len*cn;
    int i = 0;
-    for (; i <= lencn - 16; i += 16)
-    {
-        v_uint8x16 v_src = v_load(src + i);
-        v_uint16x8 v_tmp0, v_tmp1;
-        v_expand(v_src, v_tmp0, v_tmp1);
-        v_store((uint16_t*)dst + i, v_shl<8>(v_tmp0));
-        v_store((uint16_t*)dst + i + 8, v_shl<8>(v_tmp1));
-    }
-    if (i <= lencn - 8)
-    {
-        v_uint16x8 v_src = v_load_expand(src + i);
-        v_store((uint16_t*)dst + i, v_shl<8>(v_src));
-        i += 8;
-    }
+#if CV_SIMD
+    const int VECSZ = v_uint16::nlanes;
+    for (; i <= lencn - VECSZ; i += VECSZ)
+        v_store((uint16_t*)dst + i, v_shl<8>(vx_load_expand(src + i)));
+#endif
    for (; i < lencn; i++)
        dst[i] = src[i];
 }
@ -1926,18 +1908,15 @@ void hlineSmooth3N<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufi

        src += cn; dst += cn;
        int i = cn, lencn = (len - 1)*cn;
-        v_uint16x8 v_mul0 = v_setall_u16(*((uint16_t*)m));
-        v_uint16x8 v_mul1 = v_setall_u16(*((uint16_t*)(m + 1)));
-        v_uint16x8 v_mul2 = v_setall_u16(*((uint16_t*)(m + 2)));
-        for (; i <= lencn - 16; i += 16, src += 16, dst += 16)
-        {
-            v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
-            v_expand(v_load(src - cn), v_src00, v_src01);
-            v_expand(v_load(src), v_src10, v_src11);
-            v_expand(v_load(src + cn), v_src20, v_src21);
-            v_store((uint16_t*)dst, v_src00 * v_mul0 + v_src10 * v_mul1 + v_src20 * v_mul2);
-            v_store((uint16_t*)dst + 8, v_src01 * v_mul0 + v_src11 * v_mul1 + v_src21 * v_mul2);
-        }
+#if CV_SIMD
+        const uint16_t* _m = (const uint16_t*)m;
+        const int VECSZ = v_uint16::nlanes;
+        v_uint16 v_mul0 = vx_setall_u16(_m[0]);
+        v_uint16 v_mul1 = vx_setall_u16(_m[1]);
+        v_uint16 v_mul2 = vx_setall_u16(_m[2]);
+        for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
+            v_store((uint16_t*)dst, vx_load_expand(src - cn) * v_mul0 + vx_load_expand(src) * v_mul1 + vx_load_expand(src + cn) * v_mul2);
+#endif
        for (; i < lencn; i++, src++, dst++)
            *dst = m[0] * src[-cn] + m[1] * src[0] + m[2] * src[cn];

@ -2017,15 +1996,11 @@ void hlineSmooth3N121<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const

        src += cn; dst += cn;
        int i = cn, lencn = (len - 1)*cn;
-        for (; i <= lencn - 16; i += 16, src += 16, dst += 16)
-        {
-            v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
-            v_expand(v_load(src - cn), v_src00, v_src01);
-            v_expand(v_load(src), v_src10, v_src11);
-            v_expand(v_load(src + cn), v_src20, v_src21);
-            v_store((uint16_t*)dst, (v_src00 + v_src20 + (v_src10 << 1)) << 6);
-            v_store((uint16_t*)dst + 8, (v_src01 + v_src21 + (v_src11 << 1)) << 6);
-        }
+#if CV_SIMD
+        const int VECSZ = v_uint16::nlanes;
+        for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
+            v_store((uint16_t*)dst, (vx_load_expand(src - cn) + vx_load_expand(src + cn) + (vx_load_expand(src) << 1)) << 6);
+#endif
        for (; i < lencn; i++, src++, dst++)
            *((uint16_t*)dst) = (uint16_t(src[-cn]) + uint16_t(src[cn]) + (uint16_t(src[0]) << 1)) << 6;

@ -2108,17 +2083,14 @@ void hlineSmooth3Naba<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const

        src += cn; dst += cn;
        int i = cn, lencn = (len - 1)*cn;
-        v_uint16x8 v_mul0 = v_setall_u16(*((uint16_t*)m));
-        v_uint16x8 v_mul1 = v_setall_u16(*((uint16_t*)m+1));
-        for (; i <= lencn - 16; i += 16, src += 16, dst += 16)
-        {
-            v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
-            v_expand(v_load(src - cn), v_src00, v_src01);
-            v_expand(v_load(src), v_src10, v_src11);
-            v_expand(v_load(src + cn), v_src20, v_src21);
-            v_store((uint16_t*)dst, (v_src00 + v_src20) * v_mul0 + v_src10 * v_mul1);
-            v_store((uint16_t*)dst + 8, (v_src01 + v_src21) * v_mul0 + v_src11 * v_mul1);
-        }
+#if CV_SIMD
+        const uint16_t* _m = (const uint16_t*)m;
+        const int VECSZ = v_uint16::nlanes;
+        v_uint16 v_mul0 = vx_setall_u16(_m[0]);
+        v_uint16 v_mul1 = vx_setall_u16(_m[1]);
+        for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
+            v_store((uint16_t*)dst, (vx_load_expand(src - cn) + vx_load_expand(src + cn)) * v_mul0 + vx_load_expand(src) * v_mul1);
+#endif
        for (; i < lencn; i++, src++, dst++)
            *((uint16_t*)dst) = ((uint16_t*)m)[1] * src[0] + ((uint16_t*)m)[0] * ((uint16_t)(src[-cn]) + (uint16_t)(src[cn]));

@ -2304,22 +2276,17 @@ void hlineSmooth5N<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufi

        src += 2 * cn; dst += 2 * cn;
        int i = 2*cn, lencn = (len - 2)*cn;
-        v_uint16x8 v_mul0 = v_setall_u16(*((uint16_t*)m));
-        v_uint16x8 v_mul1 = v_setall_u16(*((uint16_t*)(m + 1)));
-        v_uint16x8 v_mul2 = v_setall_u16(*((uint16_t*)(m + 2)));
-        v_uint16x8 v_mul3 = v_setall_u16(*((uint16_t*)(m + 3)));
-        v_uint16x8 v_mul4 = v_setall_u16(*((uint16_t*)(m + 4)));
-        for (; i <= lencn - 16; i += 16, src += 16, dst += 16)
-        {
-            v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21, v_src30, v_src31, v_src40, v_src41;
-            v_expand(v_load(src - 2*cn), v_src00, v_src01);
-            v_expand(v_load(src - cn), v_src10, v_src11);
-            v_expand(v_load(src), v_src20, v_src21);
-            v_expand(v_load(src + cn), v_src30, v_src31);
-            v_expand(v_load(src + 2*cn), v_src40, v_src41);
-            v_store((uint16_t*)dst, v_src00 * v_mul0 + v_src10 * v_mul1 + v_src20 * v_mul2 + v_src30 * v_mul3 + v_src40 * v_mul4);
-            v_store((uint16_t*)dst + 8, v_src01 * v_mul0 + v_src11 * v_mul1 + v_src21 * v_mul2 + v_src31 * v_mul3 + v_src41 * v_mul4);
-        }
+#if CV_SIMD
+        const uint16_t* _m = (const uint16_t*)m;
+        const int VECSZ = v_uint16::nlanes;
+        v_uint16 v_mul0 = vx_setall_u16(_m[0]);
+        v_uint16 v_mul1 = vx_setall_u16(_m[1]);
+        v_uint16 v_mul2 = vx_setall_u16(_m[2]);
+        v_uint16 v_mul3 = vx_setall_u16(_m[3]);
+        v_uint16 v_mul4 = vx_setall_u16(_m[4]);
+        for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
+            v_store((uint16_t*)dst, vx_load_expand(src - 2 * cn) * v_mul0 + vx_load_expand(src - cn) * v_mul1 + vx_load_expand(src) * v_mul2 + vx_load_expand(src + cn) * v_mul3 + vx_load_expand(src + 2 * cn) * v_mul4);
+#endif
        for (; i < lencn; i++, src++, dst++)
            *dst = m[0] * src[-2*cn] + m[1] * src[-cn] + m[2] * src[0] + m[3] * src[cn] + m[4] * src[2*cn];

@ -2517,18 +2484,12 @@ void hlineSmooth5N14641<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, cons

        src += 2 * cn; dst += 2 * cn;
        int i = 2 * cn, lencn = (len - 2)*cn;
-        v_uint16x8 v_6 = v_setall_u16(6);
-        for (; i <= lencn - 16; i += 16, src += 16, dst += 16)
-        {
-            v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21, v_src30, v_src31, v_src40, v_src41;
-            v_expand(v_load(src - 2*cn), v_src00, v_src01);
-            v_expand(v_load(src - cn), v_src10, v_src11);
-            v_expand(v_load(src), v_src20, v_src21);
-            v_expand(v_load(src + cn), v_src30, v_src31);
-            v_expand(v_load(src + 2*cn), v_src40, v_src41);
-            v_store((uint16_t*)dst, (v_src20 * v_6 + ((v_src10 + v_src30) << 2) + v_src00 + v_src40) << 4);
-            v_store((uint16_t*)dst + 8, (v_src21 * v_6 + ((v_src11 + v_src31) << 2) + v_src01 + v_src41) << 4);
-        }
+#if CV_SIMD
+        const int VECSZ = v_uint16::nlanes;
+        v_uint16 v_6 = vx_setall_u16(6);
+        for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
+            v_store((uint16_t*)dst, (vx_load_expand(src) * v_6 + ((vx_load_expand(src - cn) + vx_load_expand(src + cn)) << 2) + vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn)) << 4);
+#endif
        for (; i < lencn; i++, src++, dst++)
            *((uint16_t*)dst) = (uint16_t(src[0]) * 6 + ((uint16_t(src[-cn]) + uint16_t(src[cn])) << 2) + uint16_t(src[-2 * cn]) + uint16_t(src[2 * cn])) << 4;

@ -2721,20 +2682,15 @@ void hlineSmooth5Nabcba<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, cons

        src += 2 * cn; dst += 2 * cn;
        int i = 2 * cn, lencn = (len - 2)*cn;
-        v_uint16x8 v_mul0 = v_setall_u16(*((uint16_t*)m));
-        v_uint16x8 v_mul1 = v_setall_u16(*((uint16_t*)(m + 1)));
-        v_uint16x8 v_mul2 = v_setall_u16(*((uint16_t*)(m + 2)));
-        for (; i <= lencn - 16; i += 16, src += 16, dst += 16)
-        {
-            v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21, v_src30, v_src31, v_src40, v_src41;
-            v_expand(v_load(src - 2 * cn), v_src00, v_src01);
-            v_expand(v_load(src - cn), v_src10, v_src11);
-            v_expand(v_load(src), v_src20, v_src21);
-            v_expand(v_load(src + cn), v_src30, v_src31);
-            v_expand(v_load(src + 2 * cn), v_src40, v_src41);
-            v_store((uint16_t*)dst, (v_src00 + v_src40) * v_mul0 + (v_src10 + v_src30)* v_mul1 + v_src20 * v_mul2);
-            v_store((uint16_t*)dst + 8, (v_src01 + v_src41) * v_mul0 + (v_src11 + v_src31) * v_mul1 + v_src21 * v_mul2);
-        }
+#if CV_SIMD
+        const uint16_t* _m = (const uint16_t*)m;
+        const int VECSZ = v_uint16::nlanes;
+        v_uint16 v_mul0 = vx_setall_u16(_m[0]);
+        v_uint16 v_mul1 = vx_setall_u16(_m[1]);
+        v_uint16 v_mul2 = vx_setall_u16(_m[2]);
+        for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
+            v_store((uint16_t*)dst, (vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn)) * v_mul0 + (vx_load_expand(src - cn) + vx_load_expand(src + cn))* v_mul1 + vx_load_expand(src) * v_mul2);
+#endif
        for (; i < lencn; i++, src++, dst++)
            *((uint16_t*)dst) = ((uint16_t*)m)[0] * ((uint16_t)(src[-2 * cn]) + (uint16_t)(src[2 * cn])) + ((uint16_t*)m)[1] * ((uint16_t)(src[-cn]) + (uint16_t)(src[cn])) + ((uint16_t*)m)[2] * src[0];

@ -2844,23 +2800,16 @@ void hlineSmooth<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufixe
    }
    i *= cn;
    int lencn = (len - post_shift + 1)*cn;
-    for (; i <= lencn - 16; i+=16, src+=16, dst+=16)
+#if CV_SIMD
+    const int VECSZ = v_uint16::nlanes;
+    for (; i <= lencn - VECSZ; i+=VECSZ, src+=VECSZ, dst+=VECSZ)
    {
-        v_uint16x8 v_src0, v_src1;
-        v_uint16x8 v_mul = v_setall_u16(*((uint16_t*)m));
-        v_expand(v_load(src), v_src0, v_src1);
-        v_uint16x8 v_res0 = v_src0 * v_mul;
-        v_uint16x8 v_res1 = v_src1 * v_mul;
+        v_uint16 v_res0 = vx_load_expand(src) * vx_setall_u16(*((uint16_t*)m));
        for (int j = 1; j < n; j++)
-        {
-            v_mul = v_setall_u16(*((uint16_t*)(m + j)));
-            v_expand(v_load(src + j * cn), v_src0, v_src1);
-            v_res0 += v_src0 * v_mul;
-            v_res1 += v_src1 * v_mul;
-        }
+            v_res0 += vx_load_expand(src + j * cn) * vx_setall_u16(*((uint16_t*)(m + j)));
        v_store((uint16_t*)dst, v_res0);
-        v_store((uint16_t*)dst+8, v_res1);
    }
+#endif
    for (; i < lencn; i++, src++, dst++)
    {
            *dst = m[0] * src[0];
@ -2970,26 +2919,16 @@ void hlineSmoothONa_yzy_a<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, co
    }
    i *= cn;
    int lencn = (len - post_shift + 1)*cn;
-    for (; i <= lencn - 16; i += 16, src += 16, dst += 16)
+#if CV_SIMD
+    const int VECSZ = v_uint16::nlanes;
+    for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
    {
-        v_uint16x8 v_src00, v_src01, v_srcN00, v_srcN01;
-
-        v_uint16x8 v_mul = v_setall_u16(*((uint16_t*)(m + pre_shift)));
-        v_expand(v_load(src + pre_shift * cn), v_src00, v_src01);
-        v_uint16x8 v_res0 = v_src00 * v_mul;
-        v_uint16x8 v_res1 = v_src01 * v_mul;
+        v_uint16 v_res0 = vx_load_expand(src + pre_shift * cn) * vx_setall_u16(*((uint16_t*)(m + pre_shift)));
        for (int j = 0; j < pre_shift; j ++)
-        {
-            v_mul = v_setall_u16(*((uint16_t*)(m + j)));
-            v_expand(v_load(src + j * cn), v_src00, v_src01);
-            v_expand(v_load(src + (n - 1 - j)*cn), v_srcN00, v_srcN01);
-            v_res0 += (v_src00 + v_srcN00) * v_mul;
-            v_res1 += (v_src01 + v_srcN01) * v_mul;
-        }
-
+            v_res0 += (vx_load_expand(src + j * cn) + vx_load_expand(src + (n - 1 - j)*cn)) * vx_setall_u16(*((uint16_t*)(m + j)));
        v_store((uint16_t*)dst, v_res0);
-        v_store((uint16_t*)dst + 8, v_res1);
    }
+#endif
    for (; i < lencn; i++, src++, dst++)
    {
        *dst = m[pre_shift] * src[pre_shift*cn];
@ -3025,28 +2964,13 @@ template <>
 void vlineSmooth1N<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const ufixedpoint16* m, int, uint8_t* dst, int len)
 {
    const ufixedpoint16* src0 = src[0];
-    v_uint16x8 v_mul = v_setall_u16(*((uint16_t*)m));
-#if CV_SSE2
-    v_uint16x8 v_1 = v_setall_u16(1);
-    v_mul += v_mul;
-#endif
    int i = 0;
-    for (; i <= len - 16; i += 16)
-    {
-        v_uint16x8 v_src0 = v_load((uint16_t*)src0 + i);
-        v_uint16x8 v_src1 = v_load((uint16_t*)src0 + i + 8);
-        v_uint8x16 v_res;
-#if CV_SSE2
-        v_res.val = _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(v_1.val, _mm_mulhi_epu16(v_src0.val, v_mul.val)),1),
-                                     _mm_srli_epi16(_mm_add_epi16(v_1.val, _mm_mulhi_epu16(v_src1.val, v_mul.val)),1));
-#else
-        v_uint32x4 v_res0, v_res1, v_res2, v_res3;
-        v_mul_expand(v_src0, v_mul, v_res0, v_res1);
-        v_mul_expand(v_src1, v_mul, v_res2, v_res3);
-        v_res = v_pack(v_rshr_pack<16>(v_res0, v_res1), v_rshr_pack<16>(v_res2, v_res3));
+#if CV_SIMD
+    const int VECSZ = v_uint16::nlanes;
+    v_uint16 v_mul = vx_setall_u16(*((uint16_t*)m)<<1);
+    for (; i <= len - VECSZ; i += VECSZ)
+        v_rshr_pack_store<1>(dst + i, v_mul_hi(vx_load((uint16_t*)src0 + i), v_mul));
 #endif
-        v_store(dst + i, v_res);
-    }
    for (; i < len; i++)
        dst[i] = m[0] * src0[i];
 }
@ -3062,8 +2986,11 @@ void vlineSmooth1N1<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, co
 {
    const ufixedpoint16* src0 = src[0];
    int i = 0;
-    for (; i <= len - 8; i += 8)
-        v_rshr_pack_store<8>(dst + i, v_load((uint16_t*)(src0 + i)));
+#if CV_SIMD
+    const int VECSZ = v_uint16::nlanes;
+    for (; i <= len - VECSZ; i += VECSZ)
+        v_rshr_pack_store<8>(dst + i, vx_load((uint16_t*)(src0 + i)));
+#endif
    for (; i < len; i++)
        dst[i] = src0[i];
 }
@ -3077,46 +3004,51 @@ template <>
 void vlineSmooth3N<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const ufixedpoint16* m, int, uint8_t* dst, int len)
 {
    int i = 0;
-    static const v_int16x8 v_128 = v_reinterpret_as_s16(v_setall_u16((uint16_t)1 << 15));
-    v_int32x4 v_128_4 = v_setall_s32(128 << 16);
-    if (len > 7)
+#if CV_SIMD
+    static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15));
+    v_int32 v_128_4 = vx_setall_s32(128 << 16);
+    const int VECSZ = v_uint16::nlanes;
+    if (len >= VECSZ)
    {
        ufixedpoint32 val[] = { (m[0] + m[1] + m[2]) * ufixedpoint16((uint8_t)128) };
-        v_128_4 = v_setall_s32(*((int32_t*)val));
+        v_128_4 = vx_setall_s32(*((int32_t*)val));
    }
-    v_int16x8 v_mul01 = v_reinterpret_as_s16(v_setall_u32(*((uint32_t*)m)));
-    v_int16x8 v_mul2 = v_reinterpret_as_s16(v_setall_u16(*((uint16_t*)(m + 2))));
-    for (; i <= len - 32; i += 32)
+    v_int16 v_mul01 = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)m)));
+    v_int16 v_mul2 = v_reinterpret_as_s16(vx_setall_u16(*((uint16_t*)(m + 2))));
+    for (; i <= len - 4*VECSZ; i += 4*VECSZ)
    {
-        v_int16x8 v_src00, v_src10, v_src01, v_src11, v_src02, v_src12, v_src03, v_src13;
-        v_int16x8 v_tmp0, v_tmp1;
+        v_int16 v_src00, v_src10, v_src01, v_src11, v_src02, v_src12, v_src03, v_src13;
+        v_int16 v_tmp0, v_tmp1;

-        v_src00 = v_load((int16_t*)(src[0]) + i);
-        v_src01 = v_load((int16_t*)(src[0]) + i + 8);
-        v_src02 = v_load((int16_t*)(src[0]) + i + 16);
-        v_src03 = v_load((int16_t*)(src[0]) + i + 24);
-        v_src10 = v_load((int16_t*)(src[1]) + i);
-        v_src11 = v_load((int16_t*)(src[1]) + i + 8);
-        v_src12 = v_load((int16_t*)(src[1]) + i + 16);
-        v_src13 = v_load((int16_t*)(src[1]) + i + 24);
+        const int16_t* src0 = (const int16_t*)src[0] + i;
+        const int16_t* src1 = (const int16_t*)src[1] + i;
+        v_src00 = vx_load(src0);
+        v_src01 = vx_load(src0 + VECSZ);
+        v_src02 = vx_load(src0 + 2*VECSZ);
+        v_src03 = vx_load(src0 + 3*VECSZ);
+        v_src10 = vx_load(src1);
+        v_src11 = vx_load(src1 + VECSZ);
+        v_src12 = vx_load(src1 + 2*VECSZ);
+        v_src13 = vx_load(src1 + 3*VECSZ);
        v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1);
-        v_int32x4 v_res0 = v_dotprod(v_tmp0, v_mul01);
-        v_int32x4 v_res1 = v_dotprod(v_tmp1, v_mul01);
+        v_int32 v_res0 = v_dotprod(v_tmp0, v_mul01);
+        v_int32 v_res1 = v_dotprod(v_tmp1, v_mul01);
        v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1);
-        v_int32x4 v_res2 = v_dotprod(v_tmp0, v_mul01);
-        v_int32x4 v_res3 = v_dotprod(v_tmp1, v_mul01);
+        v_int32 v_res2 = v_dotprod(v_tmp0, v_mul01);
+        v_int32 v_res3 = v_dotprod(v_tmp1, v_mul01);
        v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1);
-        v_int32x4 v_res4 = v_dotprod(v_tmp0, v_mul01);
-        v_int32x4 v_res5 = v_dotprod(v_tmp1, v_mul01);
+        v_int32 v_res4 = v_dotprod(v_tmp0, v_mul01);
+        v_int32 v_res5 = v_dotprod(v_tmp1, v_mul01);
        v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1);
-        v_int32x4 v_res6 = v_dotprod(v_tmp0, v_mul01);
-        v_int32x4 v_res7 = v_dotprod(v_tmp1, v_mul01);
+        v_int32 v_res6 = v_dotprod(v_tmp0, v_mul01);
+        v_int32 v_res7 = v_dotprod(v_tmp1, v_mul01);

-        v_int32x4 v_resj0, v_resj1;
-        v_src00 = v_load((int16_t*)(src[2]) + i);
-        v_src01 = v_load((int16_t*)(src[2]) + i + 8);
-        v_src02 = v_load((int16_t*)(src[2]) + i + 16);
-        v_src03 = v_load((int16_t*)(src[2]) + i + 24);
+        v_int32 v_resj0, v_resj1;
+        const int16_t* src2 = (const int16_t*)src[2] + i;
+        v_src00 = vx_load(src2);
+        v_src01 = vx_load(src2 + VECSZ);
+        v_src02 = vx_load(src2 + 2*VECSZ);
+        v_src03 = vx_load(src2 + 3*VECSZ);
        v_mul_expand(v_add_wrap(v_src00, v_128), v_mul2, v_resj0, v_resj1);
        v_res0 += v_resj0;
        v_res1 += v_resj1;
@ -3139,11 +3071,12 @@ void vlineSmooth3N<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, con
        v_res6 += v_128_4;
        v_res7 += v_128_4;

-        v_store(dst + i     , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)),
-                                     v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3))));
-        v_store(dst + i + 16, v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res4, v_res5)),
-                                     v_reinterpret_as_u16(v_rshr_pack<16>(v_res6, v_res7))));
+        v_store(dst + i          , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)),
+                                          v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3))));
+        v_store(dst + i + 2*VECSZ, v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res4, v_res5)),
+                                          v_reinterpret_as_u16(v_rshr_pack<16>(v_res6, v_res7))));
    }
+#endif
    for (; i < len; i++)
        dst[i] = m[0] * src[0][i] + m[1] * src[1][i] + m[2] * src[2][i];
 }
@ -3157,18 +3090,21 @@ template <>
 void vlineSmooth3N121<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const ufixedpoint16*, int, uint8_t* dst, int len)
 {
    int i = 0;
-    for (; i <= len - 16; i += 16)
+#if CV_SIMD
+    const int VECSZ = v_uint16::nlanes;
+    for (; i <= len - 2*VECSZ; i += 2*VECSZ)
    {
-        v_uint32x4 v_src00, v_src01, v_src02, v_src03, v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23;
-        v_expand(v_load((uint16_t*)(src[0]) + i), v_src00, v_src01);
-        v_expand(v_load((uint16_t*)(src[0]) + i + 8), v_src02, v_src03);
-        v_expand(v_load((uint16_t*)(src[1]) + i), v_src10, v_src11);
-        v_expand(v_load((uint16_t*)(src[1]) + i + 8), v_src12, v_src13);
-        v_expand(v_load((uint16_t*)(src[2]) + i), v_src20, v_src21);
-        v_expand(v_load((uint16_t*)(src[2]) + i + 8), v_src22, v_src23);
+        v_uint32 v_src00, v_src01, v_src02, v_src03, v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23;
+        v_expand(vx_load((uint16_t*)(src[0]) + i), v_src00, v_src01);
+        v_expand(vx_load((uint16_t*)(src[0]) + i + VECSZ), v_src02, v_src03);
+        v_expand(vx_load((uint16_t*)(src[1]) + i), v_src10, v_src11);
+        v_expand(vx_load((uint16_t*)(src[1]) + i + VECSZ), v_src12, v_src13);
+        v_expand(vx_load((uint16_t*)(src[2]) + i), v_src20, v_src21);
+        v_expand(vx_load((uint16_t*)(src[2]) + i + VECSZ), v_src22, v_src23);
        v_store(dst + i, v_pack(v_rshr_pack<10>(v_src00 + v_src20 + (v_src10 + v_src10), v_src01 + v_src21 + (v_src11 + v_src11)),
                                v_rshr_pack<10>(v_src02 + v_src22 + (v_src12 + v_src12), v_src03 + v_src23 + (v_src13 + v_src13))));
    }
+#endif
    for (; i < len; i++)
        dst[i] = (((uint32_t)(((uint16_t*)(src[0]))[i]) + (uint32_t)(((uint16_t*)(src[2]))[i]) + ((uint32_t)(((uint16_t*)(src[1]))[i]) << 1)) + (1 << 9)) >> 10;
 }
@ -3182,95 +3118,102 @@ template <>
 void vlineSmooth5N<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const ufixedpoint16* m, int, uint8_t* dst, int len)
 {
    int i = 0;
-    static const v_int16x8 v_128 = v_reinterpret_as_s16(v_setall_u16((uint16_t)1 << 15));
-    v_int32x4 v_128_4 = v_setall_s32(128 << 16);
-    if (len > 7)
+#if CV_SIMD
+    const int VECSZ = v_uint16::nlanes;
+    if (len >= 4 * VECSZ)
    {
        ufixedpoint32 val[] = { (m[0] + m[1] + m[2] + m[3] + m[4]) * ufixedpoint16((uint8_t)128) };
-        v_128_4 = v_setall_s32(*((int32_t*)val));
-    }
-    v_int16x8 v_mul01 = v_reinterpret_as_s16(v_setall_u32(*((uint32_t*)m)));
-    v_int16x8 v_mul23 = v_reinterpret_as_s16(v_setall_u32(*((uint32_t*)(m + 2))));
-    v_int16x8 v_mul4 = v_reinterpret_as_s16(v_setall_u16(*((uint16_t*)(m + 4))));
-    for (; i <= len - 32; i += 32)
-    {
-        v_int16x8 v_src00, v_src10, v_src01, v_src11, v_src02, v_src12, v_src03, v_src13;
-        v_int16x8 v_tmp0, v_tmp1;
-
-        v_src00 = v_load((int16_t*)(src[0]) + i);
-        v_src01 = v_load((int16_t*)(src[0]) + i + 8);
-        v_src02 = v_load((int16_t*)(src[0]) + i + 16);
-        v_src03 = v_load((int16_t*)(src[0]) + i + 24);
-        v_src10 = v_load((int16_t*)(src[1]) + i);
-        v_src11 = v_load((int16_t*)(src[1]) + i + 8);
-        v_src12 = v_load((int16_t*)(src[1]) + i + 16);
-        v_src13 = v_load((int16_t*)(src[1]) + i + 24);
-        v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1);
-        v_int32x4 v_res0 = v_dotprod(v_tmp0, v_mul01);
-        v_int32x4 v_res1 = v_dotprod(v_tmp1, v_mul01);
-        v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1);
-        v_int32x4 v_res2 = v_dotprod(v_tmp0, v_mul01);
-        v_int32x4 v_res3 = v_dotprod(v_tmp1, v_mul01);
-        v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1);
-        v_int32x4 v_res4 = v_dotprod(v_tmp0, v_mul01);
-        v_int32x4 v_res5 = v_dotprod(v_tmp1, v_mul01);
-        v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1);
-        v_int32x4 v_res6 = v_dotprod(v_tmp0, v_mul01);
-        v_int32x4 v_res7 = v_dotprod(v_tmp1, v_mul01);
-
-        v_src00 = v_load((int16_t*)(src[2]) + i);
-        v_src01 = v_load((int16_t*)(src[2]) + i + 8);
-        v_src02 = v_load((int16_t*)(src[2]) + i + 16);
-        v_src03 = v_load((int16_t*)(src[2]) + i + 24);
-        v_src10 = v_load((int16_t*)(src[3]) + i);
-        v_src11 = v_load((int16_t*)(src[3]) + i + 8);
-        v_src12 = v_load((int16_t*)(src[3]) + i + 16);
-        v_src13 = v_load((int16_t*)(src[3]) + i + 24);
-        v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1);
-        v_res0 += v_dotprod(v_tmp0, v_mul23);
-        v_res1 += v_dotprod(v_tmp1, v_mul23);
-        v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1);
-        v_res2 += v_dotprod(v_tmp0, v_mul23);
-        v_res3 += v_dotprod(v_tmp1, v_mul23);
-        v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1);
-        v_res4 += v_dotprod(v_tmp0, v_mul23);
-        v_res5 += v_dotprod(v_tmp1, v_mul23);
-        v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1);
-        v_res6 += v_dotprod(v_tmp0, v_mul23);
-        v_res7 += v_dotprod(v_tmp1, v_mul23);
-
-        v_int32x4 v_resj0, v_resj1;
-        v_src00 = v_load((int16_t*)(src[4]) + i);
-        v_src01 = v_load((int16_t*)(src[4]) + i + 8);
-        v_src02 = v_load((int16_t*)(src[4]) + i + 16);
-        v_src03 = v_load((int16_t*)(src[4]) + i + 24);
-        v_mul_expand(v_add_wrap(v_src00, v_128), v_mul4, v_resj0, v_resj1);
-        v_res0 += v_resj0;
-        v_res1 += v_resj1;
-        v_mul_expand(v_add_wrap(v_src01, v_128), v_mul4, v_resj0, v_resj1);
-        v_res2 += v_resj0;
-        v_res3 += v_resj1;
-        v_mul_expand(v_add_wrap(v_src02, v_128), v_mul4, v_resj0, v_resj1);
-        v_res4 += v_resj0;
-        v_res5 += v_resj1;
-        v_mul_expand(v_add_wrap(v_src03, v_128), v_mul4, v_resj0, v_resj1);
-        v_res6 += v_resj0;
-        v_res7 += v_resj1;
-
-        v_res0 += v_128_4;
-        v_res1 += v_128_4;
-        v_res2 += v_128_4;
-        v_res3 += v_128_4;
-        v_res4 += v_128_4;
-        v_res5 += v_128_4;
-        v_res6 += v_128_4;
-        v_res7 += v_128_4;
-
-        v_store(dst + i     , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)),
-                                     v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3))));
-        v_store(dst + i + 16, v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res4, v_res5)),
-                                     v_reinterpret_as_u16(v_rshr_pack<16>(v_res6, v_res7))));
+        v_int32 v_128_4 = vx_setall_s32(*((int32_t*)val));
+        static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15));
+        v_int16 v_mul01 = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)m)));
+        v_int16 v_mul23 = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)(m + 2))));
+        v_int16 v_mul4 = v_reinterpret_as_s16(vx_setall_u16(*((uint16_t*)(m + 4))));
+        for (; i <= len - 4*VECSZ; i += 4*VECSZ)
+        {
+            v_int16 v_src00, v_src10, v_src01, v_src11, v_src02, v_src12, v_src03, v_src13;
+            v_int16 v_tmp0, v_tmp1;
+
+            const int16_t* src0 = (const int16_t*)src[0] + i;
+            const int16_t* src1 = (const int16_t*)src[1] + i;
+            v_src00 = vx_load(src0);
+            v_src01 = vx_load(src0 + VECSZ);
+            v_src02 = vx_load(src0 + 2*VECSZ);
+            v_src03 = vx_load(src0 + 3*VECSZ);
+            v_src10 = vx_load(src1);
+            v_src11 = vx_load(src1 + VECSZ);
+            v_src12 = vx_load(src1 + 2*VECSZ);
+            v_src13 = vx_load(src1 + 3*VECSZ);
+            v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1);
+            v_int32 v_res0 = v_dotprod(v_tmp0, v_mul01);
+            v_int32 v_res1 = v_dotprod(v_tmp1, v_mul01);
+            v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1);
+            v_int32 v_res2 = v_dotprod(v_tmp0, v_mul01);
+            v_int32 v_res3 = v_dotprod(v_tmp1, v_mul01);
+            v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1);
+            v_int32 v_res4 = v_dotprod(v_tmp0, v_mul01);
+            v_int32 v_res5 = v_dotprod(v_tmp1, v_mul01);
+            v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1);
+            v_int32 v_res6 = v_dotprod(v_tmp0, v_mul01);
+            v_int32 v_res7 = v_dotprod(v_tmp1, v_mul01);
+
+            const int16_t* src2 = (const int16_t*)src[2] + i;
+            const int16_t* src3 = (const int16_t*)src[3] + i;
+            v_src00 = vx_load(src2);
+            v_src01 = vx_load(src2 + VECSZ);
+            v_src02 = vx_load(src2 + 2*VECSZ);
+            v_src03 = vx_load(src2 + 3*VECSZ);
+            v_src10 = vx_load(src3);
+            v_src11 = vx_load(src3 + VECSZ);
+            v_src12 = vx_load(src3 + 2*VECSZ);
+            v_src13 = vx_load(src3 + 3*VECSZ);
+            v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1);
+            v_res0 += v_dotprod(v_tmp0, v_mul23);
+            v_res1 += v_dotprod(v_tmp1, v_mul23);
+            v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1);
+            v_res2 += v_dotprod(v_tmp0, v_mul23);
+            v_res3 += v_dotprod(v_tmp1, v_mul23);
+            v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1);
+            v_res4 += v_dotprod(v_tmp0, v_mul23);
+            v_res5 += v_dotprod(v_tmp1, v_mul23);
+            v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1);
+            v_res6 += v_dotprod(v_tmp0, v_mul23);
+            v_res7 += v_dotprod(v_tmp1, v_mul23);
+
+            v_int32 v_resj0, v_resj1;
+            const int16_t* src4 = (const int16_t*)src[4] + i;
+            v_src00 = vx_load(src4);
+            v_src01 = vx_load(src4 + VECSZ);
+            v_src02 = vx_load(src4 + 2*VECSZ);
+            v_src03 = vx_load(src4 + 3*VECSZ);
+            v_mul_expand(v_add_wrap(v_src00, v_128), v_mul4, v_resj0, v_resj1);
+            v_res0 += v_resj0;
+            v_res1 += v_resj1;
+            v_mul_expand(v_add_wrap(v_src01, v_128), v_mul4, v_resj0, v_resj1);
+            v_res2 += v_resj0;
+            v_res3 += v_resj1;
+            v_mul_expand(v_add_wrap(v_src02, v_128), v_mul4, v_resj0, v_resj1);
+            v_res4 += v_resj0;
+            v_res5 += v_resj1;
+            v_mul_expand(v_add_wrap(v_src03, v_128), v_mul4, v_resj0, v_resj1);
+            v_res6 += v_resj0;
+            v_res7 += v_resj1;
+
+            v_res0 += v_128_4;
+            v_res1 += v_128_4;
+            v_res2 += v_128_4;
+            v_res3 += v_128_4;
+            v_res4 += v_128_4;
+            v_res5 += v_128_4;
+            v_res6 += v_128_4;
+            v_res7 += v_128_4;
+
+            v_store(dst + i          , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)),
+                                              v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3))));
+            v_store(dst + i + 2*VECSZ, v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res4, v_res5)),
+                                              v_reinterpret_as_u16(v_rshr_pack<16>(v_res6, v_res7))));
+        }
    }
+#endif
    for (; i < len; i++)
        dst[i] = m[0] * src[0][i] + m[1] * src[1][i] + m[2] * src[2][i] + m[3] * src[3][i] + m[4] * src[4][i];
 }
@ -3284,28 +3227,31 @@ template <>
 void vlineSmooth5N14641<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const ufixedpoint16*, int, uint8_t* dst, int len)
 {
    int i = 0;
-    v_uint32x4 v_6 = v_setall_u32(6);
-    for (; i <= len - 16; i += 16)
+#if CV_SIMD
+    v_uint32 v_6 = vx_setall_u32(6);
+    const int VECSZ = v_uint16::nlanes;
+    for (; i <= len - 2*VECSZ; i += 2*VECSZ)
    {
-        v_uint32x4 v_src00, v_src10, v_src20, v_src30, v_src40;
-        v_uint32x4 v_src01, v_src11, v_src21, v_src31, v_src41;
-        v_uint32x4 v_src02, v_src12, v_src22, v_src32, v_src42;
-        v_uint32x4 v_src03, v_src13, v_src23, v_src33, v_src43;
-        v_expand(v_load((uint16_t*)(src[0]) + i), v_src00, v_src01);
-        v_expand(v_load((uint16_t*)(src[0]) + i + 8), v_src02, v_src03);
-        v_expand(v_load((uint16_t*)(src[1]) + i), v_src10, v_src11);
-        v_expand(v_load((uint16_t*)(src[1]) + i + 8), v_src12, v_src13);
-        v_expand(v_load((uint16_t*)(src[2]) + i), v_src20, v_src21);
-        v_expand(v_load((uint16_t*)(src[2]) + i + 8), v_src22, v_src23);
-        v_expand(v_load((uint16_t*)(src[3]) + i), v_src30, v_src31);
-        v_expand(v_load((uint16_t*)(src[3]) + i + 8), v_src32, v_src33);
-        v_expand(v_load((uint16_t*)(src[4]) + i), v_src40, v_src41);
-        v_expand(v_load((uint16_t*)(src[4]) + i + 8), v_src42, v_src43);
+        v_uint32 v_src00, v_src10, v_src20, v_src30, v_src40;
+        v_uint32 v_src01, v_src11, v_src21, v_src31, v_src41;
+        v_uint32 v_src02, v_src12, v_src22, v_src32, v_src42;
+        v_uint32 v_src03, v_src13, v_src23, v_src33, v_src43;
+        v_expand(vx_load((uint16_t*)(src[0]) + i), v_src00, v_src01);
+        v_expand(vx_load((uint16_t*)(src[0]) + i + VECSZ), v_src02, v_src03);
+        v_expand(vx_load((uint16_t*)(src[1]) + i), v_src10, v_src11);
+        v_expand(vx_load((uint16_t*)(src[1]) + i + VECSZ), v_src12, v_src13);
+        v_expand(vx_load((uint16_t*)(src[2]) + i), v_src20, v_src21);
+        v_expand(vx_load((uint16_t*)(src[2]) + i + VECSZ), v_src22, v_src23);
+        v_expand(vx_load((uint16_t*)(src[3]) + i), v_src30, v_src31);
+        v_expand(vx_load((uint16_t*)(src[3]) + i + VECSZ), v_src32, v_src33);
+        v_expand(vx_load((uint16_t*)(src[4]) + i), v_src40, v_src41);
+        v_expand(vx_load((uint16_t*)(src[4]) + i + VECSZ), v_src42, v_src43);
        v_store(dst + i, v_pack(v_rshr_pack<12>(v_src20*v_6 + ((v_src10 + v_src30) << 2) + v_src00 + v_src40,
                                                v_src21*v_6 + ((v_src11 + v_src31) << 2) + v_src01 + v_src41),
                                v_rshr_pack<12>(v_src22*v_6 + ((v_src12 + v_src32) << 2) + v_src02 + v_src42,
                                                v_src23*v_6 + ((v_src13 + v_src33) << 2) + v_src03 + v_src43)));
    }
+#endif
    for (; i < len; i++)
        dst[i] = ((uint32_t)(((uint16_t*)(src[2]))[i]) * 6 +
                  (((uint32_t)(((uint16_t*)(src[1]))[i]) + (uint32_t)(((uint16_t*)(src[3]))[i])) << 2) +
@ -3326,57 +3272,63 @@ template <>
 void vlineSmooth<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const ufixedpoint16* m, int n, uint8_t* dst, int len)
 {
    int i = 0;
-    static const v_int16x8 v_128 = v_reinterpret_as_s16(v_setall_u16((uint16_t)1 << 15));
-    v_int32x4 v_128_4 = v_setall_s32(128 << 16);
-    if (len > 7)
+#if CV_SIMD
+    static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15));
+    v_int32 v_128_4 = vx_setall_s32(128 << 16);
+    const int VECSZ = v_uint16::nlanes;
+    if (len >= VECSZ)
    {
        ufixedpoint16 msum = m[0] + m[1];
        for (int j = 2; j < n; j++)
            msum = msum + m[j];
        ufixedpoint32 val[] = { msum * ufixedpoint16((uint8_t)128) };
-        v_128_4 = v_setall_s32(*((int32_t*)val));
+        v_128_4 = vx_setall_s32(*((int32_t*)val));
    }
-    for (; i <= len - 32; i += 32)
+    for (; i <= len - 4*VECSZ; i += 4*VECSZ)
    {
-        v_int16x8 v_src00, v_src10, v_src01, v_src11, v_src02, v_src12, v_src03, v_src13;
-        v_int16x8 v_tmp0, v_tmp1;
+        v_int16 v_src00, v_src10, v_src01, v_src11, v_src02, v_src12, v_src03, v_src13;
+        v_int16 v_tmp0, v_tmp1;

-        v_int16x8 v_mul = v_reinterpret_as_s16(v_setall_u32(*((uint32_t*)m)));
+        v_int16 v_mul = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)m)));

-        v_src00 = v_load((int16_t*)(src[0]) + i);
-        v_src01 = v_load((int16_t*)(src[0]) + i + 8);
-        v_src02 = v_load((int16_t*)(src[0]) + i + 16);
-        v_src03 = v_load((int16_t*)(src[0]) + i + 24);
-        v_src10 = v_load((int16_t*)(src[1]) + i);
-        v_src11 = v_load((int16_t*)(src[1]) + i + 8);
-        v_src12 = v_load((int16_t*)(src[1]) + i + 16);
-        v_src13 = v_load((int16_t*)(src[1]) + i + 24);
+        const int16_t* src0 = (const int16_t*)src[0] + i;
+        const int16_t* src1 = (const int16_t*)src[1] + i;
+        v_src00 = vx_load(src0);
+        v_src01 = vx_load(src0 + VECSZ);
+        v_src02 = vx_load(src0 + 2*VECSZ);
+        v_src03 = vx_load(src0 + 3*VECSZ);
+        v_src10 = vx_load(src1);
+        v_src11 = vx_load(src1 + VECSZ);
+        v_src12 = vx_load(src1 + 2*VECSZ);
+        v_src13 = vx_load(src1 + 3*VECSZ);
        v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1);
-        v_int32x4 v_res0 = v_dotprod(v_tmp0, v_mul);
-        v_int32x4 v_res1 = v_dotprod(v_tmp1, v_mul);
+        v_int32 v_res0 = v_dotprod(v_tmp0, v_mul);
+        v_int32 v_res1 = v_dotprod(v_tmp1, v_mul);
        v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1);
-        v_int32x4 v_res2 = v_dotprod(v_tmp0, v_mul);
-        v_int32x4 v_res3 = v_dotprod(v_tmp1, v_mul);
+        v_int32 v_res2 = v_dotprod(v_tmp0, v_mul);
+        v_int32 v_res3 = v_dotprod(v_tmp1, v_mul);
        v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1);
-        v_int32x4 v_res4 = v_dotprod(v_tmp0, v_mul);
-        v_int32x4 v_res5 = v_dotprod(v_tmp1, v_mul);
+        v_int32 v_res4 = v_dotprod(v_tmp0, v_mul);
+        v_int32 v_res5 = v_dotprod(v_tmp1, v_mul);
        v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1);
-        v_int32x4 v_res6 = v_dotprod(v_tmp0, v_mul);
-        v_int32x4 v_res7 = v_dotprod(v_tmp1, v_mul);
+        v_int32 v_res6 = v_dotprod(v_tmp0, v_mul);
+        v_int32 v_res7 = v_dotprod(v_tmp1, v_mul);

        int j = 2;
        for (; j < n - 1; j+=2)
        {
-            v_mul = v_reinterpret_as_s16(v_setall_u32(*((uint32_t*)(m+j))));
+            v_mul = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)(m+j))));

-            v_src00 = v_load((int16_t*)(src[j]) + i);
-            v_src01 = v_load((int16_t*)(src[j]) + i + 8);
-            v_src02 = v_load((int16_t*)(src[j]) + i + 16);
-            v_src03 = v_load((int16_t*)(src[j]) + i + 24);
-            v_src10 = v_load((int16_t*)(src[j+1]) + i);
-            v_src11 = v_load((int16_t*)(src[j+1]) + i + 8);
-            v_src12 = v_load((int16_t*)(src[j+1]) + i + 16);
-            v_src13 = v_load((int16_t*)(src[j+1]) + i + 24);
+            const int16_t* srcj0 = (const int16_t*)src[j] + i;
+            const int16_t* srcj1 = (const int16_t*)src[j + 1] + i;
+            v_src00 = vx_load(srcj0);
+            v_src01 = vx_load(srcj0 + VECSZ);
+            v_src02 = vx_load(srcj0 + 2*VECSZ);
+            v_src03 = vx_load(srcj0 + 3*VECSZ);
+            v_src10 = vx_load(srcj1);
+            v_src11 = vx_load(srcj1 + VECSZ);
+            v_src12 = vx_load(srcj1 + 2*VECSZ);
+            v_src13 = vx_load(srcj1 + 3*VECSZ);
            v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1);
            v_res0 += v_dotprod(v_tmp0, v_mul);
            v_res1 += v_dotprod(v_tmp1, v_mul);
@ -3392,12 +3344,13 @@ void vlineSmooth<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const
        }
        if(j < n)
        {
-            v_int32x4 v_resj0, v_resj1;
-            v_mul = v_reinterpret_as_s16(v_setall_u16(*((uint16_t*)(m + j))));
-            v_src00 = v_load((int16_t*)(src[j]) + i);
-            v_src01 = v_load((int16_t*)(src[j]) + i + 8);
-            v_src02 = v_load((int16_t*)(src[j]) + i + 16);
-            v_src03 = v_load((int16_t*)(src[j]) + i + 24);
+            v_int32 v_resj0, v_resj1;
+            v_mul = v_reinterpret_as_s16(vx_setall_u16(*((uint16_t*)(m + j))));
+            const int16_t* srcj = (const int16_t*)src[j] + i;
+            v_src00 = vx_load(srcj);
+            v_src01 = vx_load(srcj + VECSZ);
+            v_src02 = vx_load(srcj + 2*VECSZ);
+            v_src03 = vx_load(srcj + 3*VECSZ);
            v_mul_expand(v_add_wrap(v_src00, v_128), v_mul, v_resj0, v_resj1);
            v_res0 += v_resj0;
            v_res1 += v_resj1;
@ -3420,11 +3373,12 @@ void vlineSmooth<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const
        v_res6 += v_128_4;
        v_res7 += v_128_4;

-        v_store(dst + i     , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)),
-                                     v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3))));
-        v_store(dst + i + 16, v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res4, v_res5)),
-                                     v_reinterpret_as_u16(v_rshr_pack<16>(v_res6, v_res7))));
+        v_store(dst + i          , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)),
+                                          v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3))));
+        v_store(dst + i + 2*VECSZ, v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res4, v_res5)),
+                                          v_reinterpret_as_u16(v_rshr_pack<16>(v_res6, v_res7))));
    }
+#endif
    for (; i < len; i++)
    {
        ufixedpoint32 val = m[0] * src[0][i];
@ -3450,29 +3404,32 @@ void vlineSmoothONa_yzy_a(const FT* const * src, const FT* m, int n, ET* dst, in
 template <>
 void vlineSmoothONa_yzy_a<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const ufixedpoint16* m, int n, uint8_t* dst, int len)
 {
-    int pre_shift = n / 2;
    int i = 0;
-    static const v_int16x8 v_128 = v_reinterpret_as_s16(v_setall_u16((uint16_t)1 << 15));
-    v_int32x4 v_128_4 = v_setall_s32(128 << 16);
-    if (len > 7)
+#if CV_SIMD
+    int pre_shift = n / 2;
+    static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15));
+    v_int32 v_128_4 = vx_setall_s32(128 << 16);
+    const int VECSZ = v_uint16::nlanes;
+    if (len >= VECSZ)
    {
        ufixedpoint16 msum = m[0] + m[pre_shift] + m[n - 1];
        for (int j = 1; j < pre_shift; j++)
            msum = msum + m[j] + m[n - 1 - j];
        ufixedpoint32 val[] = { msum * ufixedpoint16((uint8_t)128) };
-        v_128_4 = v_setall_s32(*((int32_t*)val));
+        v_128_4 = vx_setall_s32(*((int32_t*)val));
    }
-    for (; i <= len - 32; i += 32)
+    for (; i <= len - 4*VECSZ; i += 4*VECSZ)
    {
-        v_int16x8 v_src00, v_src10, v_src20, v_src30, v_src01, v_src11, v_src21, v_src31;
-        v_int32x4 v_res0, v_res1, v_res2, v_res3, v_res4, v_res5, v_res6, v_res7;
-        v_int16x8 v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4, v_tmp5, v_tmp6, v_tmp7;
+        v_int16 v_src00, v_src10, v_src20, v_src30, v_src01, v_src11, v_src21, v_src31;
+        v_int32 v_res0, v_res1, v_res2, v_res3, v_res4, v_res5, v_res6, v_res7;
+        v_int16 v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4, v_tmp5, v_tmp6, v_tmp7;

-        v_int16x8 v_mul = v_reinterpret_as_s16(v_setall_u16(*((uint16_t*)(m + pre_shift))));
-        v_src00 = v_load((int16_t*)(src[pre_shift]) + i);
-        v_src10 = v_load((int16_t*)(src[pre_shift]) + i + 8);
-        v_src20 = v_load((int16_t*)(src[pre_shift]) + i + 16);
-        v_src30 = v_load((int16_t*)(src[pre_shift]) + i + 24);
+        v_int16 v_mul = v_reinterpret_as_s16(vx_setall_u16(*((uint16_t*)(m + pre_shift))));
+        const int16_t* srcp = (const int16_t*)src[pre_shift] + i;
+        v_src00 = vx_load(srcp);
+        v_src10 = vx_load(srcp + VECSZ);
+        v_src20 = vx_load(srcp + 2*VECSZ);
+        v_src30 = vx_load(srcp + 3*VECSZ);
        v_mul_expand(v_add_wrap(v_src00, v_128), v_mul, v_res0, v_res1);
        v_mul_expand(v_add_wrap(v_src10, v_128), v_mul, v_res2, v_res3);
        v_mul_expand(v_add_wrap(v_src20, v_128), v_mul, v_res4, v_res5);
@ -3481,16 +3438,18 @@ void vlineSmoothONa_yzy_a<uint8_t, ufixedpoint16>(const ufixedpoint16* const * s
        int j = 0;
        for (; j < pre_shift; j++)
        {
-            v_mul = v_reinterpret_as_s16(v_setall_u16(*((uint16_t*)(m + j))));
+            v_mul = v_reinterpret_as_s16(vx_setall_u16(*((uint16_t*)(m + j))));

-            v_src00 = v_load((int16_t*)(src[j]) + i);
-            v_src10 = v_load((int16_t*)(src[j]) + i + 8);
-            v_src20 = v_load((int16_t*)(src[j]) + i + 16);
-            v_src30 = v_load((int16_t*)(src[j]) + i + 24);
-            v_src01 = v_load((int16_t*)(src[n - 1 - j]) + i);
-            v_src11 = v_load((int16_t*)(src[n - 1 - j]) + i + 8);
-            v_src21 = v_load((int16_t*)(src[n - 1 - j]) + i + 16);
-            v_src31 = v_load((int16_t*)(src[n - 1 - j]) + i + 24);
+            const int16_t* srcj0 = (const int16_t*)src[j] + i;
+            const int16_t* srcj1 = (const int16_t*)src[n - 1 - j] + i;
+            v_src00 = vx_load(srcj0);
+            v_src10 = vx_load(srcj0 + VECSZ);
+            v_src20 = vx_load(srcj0 + 2*VECSZ);
+            v_src30 = vx_load(srcj0 + 3*VECSZ);
+            v_src01 = vx_load(srcj1);
+            v_src11 = vx_load(srcj1 + VECSZ);
+            v_src21 = vx_load(srcj1 + 2*VECSZ);
+            v_src31 = vx_load(srcj1 + 3*VECSZ);
            v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src01, v_128), v_tmp0, v_tmp1);
            v_res0 += v_dotprod(v_tmp0, v_mul);
            v_res1 += v_dotprod(v_tmp1, v_mul);
@ -3514,11 +3473,12 @@ void vlineSmoothONa_yzy_a<uint8_t, ufixedpoint16>(const ufixedpoint16* const * s
        v_res6 += v_128_4;
        v_res7 += v_128_4;

-        v_store(dst + i     , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)),
-                                     v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3))));
-        v_store(dst + i + 16, v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res4, v_res5)),
-                                     v_reinterpret_as_u16(v_rshr_pack<16>(v_res6, v_res7))));
+        v_store(dst + i          , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)),
+                                          v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3))));
+        v_store(dst + i + 2*VECSZ, v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res4, v_res5)),
+                                          v_reinterpret_as_u16(v_rshr_pack<16>(v_res6, v_res7))));
    }
+#endif
    for (; i < len; i++)
    {
        ufixedpoint32 val = m[0] * src[0][i];
@ -3816,8 +3776,8 @@ static void createGaussianKernels( T & kx, T & ky, int type, Size &ksize,
    if( ksize.height <= 0 && sigma2 > 0 )
        ksize.height = cvRound(sigma2*(depth == CV_8U ? 3 : 4)*2 + 1)|1;

-    CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 &&
-        ksize.height > 0 && ksize.height % 2 == 1 );
+    CV_Assert( ksize.width  > 0 && ksize.width  % 2 == 1 &&
+               ksize.height > 0 && ksize.height % 2 == 1 );

    sigma1 = std::max( sigma1, 0. );
    sigma2 = std::max( sigma2, 0. );
@ -4146,20 +4106,6 @@ void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize,

    int sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);

-    if(sdepth == CV_8U && ((borderType & BORDER_ISOLATED) || !_src.getMat().isSubmatrix()))
-    {
-        std::vector<ufixedpoint16> fkx, fky;
-        createGaussianKernels(fkx, fky, type, ksize, sigma1, sigma2);
-        Mat src = _src.getMat();
-        Mat dst = _dst.getMat();
-        if (src.data == dst.data)
-            src = src.clone();
-        fixedSmoothInvoker<uint8_t, ufixedpoint16> invoker(src.ptr<uint8_t>(), src.step1(), dst.ptr<uint8_t>(), dst.step1(), dst.cols, dst.rows, dst.channels(), &fkx[0], (int)fkx.size(), &fky[0], (int)fky.size(), borderType & ~BORDER_ISOLATED);
-        parallel_for_(Range(0, dst.rows), invoker, std::max(1, std::min(getNumThreads(), getNumberOfCPUs())));
-        return;
-    }
-
-
    Mat kx, ky;
    createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2);

@ -4185,6 +4131,17 @@ void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize,

    CV_IPP_RUN_FAST(ipp_GaussianBlur(src, dst, ksize, sigma1, sigma2, borderType));

+    if(sdepth == CV_8U && ((borderType & BORDER_ISOLATED) || !_src.getMat().isSubmatrix()))
+    {
+        std::vector<ufixedpoint16> fkx, fky;
+        createGaussianKernels(fkx, fky, type, ksize, sigma1, sigma2);
+        if (src.data == dst.data)
+            src = src.clone();
+        fixedSmoothInvoker<uint8_t, ufixedpoint16> invoker(src.ptr<uint8_t>(), src.step1(), dst.ptr<uint8_t>(), dst.step1(), dst.cols, dst.rows, dst.channels(), &fkx[0], (int)fkx.size(), &fky[0], (int)fky.size(), borderType & ~BORDER_ISOLATED);
+        parallel_for_(Range(0, dst.rows), invoker, std::max(1, std::min(getNumThreads(), getNumberOfCPUs())));
+        return;
+    }
+
    sepFilter2D(src, dst, sdepth, kx, ky, Point(-1, -1), 0, borderType);
 }

--- a/modules/imgproc/src/subdivision2d.cpp
+++ b/modules/imgproc/src/subdivision2d.cpp
@ -758,6 +758,7 @@ void Subdiv2D::getTriangleList(std::vector<Vec6f>& triangleList) const
    triangleList.clear();
    int i, total = (int)(qedges.size()*4);
    std::vector<bool> edgemask(total, false);
+    Rect2f rect(topLeft.x, topLeft.y, bottomRight.x, bottomRight.y);

    for( i = 4; i < total; i += 2 )
    {
@ -773,7 +774,8 @@ void Subdiv2D::getTriangleList(std::vector<Vec6f>& triangleList) const
        edge = getEdge(edge, NEXT_AROUND_LEFT);
        edgeOrg(edge, &c);
        edgemask[edge] = true;
-        triangleList.push_back(Vec6f(a.x, a.y, b.x, b.y, c.x, c.y));
+        if( rect.contains(a) && rect.contains(b) && rect.contains(c) )
+            triangleList.push_back(Vec6f(a.x, a.y, b.x, b.y, c.x, c.y));
    }
 }

--- a/modules/imgproc/test/test_distancetransform.cpp
+++ b/modules/imgproc/test/test_distancetransform.cpp
@ -283,4 +283,23 @@ void CV_DisTransTest::prepare_to_validation( int /*test_case_idx*/ )

 TEST(Imgproc_DistanceTransform, accuracy) { CV_DisTransTest test; test.safe_run(); }

+BIGDATA_TEST(Imgproc_DistanceTransform, large_image_12218)
+{
+    const int lls_maxcnt = 79992000;   // labels's maximum count
+    const int lls_mincnt = 1;          // labels's minimum count
+    int i, j, nz;
+    Mat src(8000, 20000, CV_8UC1), dst, labels;
+    for( i = 0; i < src.rows; i++ )
+        for( j = 0; j < src.cols; j++ )
+            src.at<uchar>(i, j) = (j > (src.cols / 2)) ? 0 : 255;
+
+    distanceTransform(src, dst, labels, cv::DIST_L2, cv::DIST_MASK_3, DIST_LABEL_PIXEL);
+
+    double scale = (double)lls_mincnt / (double)lls_maxcnt;
+    labels.convertTo(labels, CV_32SC1, scale);
+    Size size = labels.size();
+    nz = cv::countNonZero(labels);
+    EXPECT_EQ(nz, (size.height*size.width / 2));
+}
+
 }} // namespace
--- a/modules/imgproc/test/test_subdivision2d.cpp
+++ b/modules/imgproc/test/test_subdivision2d.cpp
@ -0,0 +1,53 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//M*/
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+TEST(Imgproc_Subdiv2D_getTriangleList, regression_5788)
+{
+    const float points[65][2] = {
+        { 390,  802}, { 397,  883}, { 414,  963 }, { 439, 1042 }, { 472, 1113},
+        { 521, 1181}, { 591, 1238}, { 678, 1284 }, { 771, 1292 }, { 853, 1281},
+        { 921, 1243}, { 982, 1191}, {1030, 1121 }, {1059, 1038 }, {1072,  945},
+        {1081,  849}, {1082,  749}, { 459,  734 }, { 502,  704 }, { 554,  696},
+        { 609,  698}, { 660,  707}, { 818,  688 }, { 874,  661 }, { 929,  646},
+        { 982,  653}, {1026,  682}, { 740,  771 }, { 748,  834 }, { 756,  897},
+        { 762,  960}, { 700,  998}, { 733, 1006 }, { 766, 1011 }, { 797,  999},
+        { 825,  987}, { 528,  796}, { 566,  766 }, { 617,  763 }, { 659,  794},
+        { 619,  808}, { 569,  812}, { 834,  777 }, { 870,  735 }, { 918,  729},
+        { 958,  750}, { 929,  773}, { 882,  780 }, { 652, 1102 }, { 701, 1079},
+        { 743, 1063}, { 774, 1068}, { 807, 1057 }, { 852, 1065 }, { 896, 1077},
+        { 860, 1117}, { 820, 1135}, { 783, 1141 }, { 751, 1140 }, { 706, 1130},
+        { 675, 1102}, { 743, 1094}, { 774, 1094 }, { 809, 1088 }, { 878, 1082}
+    };
+    std::vector<cv::Point2f> pts;
+    cv::Rect rect(0, 0, 1500, 2000);
+    cv::Subdiv2D subdiv(rect);
+    for( int i = 0; i < 65; i++ )
+    {
+        cv::Point2f pt(points[i][0], points[i][1]);
+        pts.push_back(pt);
+    }
+
+    subdiv.insert(pts);
+
+    std::vector<cv::Vec6f> triangles;
+    subdiv.getTriangleList(triangles);
+
+    int trig_cnt = 0;
+    for( std::vector<cv::Vec6f>::const_iterator it = triangles.begin(); it != triangles.end(); it++, trig_cnt++ )
+    {
+        EXPECT_TRUE( (0 <= triangles.at(trig_cnt).val[0] && triangles.at(trig_cnt).val[0] < 1500) &&
+                     (0 <= triangles.at(trig_cnt).val[1] && triangles.at(trig_cnt).val[1] < 2000) &&
+                     (0 <= triangles.at(trig_cnt).val[2] && triangles.at(trig_cnt).val[2] < 1500) &&
+                     (0 <= triangles.at(trig_cnt).val[3] && triangles.at(trig_cnt).val[3] < 2000) &&
+                     (0 <= triangles.at(trig_cnt).val[4] && triangles.at(trig_cnt).val[4] < 1500) &&
+                     (0 <= triangles.at(trig_cnt).val[5] && triangles.at(trig_cnt).val[5] < 2000) );
+    }
+    EXPECT_EQ(trig_cnt, 105);
+}
+
+}};
--- a/modules/java/generator/src/cpp/opencv_java.hpp
+++ b/modules/java/generator/src/cpp/opencv_java.hpp
@ -29,7 +29,7 @@

 #define ARRAYLIST(ENV) static_cast<jclass>(ENV->NewGlobalRef(ENV->FindClass("java/util/ArrayList")))
 #define LIST_ADD(ENV, LIST) ENV->GetMethodID(LIST, "add", "(Ljava/lang/Object;)Z")
-#define LIST_GET(ENV, LIST) ENV->GetMethodID(LIST, "get", "((I)Ljava/lang/Object;")
+#define LIST_GET(ENV, LIST) ENV->GetMethodID(LIST, "get", "(I)Ljava/lang/Object;")
 #define LIST_SIZE(ENV, LIST) ENV->GetMethodID(LIST, "size", "()I")
 #define LIST_CLEAR(ENV, LIST) ENV->GetMethodID(LIST, "clear", "()V")

--- a/modules/js/CMakeLists.txt
+++ b/modules/js/CMakeLists.txt
@ -56,7 +56,7 @@ add_custom_command(
   DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/embindgen.py
   DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/templates.py
   DEPENDS ${scripts_hdr_parser}
-   DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/headers.txt
+   #(not needed - generated by CMake) DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/headers.txt
   DEPENDS ${opencv_hdrs}
   DEPENDS ${JS_HELPER})

--- a/modules/js/src/core_bindings.cpp
+++ b/modules/js/src/core_bindings.cpp
@ -68,15 +68,10 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //M*/

-#include "opencv2/core.hpp"
-#include "opencv2/imgproc.hpp"
-#include "opencv2/video/tracking.hpp"
-#include "opencv2/video/background_segm.hpp"
-#include "opencv2/objdetect.hpp"
-#include "opencv2/dnn.hpp"
-
 #include <emscripten/bind.h>

+@INCLUDES@
+
 using namespace emscripten;
 using namespace cv;
 using namespace dnn;
--- a/modules/js/src/embindgen.py
+++ b/modules/js/src/embindgen.py
@ -733,12 +733,14 @@ class JSWrapperGenerator(object):

    def gen(self, dst_file, src_files, core_bindings):
        # step 1: scan the headers and extract classes, enums and functions
+        headers = []
        for hdr in src_files:
            decls = self.parser.parse(hdr)
            # print(hdr);
            # self.print_decls(decls);
            if len(decls) == 0:
                continue
+            headers.append(hdr[hdr.rindex('opencv2/'):])
            for decl in decls:
                name = decl[0]
                type = name[:name.find(" ")]
@ -890,6 +892,9 @@ class JSWrapperGenerator(object):
        with open(core_bindings) as f:
            ret = f.read()

+        header_includes = '\n'.join(['#include "{}"'.format(hdr) for hdr in headers])
+        ret = ret.replace('@INCLUDES@', header_includes)
+
        defis = '\n'.join(self.wrapper_funcs)
        ret += wrapper_codes_template.substitute(ns=wrapper_namespace, defs=defis)
        ret += emscripten_binding_template.substitute(binding_name='testBinding', bindings=''.join(self.bindings))
--- a/modules/ml/src/knearest.cpp
+++ b/modules/ml/src/knearest.cpp
@ -140,13 +140,12 @@ public:
    String getModelName() const CV_OVERRIDE { return NAME_BRUTE_FORCE; }
    int getType() const CV_OVERRIDE { return ml::KNearest::BRUTE_FORCE; }

-    void findNearestCore( const Mat& _samples, int k0, const Range& range,
+    void findNearestCore( const Mat& _samples, int k, const Range& range,
                          Mat* results, Mat* neighbor_responses,
                          Mat* dists, float* presult ) const
    {
        int testidx, baseidx, i, j, d = samples.cols, nsamples = samples.rows;
        int testcount = range.end - range.start;
-        int k = std::min(k0, nsamples);

        AutoBuffer<float> buf(testcount*k*2);
        float* dbuf = buf.data();
@ -215,7 +214,7 @@ public:
                float* nr = neighbor_responses->ptr<float>(testidx + range.start);
                for( j = 0; j < k; j++ )
                    nr[j] = rbuf[testidx*k + j];
-                for( ; j < k0; j++ )
+                for( ; j < k; j++ )
                    nr[j] = 0.f;
            }

@ -224,7 +223,7 @@ public:
                float* dptr = dists->ptr<float>(testidx + range.start);
                for( j = 0; j < k; j++ )
                    dptr[j] = dbuf[testidx*k + j];
-                for( ; j < k0; j++ )
+                for( ; j < k; j++ )
                    dptr[j] = 0.f;
            }

@ -307,6 +306,7 @@ public:
    {
        float result = 0.f;
        CV_Assert( 0 < k );
+        k = std::min(k, samples.rows);

        Mat test_samples = _samples.getMat();
        CV_Assert( test_samples.type() == CV_32F && test_samples.cols == samples.cols );
@ -363,6 +363,7 @@ public:
    {
        float result = 0.f;
        CV_Assert( 0 < k );
+        k = std::min(k, samples.rows);

        Mat test_samples = _samples.getMat();
        CV_Assert( test_samples.type() == CV_32F && test_samples.cols == samples.cols );
--- a/modules/ml/test/test_emknearestkmeans.cpp
+++ b/modules/ml/test/test_emknearestkmeans.cpp
@ -702,4 +702,26 @@ TEST(ML_EM, accuracy) { CV_EMTest test; test.safe_run(); }
 TEST(ML_EM, save_load) { CV_EMTest_SaveLoad test; test.safe_run(); }
 TEST(ML_EM, classification) { CV_EMTest_Classification test; test.safe_run(); }

+TEST(ML_KNearest, regression_12347)
+{
+    Mat xTrainData = (Mat_<float>(5,2) << 1, 1.1, 1.1, 1, 2, 2, 2.1, 2, 2.1, 2.1);
+    Mat yTrainLabels = (Mat_<float>(5,1) << 1, 1, 2, 2, 2);
+    Ptr<KNearest> knn = KNearest::create();
+    knn->train(xTrainData, ml::ROW_SAMPLE, yTrainLabels);
+
+    Mat xTestData = (Mat_<float>(2,2) << 1.1, 1.1, 2, 2.2);
+    Mat zBestLabels, neighbours, dist;
+    // check output shapes:
+    int K = 16, Kexp = std::min(K, xTrainData.rows);
+    knn->findNearest(xTestData, K, zBestLabels, neighbours, dist);
+    EXPECT_EQ(xTestData.rows, zBestLabels.rows);
+    EXPECT_EQ(neighbours.cols, Kexp);
+    EXPECT_EQ(dist.cols, Kexp);
+    // see if the result is still correct:
+    K = 2;
+    knn->findNearest(xTestData, K, zBestLabels, neighbours, dist);
+    EXPECT_EQ(1, zBestLabels.at<float>(0,0));
+    EXPECT_EQ(2, zBestLabels.at<float>(1,0));
+}
+
 }} // namespace
--- a/modules/ts/include/opencv2/ts/ts_gtest.h
+++ b/modules/ts/include/opencv2/ts/ts_gtest.h
--- a/modules/ts/misc/run.py
+++ b/modules/ts/misc/run.py
@ -40,6 +40,9 @@ if __name__ == "__main__":
    parser.add_argument("--valgrind_supp", metavar="FILE", action='append', help="Path to valgrind suppression file (example: --valgrind_supp opencv/platforms/scripts/valgrind.supp)")
    parser.add_argument("--valgrind_opt", metavar="OPT", action="append", default=[], help="Add command line option to valgrind (example: --valgrind_opt=--leak-check=full)")

+    # QEMU
+    parser.add_argument("--qemu", default="", help="Specify qemu binary and base parameters")
+
    # Android
    parser.add_argument("--android", action="store_true", default=False, help="Android: force all tests to run on device")
    parser.add_argument("--android_sdk", metavar="PATH", help="Android: path to SDK to use adb and aapt tools")
--- a/modules/ts/misc/run_suite.py
+++ b/modules/ts/misc/run_suite.py
@ -77,7 +77,7 @@ class TestSuite(object):
            return False
        return os.access(fullpath, os.X_OK)

-    def wrapInValgrind(self, cmd=[]):
+    def wrapCommand(self, cmd, env):
        if self.options.valgrind:
            res = ['valgrind']
            supp = self.options.valgrind_supp or []
@ -89,6 +89,14 @@ class TestSuite(object):
            res.extend(self.options.valgrind_opt)
            has_gtest_filter = next((True for x in cmd if x.startswith('--gtest_filter=')), False)
            return res + cmd + ([longTestFilter(LONG_TESTS_DEBUG_VALGRIND)] if not has_gtest_filter else [])
+        elif self.options.qemu:
+            import shlex
+            res = shlex.split(self.options.qemu)
+            for (name, value) in [entry for entry in os.environ.items() if entry[0].startswith('OPENCV') and not entry[0] in env]:
+                res += ['-E', '"{}={}"'.format(name, value)]
+            for (name, value) in env.items():
+                res += ['-E', '"{}={}"'.format(name, value)]
+            return res + ['--'] + cmd
        return cmd

    def tryCommand(self, cmd, workingDir):
@ -125,7 +133,6 @@ class TestSuite(object):
        else:
            if isColorEnabled(args):
                args.append("--gtest_color=yes")
-            cmd = self.wrapInValgrind([exe] + args)
            env = {}
            if not self.options.valgrind and self.options.trace:
                env['OPENCV_TRACE'] = '1'
@ -133,6 +140,7 @@ class TestSuite(object):
                env['OPENCV_TRACE_SYNC_OPENCL'] = '1'
            tempDir = TempEnvDir('OPENCV_TEMP_PATH', "__opencv_temp.")
            tempDir.init()
+            cmd = self.wrapCommand([exe] + args, env)
            log.warning("Run: %s" % " ".join(cmd))
            ret = execute(cmd, cwd=workingDir, env=env)
            try:
--- a/modules/ts/src/ts.cpp
+++ b/modules/ts/src/ts.cpp
@ -721,6 +721,7 @@ void checkIppStatus()
    }
 }

+static bool checkTestData = false;
 bool skipUnstableTests = false;
 bool runBigDataTests = false;
 int testThreads = 0;
@ -733,6 +734,7 @@ void parseCustomOptions(int argc, char **argv)
        "{ test_threads       |-1       |the number of worker threads, if parallel execution is enabled}"
        "{ skip_unstable      |false    |skip unstable tests }"
        "{ test_bigdata       |false    |run BigData tests (>=2Gb) }"
+        "{ test_require_data  |false    |fail on missing non-required test data instead of skip}"
        "{ h   help           |false    |print help info                          }";

    cv::CommandLineParser parser(argc, argv, command_line_keys);
@ -756,6 +758,7 @@ void parseCustomOptions(int argc, char **argv)

    skipUnstableTests = parser.get<bool>("skip_unstable");
    runBigDataTests = parser.get<bool>("test_bigdata");
+    checkTestData = parser.get<bool>("test_require_data");
 }


@ -870,7 +873,7 @@ static std::string findData(const std::string& relative_path, bool required, boo
 #endif
 #endif
    const char* type = findDirectory ? "directory" : "data file";
-    if (required)
+    if (required || checkTestData)
        CV_Error(cv::Error::StsError, cv::format("OpenCV tests: Can't find required %s: %s", type, relative_path.c_str()));
    throw SkipTestException(cv::format("OpenCV tests: Can't find %s: %s", type, relative_path.c_str()));
 }
--- a/modules/ts/src/ts_gtest.cpp
+++ b/modules/ts/src/ts_gtest.cpp
--- a/modules/videoio/src/cap_msmf.cpp
+++ b/modules/videoio/src/cap_msmf.cpp
@ -616,7 +616,7 @@ class SourceReaderCB : public IMFSourceReaderCallback
 {
 public:
    SourceReaderCB() :
-        m_nRefCount(1), m_hEvent(CreateEvent(NULL, FALSE, FALSE, NULL)), m_bEOS(FALSE), m_hrStatus(S_OK), m_dwStreamIndex(0)
+        m_nRefCount(0), m_hEvent(CreateEvent(NULL, FALSE, FALSE, NULL)), m_bEOS(FALSE), m_hrStatus(S_OK), m_reader(NULL), m_dwStreamIndex(0)
    {
    }

@ -677,7 +677,7 @@ public:
    BOOL                m_bEOS;
    HRESULT             m_hrStatus;

-    _ComPtr<IMFSourceReader> m_reader;
+    IMFSourceReader *m_reader;
    DWORD m_dwStreamIndex;
    _ComPtr<IMFSample>  m_lastSample;
 };
@ -1140,7 +1140,7 @@ bool CvCapture_MSMF::grabFrame()
        if (!reader->m_reader)
        {
            // Initiate capturing with async callback
-            reader->m_reader = videoFileSource;
+            reader->m_reader = videoFileSource.Get();
            reader->m_dwStreamIndex = dwStreamIndex;
            if (FAILED(hr = videoFileSource->ReadSample(dwStreamIndex, 0, NULL, NULL, NULL, NULL)))
            {
--- a/Show More
+++ b/Show More