Merge remote-tracking branch 'upstream/3.4' into merge-3.4

This commit is contained in:
Alexander Alekhin 2019-10-24 18:17:40 +00:00
commit 055ffc0425
61 changed files with 1438 additions and 394 deletions

View File

@ -346,7 +346,7 @@ elseif(MIPS)
ocv_update(CPU_MSA_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_msa.cpp")
ocv_update(CPU_KNOWN_OPTIMIZATIONS "MSA")
ocv_update(CPU_MSA_FLAGS_ON "-mmsa")
set(CPU_BASELINE "MSA" CACHE STRING "${HELP_CPU_BASELINE}")
set(CPU_BASELINE "DETECT" CACHE STRING "${HELP_CPU_BASELINE}")
elseif(PPC64LE)
ocv_update(CPU_KNOWN_OPTIMIZATIONS "VSX;VSX3")
ocv_update(CPU_VSX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx.cpp")

View File

@ -133,7 +133,7 @@ message(STATUS "Found MKL ${MKL_VERSION_STR} at: ${MKL_ROOT_DIR}")
set(HAVE_MKL ON)
set(MKL_ROOT_DIR "${MKL_ROOT_DIR}" CACHE PATH "Path to MKL directory")
set(MKL_INCLUDE_DIRS "${MKL_INCLUDE_DIRS}" CACHE PATH "Path to MKL include directory")
set(MKL_LIBRARIES "${MKL_LIBRARIES}" CACHE STRING "MKL libarries")
set(MKL_LIBRARIES "${MKL_LIBRARIES}" CACHE STRING "MKL libraries")
if(UNIX AND NOT MKL_LIBRARIES_DONT_HACK)
#it's ugly but helps to avoid cyclic lib problem
set(MKL_LIBRARIES ${MKL_LIBRARIES} ${MKL_LIBRARIES} ${MKL_LIBRARIES} "-lpthread" "-lm" "-ldl")

View File

@ -1,4 +1,4 @@
include("${CMAKE_CURRENT_LIST_DIR}/OpenCV_WinRT.cmake")
include("${CMAKE_CURRENT_LIST_DIR}/OpenCV-WinRT.cmake")
# Adding additional using directory for WindowsPhone 8.0 to get Windows.winmd properly
if(WINRT_8_0)

View File

@ -1 +1 @@
include("${CMAKE_CURRENT_LIST_DIR}/OpenCV_WinRT.cmake")
include("${CMAKE_CURRENT_LIST_DIR}/OpenCV-WinRT.cmake")

View File

@ -27,7 +27,7 @@ src1.delete(); src2.delete(); dst.delete(); mask.delete();
Image Subtraction
--------------
You can subtract two images by OpenCV function, cv.subtract(). res = img1 - img2. Both images should be of same depth and type.
You can subtract two images by OpenCV function, cv.subtract(). res = img1 - img2. Both images should be of same depth and type. Note that when used with RGBA images, the alpha channel is also subtracted.
For example, consider below sample:
@code{.js}
@ -59,4 +59,4 @@ Try it
<iframe src="../../js_image_arithmetics_bitwise.html" width="100%"
onload="this.style.height=this.contentDocument.body.scrollHeight +'px';">
</iframe>
\endhtmlonly
\endhtmlonly

View File

@ -4,21 +4,21 @@ Gui Features in OpenCV {#tutorial_py_table_of_contents_gui}
- @subpage tutorial_py_image_display
Learn to load an
image, display it and save it back
image, display it, and save it back
- @subpage tutorial_py_video_display
Learn to play videos,
capture videos from Camera and write it as a video
capture videos from a camera, and write videos
- @subpage tutorial_py_drawing_functions
Learn to draw lines,
rectangles, ellipses, circles etc with OpenCV
rectangles, ellipses, circles, etc. with OpenCV
- @subpage tutorial_py_mouse_handling
Draw stuffs with your
Draw stuff with your
mouse
- @subpage tutorial_py_trackbar

View File

@ -4,19 +4,19 @@ Getting Started with Videos {#tutorial_py_video_display}
Goal
----
- Learn to read video, display video and save video.
- Learn to capture from Camera and display it.
- Learn to read video, display video, and save video.
- Learn to capture video from a camera and display it.
- You will learn about these functions: **cv.VideoCapture()**, **cv.VideoWriter()**
Capture Video from Camera
-------------------------
Often, we have to capture live stream with camera. OpenCV provides a very simple interface to this.
Let's capture a video from the camera (I am using the in-built webcam of my laptop), convert it into
Often, we have to capture live stream with a camera. OpenCV provides a very simple interface to do this.
Let's capture a video from the camera (I am using the built-in webcam on my laptop), convert it into
grayscale video and display it. Just a simple task to get started.
To capture a video, you need to create a **VideoCapture** object. Its argument can be either the
device index or the name of a video file. Device index is just the number to specify which camera.
device index or the name of a video file. A device index is just the number to specify which camera.
Normally one camera will be connected (as in my case). So I simply pass 0 (or -1). You can select
the second camera by passing 1 and so on. After that, you can capture frame-by-frame. But at the
end, don't forget to release the capture.
@ -46,16 +46,16 @@ while True:
# When everything done, release the capture
cap.release()
cv.destroyAllWindows()@endcode
`cap.read()` returns a bool (`True`/`False`). If frame is read correctly, it will be `True`. So you can
check end of the video by checking this return value.
`cap.read()` returns a bool (`True`/`False`). If the frame is read correctly, it will be `True`. So you can
check for the end of the video by checking this returned value.
Sometimes, cap may not have initialized the capture. In that case, this code shows error. You can
Sometimes, cap may not have initialized the capture. In that case, this code shows an error. You can
check whether it is initialized or not by the method **cap.isOpened()**. If it is `True`, OK.
Otherwise open it using **cap.open()**.
You can also access some of the features of this video using **cap.get(propId)** method where propId
is a number from 0 to 18. Each number denotes a property of the video (if it is applicable to that
video) and full details can be seen here: cv::VideoCapture::get().
video). Full details can be seen here: cv::VideoCapture::get().
Some of these values can be modified using **cap.set(propId, value)**. Value is the new value you
want.
@ -63,13 +63,13 @@ For example, I can check the frame width and height by `cap.get(cv.CAP_PROP_FRAM
640x480 by default. But I want to modify it to 320x240. Just use `ret = cap.set(cv.CAP_PROP_FRAME_WIDTH,320)` and
`ret = cap.set(cv.CAP_PROP_FRAME_HEIGHT,240)`.
@note If you are getting error, make sure camera is working fine using any other camera application
@note If you are getting an error, make sure your camera is working fine using any other camera application
(like Cheese in Linux).
Playing Video from file
-----------------------
It is same as capturing from Camera, just change camera index with video file name. Also while
Playing video from file is the same as capturing it from camera, just change the camera index to a video file name. Also while
displaying the frame, use an appropriate time for `cv.waitKey()`. If it is too small, the video will be very
fast, and if it is too high, the video will be slow (well, that is how you can display videos in slow
motion). 25 milliseconds will be OK in normal cases.
@ -96,23 +96,23 @@ cap.release()
cv.destroyAllWindows()
@endcode
@note Make sure proper versions of ffmpeg or gstreamer is installed. Sometimes, it is a headache to
work with Video Capture mostly due to wrong installation of ffmpeg/gstreamer.
@note Make sure a proper version of ffmpeg or gstreamer is installed. Sometimes it is a headache to
work with video capture, mostly due to wrong installation of ffmpeg/gstreamer.
Saving a Video
--------------
So we capture a video, process it frame-by-frame and we want to save that video. For images, it is
very simple, just use `cv.imwrite()`. Here a little more work is required.
So we capture a video and process it frame-by-frame, and we want to save that video. For images, it is
very simple: just use `cv.imwrite()`. Here, a little more work is required.
This time we create a **VideoWriter** object. We should specify the output file name (eg:
output.avi). Then we should specify the **FourCC** code (details in next paragraph). Then number of
frames per second (fps) and frame size should be passed. And last one is **isColor** flag. If it is
`True`, encoder expect color frame, otherwise it works with grayscale frame.
frames per second (fps) and frame size should be passed. And the last one is the **isColor** flag. If it is
`True`, the encoder expects a color frame; otherwise it works with grayscale frames.
[FourCC](http://en.wikipedia.org/wiki/FourCC) is a 4-byte code used to specify the video codec. The
list of available codes can be found in [fourcc.org](http://www.fourcc.org/codecs.php). It is
platform dependent. Following codecs works fine for me.
platform dependent. The following codecs work fine for me.
- In Fedora: DIVX, XVID, MJPG, X264, WMV1, WMV2. (XVID is preferable. MJPG results in a large
video file. X264 gives a very small video file.)
@ -122,7 +122,7 @@ platform dependent. Following codecs works fine for me.
FourCC code is passed as `cv.VideoWriter_fourcc('M','J','P','G')` or
`cv.VideoWriter_fourcc(*'MJPG')` for MJPG.
Below code capture from a Camera, flip every frame in vertical direction and saves it.
The below code captures from a camera, flips every frame in the vertical direction, and saves the video.
@code{.py}
import numpy as np
import cv2 as cv

View File

@ -216,30 +216,30 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
dptr0[0] = dptr0[size.width-1] = dptr1[0] = dptr1[size.width-1] = val0;
x = 1;
#if CV_SIMD128
#if CV_SIMD
{
v_int16x8 ftz = v_setall_s16((short) ftzero);
v_int16x8 ftz2 = v_setall_s16((short)(ftzero*2));
v_int16x8 z = v_setzero_s16();
v_int16 ftz = vx_setall_s16((short) ftzero);
v_int16 ftz2 = vx_setall_s16((short)(ftzero*2));
v_int16 z = vx_setzero_s16();
for(; x <= (size.width - 1) - 8; x += 8 )
for(; x <= (size.width - 1) - v_int16::nlanes; x += v_int16::nlanes)
{
v_int16x8 s00 = v_reinterpret_as_s16(v_load_expand(srow0 + x + 1));
v_int16x8 s01 = v_reinterpret_as_s16(v_load_expand(srow0 + x - 1));
v_int16x8 s10 = v_reinterpret_as_s16(v_load_expand(srow1 + x + 1));
v_int16x8 s11 = v_reinterpret_as_s16(v_load_expand(srow1 + x - 1));
v_int16x8 s20 = v_reinterpret_as_s16(v_load_expand(srow2 + x + 1));
v_int16x8 s21 = v_reinterpret_as_s16(v_load_expand(srow2 + x - 1));
v_int16x8 s30 = v_reinterpret_as_s16(v_load_expand(srow3 + x + 1));
v_int16x8 s31 = v_reinterpret_as_s16(v_load_expand(srow3 + x - 1));
v_int16 s00 = v_reinterpret_as_s16(vx_load_expand(srow0 + x + 1));
v_int16 s01 = v_reinterpret_as_s16(vx_load_expand(srow0 + x - 1));
v_int16 s10 = v_reinterpret_as_s16(vx_load_expand(srow1 + x + 1));
v_int16 s11 = v_reinterpret_as_s16(vx_load_expand(srow1 + x - 1));
v_int16 s20 = v_reinterpret_as_s16(vx_load_expand(srow2 + x + 1));
v_int16 s21 = v_reinterpret_as_s16(vx_load_expand(srow2 + x - 1));
v_int16 s30 = v_reinterpret_as_s16(vx_load_expand(srow3 + x + 1));
v_int16 s31 = v_reinterpret_as_s16(vx_load_expand(srow3 + x - 1));
v_int16x8 d0 = s00 - s01;
v_int16x8 d1 = s10 - s11;
v_int16x8 d2 = s20 - s21;
v_int16x8 d3 = s30 - s31;
v_int16 d0 = s00 - s01;
v_int16 d1 = s10 - s11;
v_int16 d2 = s20 - s21;
v_int16 d3 = s30 - s31;
v_uint16x8 v0 = v_reinterpret_as_u16(v_max(v_min(d0 + d1 + d1 + d2 + ftz, ftz2), z));
v_uint16x8 v1 = v_reinterpret_as_u16(v_max(v_min(d1 + d2 + d2 + d3 + ftz, ftz2), z));
v_uint16 v0 = v_reinterpret_as_u16(v_max(v_min(d0 + d1 + d1 + d2 + ftz, ftz2), z));
v_uint16 v1 = v_reinterpret_as_u16(v_max(v_min(d1 + d2 + d2 + d3 + ftz, ftz2), z));
v_pack_store(dptr0 + x, v0);
v_pack_store(dptr1 + x, v1);
@ -262,10 +262,10 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
{
uchar* dptr = dst.ptr<uchar>(y);
x = 0;
#if CV_SIMD128
#if CV_SIMD
{
v_uint8x16 val0_16 = v_setall_u8(val0);
for(; x <= size.width-16; x+=16 )
v_uint8 val0_16 = vx_setall_u8(val0);
for(; x <= size.width-v_uint8::nlanes; x+=v_uint8::nlanes)
v_store(dptr + x, val0_16);
}
#endif
@ -309,13 +309,13 @@ inline int dispDescale(int v1, int v2, int d)
return (int)(v1*256 + (d != 0 ? v2*256/d : 0)); // no need to add 127, this will be converted to float
}
#if CV_SIMD128
#if CV_SIMD
template <typename dType>
static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
Mat& disp, Mat& cost, StereoBMParams& state,
uchar* buf, int _dy0, int _dy1 )
{
const int ALIGN = 16;
const int ALIGN = CV_SIMD_WIDTH;
int x, y, d;
int wsz = state.SADWindowSize, wsz2 = wsz/2;
int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
@ -345,7 +345,9 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
const int TABSZ = 256;
uchar tab[TABSZ];
const v_int16x8 d0_8 = v_int16x8(0,1,2,3,4,5,6,7), dd_8 = v_setall_s16(8);
short v_seq[v_int16::nlanes];
for (short i = 0; i < v_int16::nlanes; ++i)
v_seq[i] = i;
sad = (ushort*)alignPtr(buf + sizeof(sad[0]), ALIGN);
hsad0 = (ushort*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
@ -368,20 +370,26 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep )
{
int lval = lptr[0];
v_uint8x16 lv = v_setall_u8((uchar)lval);
for( d = 0; d < ndisp; d += 16 )
v_uint8 lv = vx_setall_u8((uchar)lval);
for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
{
v_uint8x16 rv = v_load(rptr + d);
v_uint16x8 hsad_l = v_load(hsad + d);
v_uint16x8 hsad_h = v_load(hsad + d + 8);
v_uint8x16 diff = v_absdiff(lv, rv);
v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
v_store(cbuf + d, diff);
v_uint16x8 diff0, diff1;
v_expand(diff, diff0, diff1);
hsad_l += diff0;
hsad_h += diff1;
v_store(hsad + d, hsad_l);
v_store(hsad + d + 8, hsad_h);
v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
v_store(hsad + d + v_uint16::nlanes, vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff));
}
if( d <= ndisp - v_uint16::nlanes )
{
v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
v_store_low(cbuf + d, diff);
v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
d += v_uint16::nlanes;
}
for( ; d < ndisp; d++ )
{
int diff = abs(lval - rptr[d]);
cbuf[d] = (uchar)diff;
hsad[d] += (ushort)diff;
}
htext[y] += tab[lval];
}
@ -412,24 +420,27 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep )
{
int lval = lptr[0];
v_uint8x16 lv = v_setall_u8((uchar)lval);
for( d = 0; d < ndisp; d += 16 )
v_uint8 lv = vx_setall_u8((uchar)lval);
for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
{
v_uint8x16 rv = v_load(rptr + d);
v_uint16x8 hsad_l = v_load(hsad + d);
v_uint16x8 hsad_h = v_load(hsad + d + 8);
v_uint8x16 cbs = v_load(cbuf_sub + d);
v_uint8x16 diff = v_absdiff(lv, rv);
v_int16x8 diff_l, diff_h, cbs_l, cbs_h;
v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
v_int8 cbs = v_reinterpret_as_s8(vx_load(cbuf_sub + d));
v_store(cbuf + d, diff);
v_expand(v_reinterpret_as_s8(diff), diff_l, diff_h);
v_expand(v_reinterpret_as_s8(cbs), cbs_l, cbs_h);
diff_l -= cbs_l;
diff_h -= cbs_h;
hsad_h = v_reinterpret_as_u16(v_reinterpret_as_s16(hsad_h) + diff_h);
hsad_l = v_reinterpret_as_u16(v_reinterpret_as_s16(hsad_l) + diff_l);
v_store(hsad + d, hsad_l);
v_store(hsad + d + 8, hsad_h);
v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - v_expand_low(cbs)));
v_store(hsad + d + v_uint16::nlanes, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff)) - v_expand_high(cbs)));
}
if( d <= ndisp - v_uint16::nlanes)
{
v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
v_store_low(cbuf + d, diff);
v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - vx_load_expand((schar*)cbuf_sub + d)));
d += v_uint16::nlanes;
}
for( ; d < ndisp; d++ )
{
int diff = abs(lval - rptr[d]);
cbuf[d] = (uchar)diff;
hsad[d] = hsad[d] + (ushort)diff - cbuf_sub[d];
}
htext[y] += tab[lval] - tab[lptr_sub[0]];
}
@ -446,17 +457,25 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
hsad = hsad0 + (1 - dy0)*ndisp;
for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
for( d = 0; d <= ndisp-16; d += 16 )
{
for( d = 0; d <= ndisp-2*v_uint16::nlanes; d += 2*v_uint16::nlanes )
{
v_uint16x8 s0 = v_load(sad + d);
v_uint16x8 s1 = v_load(sad + d + 8);
v_uint16x8 t0 = v_load(hsad + d);
v_uint16x8 t1 = v_load(hsad + d + 8);
s0 = s0 + t0;
s1 = s1 + t1;
v_store(sad + d, s0);
v_store(sad + d + 8, s1);
v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
v_store(sad + d + v_uint16::nlanes, vx_load(sad + d + v_uint16::nlanes) + vx_load(hsad + d + v_uint16::nlanes));
}
if( d <= ndisp-v_uint16::nlanes )
{
v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
d += v_uint16::nlanes;
}
if( d <= ndisp-v_uint16::nlanes/2 )
{
v_store_low(sad + d, vx_load_low(sad + d) + vx_load_low(hsad + d));
d += v_uint16::nlanes/2;
}
for( ; d < ndisp; d++ )
sad[d] = sad[d] + hsad[d];
}
int tsum = 0;
for( y = -wsz2-1; y < wsz2; y++ )
tsum += htext[y];
@ -467,38 +486,41 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
int minsad = INT_MAX, mind = -1;
hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
v_int16x8 minsad8 = v_setall_s16(SHRT_MAX);
v_int16x8 mind8 = v_setall_s16(0), d8 = d0_8;
v_int16 minsad8 = vx_setall_s16(SHRT_MAX);
v_int16 mind8 = vx_setall_s16(0);
for( d = 0; d < ndisp; d += 16 )
for( d = 0; d <= ndisp - 2*v_int16::nlanes; d += 2*v_int16::nlanes )
{
v_int16x8 u0 = v_reinterpret_as_s16(v_load(hsad_sub + d));
v_int16x8 u1 = v_reinterpret_as_s16(v_load(hsad + d));
v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
v_store(sad + d, v_reinterpret_as_u16(sad8));
mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
minsad8 = v_min(minsad8, sad8);
v_int16x8 v0 = v_reinterpret_as_s16(v_load(hsad_sub + d + 8));
v_int16x8 v1 = v_reinterpret_as_s16(v_load(hsad + d + 8));
v_int16x8 usad8 = v_reinterpret_as_s16(v_load(sad + d));
v_int16x8 vsad8 = v_reinterpret_as_s16(v_load(sad + d + 8));
u1 -= u0;
v1 -= v0;
usad8 += u1;
vsad8 += v1;
v_int16x8 mask = minsad8 > usad8;
minsad8 = v_min(minsad8, usad8);
mind8 = v_max(mind8, (mask& d8));
v_store(sad + d, v_reinterpret_as_u16(usad8));
v_store(sad + d + 8, v_reinterpret_as_u16(vsad8));
mask = minsad8 > vsad8;
minsad8 = v_min(minsad8, vsad8);
d8 = d8 + dd_8;
mind8 = v_max(mind8, (mask & d8));
d8 = d8 + dd_8;
sad8 = v_reinterpret_as_s16(vx_load(hsad + d + v_int16::nlanes)) - v_reinterpret_as_s16(vx_load(hsad_sub + d + v_int16::nlanes)) + v_reinterpret_as_s16(vx_load(sad + d + v_int16::nlanes));
v_store(sad + d + v_int16::nlanes, v_reinterpret_as_u16(sad8));
mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d+v_int16::nlanes));
minsad8 = v_min(minsad8, sad8);
}
if( d <= ndisp - v_int16::nlanes )
{
v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
v_store(sad + d, v_reinterpret_as_u16(sad8));
mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
minsad8 = v_min(minsad8, sad8);
d += v_int16::nlanes;
}
minsad = v_reduce_min(minsad8);
v_int16 v_mask = (vx_setall_s16((short)minsad) == minsad8);
mind = v_reduce_min(((mind8+vx_load(v_seq)) & v_mask) | (vx_setall_s16(SHRT_MAX) & ~v_mask));
for( ; d < ndisp; d++ )
{
int sad8 = (int)(hsad[d]) - hsad_sub[d] + sad[d];
sad[d] = (ushort)sad8;
if(minsad > sad8)
{
mind = d;
minsad = sad8;
}
}
tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
@ -508,41 +530,45 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
continue;
}
ushort CV_DECL_ALIGNED(16) minsad_buf[8], mind_buf[8];
v_store(minsad_buf, v_reinterpret_as_u16(minsad8));
v_store(mind_buf, v_reinterpret_as_u16(mind8));
for( d = 0; d < 8; d++ )
if(minsad > (int)minsad_buf[d] || (minsad == (int)minsad_buf[d] && mind > mind_buf[d]))
{
minsad = minsad_buf[d];
mind = mind_buf[d];
}
if( uniquenessRatio > 0 )
{
int thresh = minsad + (minsad * uniquenessRatio/100);
v_int32x4 thresh4 = v_setall_s32(thresh + 1);
v_int32x4 d1 = v_setall_s32(mind-1), d2 = v_setall_s32(mind+1);
v_int32x4 dd_4 = v_setall_s32(4);
v_int32x4 d4 = v_int32x4(0,1,2,3);
v_int32x4 mask4;
v_int32 thresh4 = vx_setall_s32(thresh + 1);
v_int32 d1 = vx_setall_s32(mind-1), d2 = vx_setall_s32(mind+1);
v_int32 dd_4 = vx_setall_s32(v_int32::nlanes);
v_int32 d4 = vx_load_expand(v_seq);
for( d = 0; d < ndisp; d += 8 )
for( d = 0; d <= ndisp - v_int16::nlanes; d += v_int16::nlanes )
{
v_int16x8 sad8 = v_reinterpret_as_s16(v_load(sad + d));
v_int32x4 sad4_l, sad4_h;
v_expand(sad8, sad4_l, sad4_h);
mask4 = thresh4 > sad4_l;
mask4 = mask4 & ((d1 > d4) | (d4 > d2));
if( v_check_any(mask4) )
v_int32 sad4_l, sad4_h;
v_expand(v_reinterpret_as_s16(vx_load(sad + d)), sad4_l, sad4_h);
if( v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))) )
break;
d4 += dd_4;
mask4 = thresh4 > sad4_h;
mask4 = mask4 & ((d1 > d4) | (d4 > d2));
if( v_check_any(mask4) )
if( v_check_any((thresh4 > sad4_h) & ((d1 > d4) | (d4 > d2))) )
break;
d4 += dd_4;
}
if( d <= ndisp - v_int16::nlanes )
{
dptr[y*dstep] = FILTERED;
continue;
}
if( d <= ndisp - v_int32::nlanes )
{
v_int32 sad4_l = vx_load_expand((short*)sad + d);
if (v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))))
{
dptr[y*dstep] = FILTERED;
continue;
}
d += v_int16::nlanes;
}
for( ; d < ndisp; d++ )
{
if( (thresh + 1) > sad[d] && ((mind - 1) > d || d > (mind + 1)) )
break;
}
if( d < ndisp )
{
dptr[y*dstep] = FILTERED;
@ -571,7 +597,7 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
uchar* buf, int _dy0, int _dy1 )
{
const int ALIGN = 16;
const int ALIGN = CV_SIMD_WIDTH;
int x, y, d;
int wsz = state.SADWindowSize, wsz2 = wsz/2;
int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
@ -587,12 +613,6 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
const int disp_shift = dispShiftTemplate<mType>::value;
mType FILTERED = (mType)((mindisp - 1) << disp_shift);
#if CV_SIMD128
{
CV_Assert (ndisp % 8 == 0);
}
#endif
int *sad, *hsad0, *hsad, *hsad_sub, *htext;
uchar *cbuf0, *cbuf;
const uchar* lptr0 = left.ptr() + lofs;
@ -607,6 +627,13 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
const int TABSZ = 256;
uchar tab[TABSZ];
#if CV_SIMD
int v_seq[v_int32::nlanes];
for (int i = 0; i < v_int32::nlanes; ++i)
v_seq[i] = i;
v_int32 d0_4 = vx_load(v_seq), dd_4 = vx_setall_s32(v_int32::nlanes);
#endif
sad = (int*)alignPtr(buf + sizeof(sad[0]), ALIGN);
hsad0 = (int*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN);
@ -628,22 +655,22 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
{
int lval = lptr[0];
d = 0;
#if CV_SIMD128
#if CV_SIMD
{
v_uint8x16 lv = v_setall_u8((uchar)lval);
v_uint8 lv = vx_setall_u8((uchar)lval);
for( ; d <= ndisp - 16; d += 16 )
for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
{
v_uint8x16 rv = v_load(rptr + d);
v_int32x4 hsad_0 = v_load(hsad + d);
v_int32x4 hsad_1 = v_load(hsad + d + 4);
v_int32x4 hsad_2 = v_load(hsad + d + 8);
v_int32x4 hsad_3 = v_load(hsad + d + 12);
v_uint8x16 diff = v_absdiff(lv, rv);
v_uint8 rv = vx_load(rptr + d);
v_int32 hsad_0 = vx_load(hsad + d);
v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
v_uint8 diff = v_absdiff(lv, rv);
v_store(cbuf + d, diff);
v_uint16x8 diff0, diff1;
v_uint32x4 diff00, diff01, diff10, diff11;
v_uint16 diff0, diff1;
v_uint32 diff00, diff01, diff10, diff11;
v_expand(diff, diff0, diff1);
v_expand(diff0, diff00, diff01);
v_expand(diff1, diff10, diff11);
@ -654,9 +681,9 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
hsad_3 += v_reinterpret_as_s32(diff11);
v_store(hsad + d, hsad_0);
v_store(hsad + d + 4, hsad_1);
v_store(hsad + d + 8, hsad_2);
v_store(hsad + d + 12, hsad_3);
v_store(hsad + d + v_int32::nlanes, hsad_1);
v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
}
}
#endif
@ -696,22 +723,22 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
{
int lval = lptr[0];
d = 0;
#if CV_SIMD128
#if CV_SIMD
{
v_uint8x16 lv = v_setall_u8((uchar)lval);
for( ; d <= ndisp - 16; d += 16 )
v_uint8 lv = vx_setall_u8((uchar)lval);
for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
{
v_uint8x16 rv = v_load(rptr + d);
v_int32x4 hsad_0 = v_load(hsad + d);
v_int32x4 hsad_1 = v_load(hsad + d + 4);
v_int32x4 hsad_2 = v_load(hsad + d + 8);
v_int32x4 hsad_3 = v_load(hsad + d + 12);
v_uint8x16 cbs = v_load(cbuf_sub + d);
v_uint8x16 diff = v_absdiff(lv, rv);
v_uint8 rv = vx_load(rptr + d);
v_int32 hsad_0 = vx_load(hsad + d);
v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
v_uint8 cbs = vx_load(cbuf_sub + d);
v_uint8 diff = v_absdiff(lv, rv);
v_store(cbuf + d, diff);
v_uint16x8 diff0, diff1, cbs0, cbs1;
v_int32x4 diff00, diff01, diff10, diff11, cbs00, cbs01, cbs10, cbs11;
v_uint16 diff0, diff1, cbs0, cbs1;
v_int32 diff00, diff01, diff10, diff11, cbs00, cbs01, cbs10, cbs11;
v_expand(diff, diff0, diff1);
v_expand(cbs, cbs0, cbs1);
v_expand(v_reinterpret_as_s16(diff0), diff00, diff01);
@ -719,19 +746,19 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
v_expand(v_reinterpret_as_s16(cbs0), cbs00, cbs01);
v_expand(v_reinterpret_as_s16(cbs1), cbs10, cbs11);
v_int32x4 diff_0 = diff00 - cbs00;
v_int32x4 diff_1 = diff01 - cbs01;
v_int32x4 diff_2 = diff10 - cbs10;
v_int32x4 diff_3 = diff11 - cbs11;
v_int32 diff_0 = diff00 - cbs00;
v_int32 diff_1 = diff01 - cbs01;
v_int32 diff_2 = diff10 - cbs10;
v_int32 diff_3 = diff11 - cbs11;
hsad_0 += diff_0;
hsad_1 += diff_1;
hsad_2 += diff_2;
hsad_3 += diff_3;
v_store(hsad + d, hsad_0);
v_store(hsad + d + 4, hsad_1);
v_store(hsad + d + 8, hsad_2);
v_store(hsad + d + 12, hsad_3);
v_store(hsad + d + v_int32::nlanes, hsad_1);
v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
}
}
#endif
@ -758,18 +785,18 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
{
d = 0;
#if CV_SIMD128
#if CV_SIMD
{
for( d = 0; d <= ndisp-8; d += 8 )
for( d = 0; d <= ndisp-2*v_int32::nlanes; d += 2*v_int32::nlanes )
{
v_int32x4 s0 = v_load(sad + d);
v_int32x4 s1 = v_load(sad + d + 4);
v_int32x4 t0 = v_load(hsad + d);
v_int32x4 t1 = v_load(hsad + d + 4);
v_int32 s0 = vx_load(sad + d);
v_int32 s1 = vx_load(sad + d + v_int32::nlanes);
v_int32 t0 = vx_load(hsad + d);
v_int32 t1 = vx_load(hsad + d + v_int32::nlanes);
s0 += t0;
s1 += t1;
v_store(sad + d, s0);
v_store(sad + d + 4, s1);
v_store(sad + d + v_int32::nlanes, s1);
}
}
#endif
@ -787,50 +814,31 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
d = 0;
#if CV_SIMD128
#if CV_SIMD
{
v_int32x4 d0_4 = v_int32x4(0, 1, 2, 3);
v_int32x4 dd_4 = v_setall_s32(4);
v_int32x4 minsad4 = v_setall_s32(INT_MAX);
v_int32x4 mind4 = v_setall_s32(0), d4 = d0_4;
v_int32 minsad4 = vx_setall_s32(INT_MAX);
v_int32 mind4 = vx_setall_s32(0), d4 = d0_4;
for( ; d <= ndisp - 8; d += 8 )
for( ; d <= ndisp - 2*v_int32::nlanes; d += 2*v_int32::nlanes )
{
v_int32x4 u0 = v_load(hsad_sub + d);
v_int32x4 u1 = v_load(hsad + d);
v_int32x4 v0 = v_load(hsad_sub + d + 4);
v_int32x4 v1 = v_load(hsad + d + 4);
v_int32x4 usad4 = v_load(sad + d);
v_int32x4 vsad4 = v_load(sad + d + 4);
u1 -= u0;
v1 -= v0;
usad4 += u1;
vsad4 += v1;
v_store(sad + d, usad4);
v_store(sad + d + 4, vsad4);
v_int32x4 mask = minsad4 > usad4;
minsad4 = v_min(minsad4, usad4);
mind4 = v_select(mask, d4, mind4);
v_int32 sad4 = vx_load(sad + d) + vx_load(hsad + d) - vx_load(hsad_sub + d);
v_store(sad + d, sad4);
mind4 = v_select(minsad4 > sad4, d4, mind4);
minsad4 = v_min(minsad4, sad4);
d4 += dd_4;
mask = minsad4 > vsad4;
minsad4 = v_min(minsad4, vsad4);
mind4 = v_select(mask, d4, mind4);
sad4 = vx_load(sad + d + v_int32::nlanes) + vx_load(hsad + d + v_int32::nlanes) - vx_load(hsad_sub + d + v_int32::nlanes);
v_store(sad + d + v_int32::nlanes, sad4);
mind4 = v_select(minsad4 > sad4, d4, mind4);
minsad4 = v_min(minsad4, sad4);
d4 += dd_4;
}
int CV_DECL_ALIGNED(16) minsad_buf[4], mind_buf[4];
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) minsad_buf[v_int32::nlanes], mind_buf[v_int32::nlanes];
v_store(minsad_buf, minsad4);
v_store(mind_buf, mind4);
if(minsad_buf[0] < minsad || (minsad == minsad_buf[0] && mind_buf[0] < mind)) { minsad = minsad_buf[0]; mind = mind_buf[0]; }
if(minsad_buf[1] < minsad || (minsad == minsad_buf[1] && mind_buf[1] < mind)) { minsad = minsad_buf[1]; mind = mind_buf[1]; }
if(minsad_buf[2] < minsad || (minsad == minsad_buf[2] && mind_buf[2] < mind)) { minsad = minsad_buf[2]; mind = mind_buf[2]; }
if(minsad_buf[3] < minsad || (minsad == minsad_buf[3] && mind_buf[3] < mind)) { minsad = minsad_buf[3]; mind = mind_buf[3]; }
for (int i = 0; i < v_int32::nlanes; ++i)
if(minsad_buf[i] < minsad || (minsad == minsad_buf[i] && mind_buf[i] < mind)) { minsad = minsad_buf[i]; mind = mind_buf[i]; }
}
#endif
for( ; d < ndisp; d++ )
@ -1027,7 +1035,7 @@ struct FindStereoCorrespInvoker : public ParallelLoopBody
Mat disp_i = disp->rowRange(row0, row1);
Mat cost_i = state->disp12MaxDiff >= 0 ? cost->rowRange(row0, row1) : Mat();
#if CV_SIMD128
#if CV_SIMD
if (useShorts)
{
if( disp_i.type() == CV_16S)

View File

@ -1012,6 +1012,54 @@ OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float32x8, _mm256_castsi256_ps
OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float64x4, _mm256_castsi256_pd)
OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd)
/** Reverse **/
inline v_uint8x32 v_reverse(const v_uint8x32 &a)
{
static const __m256i perm = _mm256_setr_epi8(
15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__m256i vec = _mm256_shuffle_epi8(a.val, perm);
return v_uint8x32(_mm256_permute2x128_si256(vec, vec, 1));
}
inline v_int8x32 v_reverse(const v_int8x32 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
inline v_uint16x16 v_reverse(const v_uint16x16 &a)
{
static const __m256i perm = _mm256_setr_epi8(
14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
__m256i vec = _mm256_shuffle_epi8(a.val, perm);
return v_uint16x16(_mm256_permute2x128_si256(vec, vec, 1));
}
inline v_int16x16 v_reverse(const v_int16x16 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
inline v_uint32x8 v_reverse(const v_uint32x8 &a)
{
static const __m256i perm = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm));
}
inline v_int32x8 v_reverse(const v_int32x8 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_float32x8 v_reverse(const v_float32x8 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_uint64x4 v_reverse(const v_uint64x4 &a)
{
return v_uint64x4(_mm256_permute4x64_epi64(a.val, _MM_SHUFFLE(0, 1, 2, 3)));
}
inline v_int64x4 v_reverse(const v_int64x4 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
inline v_float64x4 v_reverse(const v_float64x4 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
////////// Reduce and mask /////////
/** Reduce **/

View File

@ -1068,6 +1068,79 @@ OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int64x8, epi64)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float32x16, ps)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float64x8, pd)
/** Reverse **/
inline v_uint8x64 v_reverse(const v_uint8x64 &a)
{
#if CV_AVX_512VBMI
static const __m512i perm = _mm512_set_epi32(
0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f,
0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f,
0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f);
return v_uint8x64(_mm512_permutexvar_epi8(perm, a.val));
#else
static const __m512i shuf = _mm512_set_epi32(
0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
__m512i vec = _mm512_shuffle_epi8(a.val, shuf);
return v_uint8x64(_mm512_permutexvar_epi64(perm, vec));
#endif
}
inline v_int8x64 v_reverse(const v_int8x64 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
inline v_uint16x32 v_reverse(const v_uint16x32 &a)
{
#if CV_AVX_512VBMI
static const __m512i perm = _mm512_set_epi32(
0x00000001, 0x00020003, 0x00040005, 0x00060007,
0x00080009, 0x000a000b, 0x000c000d, 0x000e000f,
0x00100011, 0x00120013, 0x00140015, 0x00160017,
0x00180019, 0x001a001b, 0x001c001d, 0x001e001f);
return v_uint16x32(_mm512_permutexvar_epi16(perm, a.val));
#else
static const __m512i shuf = _mm512_set_epi32(
0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
__m512i vec = _mm512_shuffle_epi8(a.val, shuf);
return v_uint16x32(_mm512_permutexvar_epi64(perm, vec));
#endif
}
inline v_int16x32 v_reverse(const v_int16x32 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
inline v_uint32x16 v_reverse(const v_uint32x16 &a)
{
static const __m512i perm = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,14, 15);
return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
}
inline v_int32x16 v_reverse(const v_int32x16 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_float32x16 v_reverse(const v_float32x16 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_uint64x8 v_reverse(const v_uint64x8 &a)
{
static const __m512i perm = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
return v_uint64x8(_mm512_permutexvar_epi64(perm, a.val));
}
inline v_int64x8 v_reverse(const v_int64x8 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
inline v_float64x8 v_reverse(const v_float64x8 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
////////// Reduce /////////
/** Reduce **/

View File

@ -112,6 +112,7 @@ These operations allow to reorder or recombine elements in one or multiple vecto
- Pack: @ref v_pack, @ref v_pack_u, @ref v_pack_b, @ref v_rshr_pack, @ref v_rshr_pack_u,
@ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
- Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
- Reverse: @ref v_reverse
- Extract: @ref v_extract
@ -215,6 +216,7 @@ Regular integers:
|cvt_flt32 | | | | | | x |
|cvt_flt64 | | | | | | x |
|transpose4x4 | | | | | x | x |
|reverse | x | x | x | x | x | x |
Big integers:
@ -224,6 +226,7 @@ Big integers:
|add, sub | x | x |
|shift | x | x |
|logical | x | x |
|reverse | x | x |
|extract | x | x |
|rotate (lanes) | x | x |
|cvt_flt64 | | x |
@ -250,6 +253,7 @@ Floating point:
|transpose4x4 | x | |
|extract | x | x |
|rotate (lanes) | x | x |
|reverse | x | x |
@{ */
@ -1724,6 +1728,23 @@ inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
}
}
/** @brief Vector reverse order
Reverse the order of the vector
Scheme:
@code
REG {A1 ... An} ==> REG {An ... A1}
@endcode
For all types. */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_reverse(const v_reg<_Tp, n>& a)
{
v_reg<_Tp, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = a.s[n-i-1];
return c;
}
/** @brief Vector extract
Scheme:

View File

@ -906,6 +906,57 @@ OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float32x4, float, f32)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float64x2, double, f64)
/** Reverse **/
inline v_uint8x16 v_reverse(const v_uint8x16 &a)
{
v_uint8x16 c = v_uint8x16((v16u8)__builtin_msa_vshf_b((v16i8)((v2i64){0x08090A0B0C0D0E0F, 0x0001020304050607}), msa_dupq_n_s8(0), (v16i8)a.val));
return c;
}
inline v_int8x16 v_reverse(const v_int8x16 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
inline v_uint16x8 v_reverse(const v_uint16x8 &a)
{
v_uint16x8 c = v_uint16x8((v8u16)__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000500060007, 0x0000000100020003}), msa_dupq_n_s16(0), (v8i16)a.val));
return c;
}
inline v_int16x8 v_reverse(const v_int16x8 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
inline v_uint32x4 v_reverse(const v_uint32x4 &a)
{
v_uint32x4 c;
c.val[0] = a.val[3];
c.val[1] = a.val[2];
c.val[2] = a.val[1];
c.val[3] = a.val[0];
return c;
}
inline v_int32x4 v_reverse(const v_int32x4 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_float32x4 v_reverse(const v_float32x4 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_uint64x2 v_reverse(const v_uint64x2 &a)
{
v_uint64x2 c;
c.val[0] = a.val[1];
c.val[1] = a.val[0];
return c;
}
inline v_int64x2 v_reverse(const v_int64x2 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
inline v_float64x2 v_reverse(const v_float64x2 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(func, cfunc) \
inline unsigned short v_reduce_##func(const v_uint16x8& a) \
{ \

View File

@ -1585,6 +1585,52 @@ OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
#endif
inline v_uint8x16 v_reverse(const v_uint8x16 &a)
{
uint8x16_t vec = vrev64q_u8(a.val);
return v_uint8x16(vextq_u8(vec, vec, 8));
}
inline v_int8x16 v_reverse(const v_int8x16 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
inline v_uint16x8 v_reverse(const v_uint16x8 &a)
{
uint16x8_t vec = vrev64q_u16(a.val);
return v_uint16x8(vextq_u16(vec, vec, 4));
}
inline v_int16x8 v_reverse(const v_int16x8 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
inline v_uint32x4 v_reverse(const v_uint32x4 &a)
{
uint32x4_t vec = vrev64q_u32(a.val);
return v_uint32x4(vextq_u32(vec, vec, 2));
}
inline v_int32x4 v_reverse(const v_int32x4 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_float32x4 v_reverse(const v_float32x4 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_uint64x2 v_reverse(const v_uint64x2 &a)
{
uint64x2_t vec = a.val;
uint64x1_t vec_lo = vget_low_u64(vec);
uint64x1_t vec_hi = vget_high_u64(vec);
return v_uint64x2(vcombine_u64(vec_hi, vec_lo));
}
inline v_int64x2 v_reverse(const v_int64x2 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
#if CV_SIMD128_64F
inline v_float64x2 v_reverse(const v_float64x2 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
#endif
#define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
template <int s> \
inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \

View File

@ -1220,14 +1220,23 @@ inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)
#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec, cast) \
#if CV_SSE4_1
#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \
{ return _Tpvec(_mm_cmpeq_epi64(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); }
{ return ~(a == b); }
#else
#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ __m128i cmp = _mm_cmpeq_epi32(a.val, b.val); \
return _Tpvec(_mm_and_si128(cmp, _mm_shuffle_epi32(cmp, _MM_SHUFFLE(2, 3, 0, 1)))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }
#endif
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64)
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64)
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2)
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2)
inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return v_float32x4(_mm_cmpord_ps(a.val, a.val)); }
@ -1914,6 +1923,59 @@ OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
inline v_uint8x16 v_reverse(const v_uint8x16 &a)
{
#if CV_SSSE3
static const __m128i perm = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
return v_uint8x16(_mm_shuffle_epi8(a.val, perm));
#else
uchar CV_DECL_ALIGNED(32) d[16];
v_store_aligned(d, a);
return v_uint8x16(d[15], d[14], d[13], d[12], d[11], d[10], d[9], d[8], d[7], d[6], d[5], d[4], d[3], d[2], d[1], d[0]);
#endif
}
inline v_int8x16 v_reverse(const v_int8x16 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
inline v_uint16x8 v_reverse(const v_uint16x8 &a)
{
#if CV_SSSE3
static const __m128i perm = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
return v_uint16x8(_mm_shuffle_epi8(a.val, perm));
#else
__m128i r = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3));
r = _mm_shufflelo_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
r = _mm_shufflehi_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
return v_uint16x8(r);
#endif
}
inline v_int16x8 v_reverse(const v_int16x8 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
inline v_uint32x4 v_reverse(const v_uint32x4 &a)
{
return v_uint32x4(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3)));
}
inline v_int32x4 v_reverse(const v_int32x4 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_float32x4 v_reverse(const v_float32x4 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_uint64x2 v_reverse(const v_uint64x2 &a)
{
return v_uint64x2(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(1, 0, 3, 2)));
}
inline v_int64x2 v_reverse(const v_int64x2 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
inline v_float64x2 v_reverse(const v_float64x2 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
template<int s, typename _Tpvec>
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
{

View File

@ -678,6 +678,53 @@ OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_float64x2)
OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_uint64x2)
OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_int64x2)
/* Reverse */
inline v_uint8x16 v_reverse(const v_uint8x16 &a)
{
static const vec_uchar16 perm = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
vec_uchar16 vec = (vec_uchar16)a.val;
return v_uint8x16(vec_perm(vec, vec, perm));
}
inline v_int8x16 v_reverse(const v_int8x16 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
inline v_uint16x8 v_reverse(const v_uint16x8 &a)
{
static const vec_uchar16 perm = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
vec_uchar16 vec = (vec_uchar16)a.val;
return v_reinterpret_as_u16(v_uint8x16(vec_perm(vec, vec, perm)));
}
inline v_int16x8 v_reverse(const v_int16x8 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
inline v_uint32x4 v_reverse(const v_uint32x4 &a)
{
static const vec_uchar16 perm = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
vec_uchar16 vec = (vec_uchar16)a.val;
return v_reinterpret_as_u32(v_uint8x16(vec_perm(vec, vec, perm)));
}
inline v_int32x4 v_reverse(const v_int32x4 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_float32x4 v_reverse(const v_float32x4 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_uint64x2 v_reverse(const v_uint64x2 &a)
{
static const vec_uchar16 perm = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
vec_uchar16 vec = (vec_uchar16)a.val;
return v_reinterpret_as_u64(v_uint8x16(vec_perm(vec, vec, perm)));
}
inline v_int64x2 v_reverse(const v_int64x2 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
inline v_float64x2 v_reverse(const v_float64x2 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
/* Extract */
template<int s, typename _Tpvec>
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)

View File

@ -21,6 +21,18 @@ namespace cv
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) < (1038046)
// handle renames: https://github.com/emscripten-core/emscripten/pull/9440 (https://github.com/emscripten-core/emscripten/commit/755d5b46cb84d0aa120c10981b11d05646c29673)
#define wasm_i32x4_trunc_saturate_f32x4 wasm_trunc_saturate_i32x4_f32x4
#define wasm_u32x4_trunc_saturate_f32x4 wasm_trunc_saturate_u32x4_f32x4
#define wasm_i64x2_trunc_saturate_f64x2 wasm_trunc_saturate_i64x2_f64x2
#define wasm_u64x2_trunc_saturate_f64x2 wasm_trunc_saturate_u64x2_f64x2
#define wasm_f32x4_convert_i32x4 wasm_convert_f32x4_i32x4
#define wasm_f32x4_convert_u32x4 wasm_convert_f32x4_u32x4
#define wasm_f64x2_convert_i64x2 wasm_convert_f64x2_i64x2
#define wasm_f64x2_convert_u64x2 wasm_convert_f64x2_u64x2
#endif // COMPATIBILITY: <1.38.46
///////// Types ///////////
struct v_uint8x16
@ -3111,6 +3123,38 @@ OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float32x4, float)
OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float64x2, double)
/** Reverse **/
inline v_uint8x16 v_reverse(const v_uint8x16 &a)
{ return v_uint8x16(wasm_v8x16_shuffle(a.val, a.val, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); }
inline v_int8x16 v_reverse(const v_int8x16 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
inline v_uint16x8 v_reverse(const v_uint16x8 &a)
{ return v_uint16x8(wasm_v8x16_shuffle(a.val, a.val, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); }
inline v_int16x8 v_reverse(const v_int16x8 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
inline v_uint32x4 v_reverse(const v_uint32x4 &a)
{ return v_uint32x4(wasm_v8x16_shuffle(a.val, a.val, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)); }
inline v_int32x4 v_reverse(const v_int32x4 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_float32x4 v_reverse(const v_float32x4 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_uint64x2 v_reverse(const v_uint64x2 &a)
{ return v_uint64x2(wasm_v8x16_shuffle(a.val, a.val, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); }
inline v_int64x2 v_reverse(const v_int64x2 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
inline v_float64x2 v_reverse(const v_float64x2 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
#define OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
@ -3400,25 +3444,25 @@ inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
inline v_int32x4 v_round(const v_float32x4& a)
{
v128_t h = wasm_f32x4_splat(0.5);
return v_int32x4(wasm_trunc_saturate_i32x4_f32x4(wasm_f32x4_add(a.val, h)));
return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(wasm_f32x4_add(a.val, h)));
}
inline v_int32x4 v_floor(const v_float32x4& a)
{
v128_t a1 = wasm_trunc_saturate_i32x4_f32x4(a.val);
v128_t mask = wasm_f32x4_lt(a.val, wasm_convert_f32x4_i32x4(a1));
v128_t a1 = wasm_i32x4_trunc_saturate_f32x4(a.val);
v128_t mask = wasm_f32x4_lt(a.val, wasm_f32x4_convert_i32x4(a1));
return v_int32x4(wasm_i32x4_add(a1, mask));
}
inline v_int32x4 v_ceil(const v_float32x4& a)
{
v128_t a1 = wasm_trunc_saturate_i32x4_f32x4(a.val);
v128_t mask = wasm_f32x4_gt(a.val, wasm_convert_f32x4_i32x4(a1));
v128_t a1 = wasm_i32x4_trunc_saturate_f32x4(a.val);
v128_t mask = wasm_f32x4_gt(a.val, wasm_f32x4_convert_i32x4(a1));
return v_int32x4(wasm_i32x4_sub(a1, mask));
}
inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(wasm_trunc_saturate_i32x4_f32x4(a.val)); }
{ return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(a.val)); }
#define OPENCV_HAL_IMPL_WASM_MATH_FUNC(func, cfunc, _Tpvec, _Tpnvec, _Tp, _Tpn) \
inline _Tpnvec func(const _Tpvec& a) \
@ -3924,7 +3968,7 @@ OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2,
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
return v_float32x4(wasm_convert_f32x4_i32x4(a.val));
return v_float32x4(wasm_f32x4_convert_i32x4(a.val));
}
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
@ -3943,7 +3987,7 @@ inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
#ifdef __wasm_unimplemented_simd128__
v128_t p = v128_cvti32x4_i64x2(a.val);
return v_float64x2(wasm_convert_f64x2_i64x2(p));
return v_float64x2(wasm_f64x2_convert_i64x2(p));
#else
fallback::v_int32x4 a_(a);
return fallback::v_cvt_f64(a_);
@ -3954,7 +3998,7 @@ inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
#ifdef __wasm_unimplemented_simd128__
v128_t p = v128_cvti32x4_i64x2_high(a.val);
return v_float64x2(wasm_convert_f64x2_i64x2(p));
return v_float64x2(wasm_f64x2_convert_i64x2(p));
#else
fallback::v_int32x4 a_(a);
return fallback::v_cvt_f64_high(a_);
@ -3976,7 +4020,7 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
inline v_float64x2 v_cvt_f64(const v_int64x2& a)
{
#ifdef __wasm_unimplemented_simd128__
return v_float64x2(wasm_convert_f64x2_i64x2(a.val));
return v_float64x2(wasm_f64x2_convert_i64x2(a.val));
#else
fallback::v_int64x2 a_(a);
return fallback::v_cvt_f64(a_);

View File

@ -112,6 +112,13 @@ bool isAlignedAllocationEnabled()
}
return useMemalign;
}
// do not use variable directly, details: https://github.com/opencv/opencv/issues/15691
static const bool g_force_initialization_memalign_flag
#if defined __GNUC__
__attribute__((unused))
#endif
= isAlignedAllocationEnabled();
#endif
#ifdef OPENCV_ALLOC_ENABLE_STATISTICS

View File

@ -711,6 +711,13 @@ static bool ipp_flip(Mat &src, Mat &dst, int flip_mode)
#ifdef HAVE_IPP_IW
CV_INSTRUMENT_REGION_IPP();
// Details: https://github.com/opencv/opencv/issues/12943
if (flip_mode <= 0 /* swap rows */
&& cv::ipp::getIppTopFeatures() != ippCPUID_SSE42
&& (int64_t)(src.total()) * src.elemSize() >= CV_BIG_INT(0x80000000)/*2Gb*/
)
return false;
IppiAxis ippMode;
if(flip_mode < 0)
ippMode = ippAxsBoth;

View File

@ -179,7 +179,25 @@ static int countNonZero32f( const float* src, int len )
static int countNonZero64f( const double* src, int len )
{
return countNonZero_(src, len);
int nz = 0, i = 0;
#if CV_SIMD_64F
v_int64 sum1 = vx_setzero_s64();
v_int64 sum2 = vx_setzero_s64();
v_float64 zero = vx_setzero_f64();
int step = v_float64::nlanes * 2;
int len0 = len & -step;
for(i = 0; i < len0; i += step )
{
sum1 += v_reinterpret_as_s64(vx_load(&src[i]) == zero);
sum2 += v_reinterpret_as_s64(vx_load(&src[i + step / 2]) == zero);
}
// N.B the value is incremented by -1 (0xF...F) for each value
nz = i + (int)v_reduce_sum(sum1 + sum2);
v_cleanup();
#endif
return nz + countNonZero_(src + i, len - i);
}
CountNonZeroFunc getCountNonZeroTab(int depth)

View File

@ -1115,6 +1115,22 @@ template<typename R> struct TheTest
return *this;
}
TheTest & test_reverse()
{
Data<R> dataA;
R a = dataA;
Data<R> resB = v_reverse(a);
for (int i = 0; i < R::nlanes; ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(dataA[R::nlanes - i - 1], resB[i]);
}
return *this;
}
template<int s>
TheTest & test_extract()
{
@ -1426,6 +1442,50 @@ template<typename R> struct TheTest
return *this;
}
#endif
#if CV_SIMD_64F
TheTest & test_cmp64()
{
Data<R> dataA, dataB;
R a = dataA, b = dataB;
for (int i = 0; i < R::nlanes; ++i)
{
dataA[i] = dataB[i];
}
dataA[0]++;
a = dataA, b = dataB;
Data<R> resC = (a == b);
Data<R> resD = (a != b);
for (int i = 0; i < R::nlanes; ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0);
EXPECT_EQ(dataA[i] != dataB[i], resD[i] != 0);
}
for (int i = 0; i < R::nlanes; ++i)
{
dataA[i] = dataB[i] = (LaneType)-1;
}
a = dataA, b = dataB;
resC = (a == b);
resD = (a != b);
for (int i = 0; i < R::nlanes; ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0);
EXPECT_EQ(dataA[i] != dataB[i], resD[i] != 0);
}
return *this;
}
#endif
};
@ -1459,6 +1519,7 @@ void test_hal_intrin_uint8()
.test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
.test_pack_b()
.test_unpack()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
.test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
;
@ -1497,6 +1558,7 @@ void test_hal_intrin_int8()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
.test_unpack()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
.test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
;
@ -1529,6 +1591,7 @@ void test_hal_intrin_uint16()
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
.test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
.test_unpack()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
.test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
;
@ -1561,6 +1624,7 @@ void test_hal_intrin_int16()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
.test_unpack()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
.test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
;
@ -1590,6 +1654,7 @@ void test_hal_intrin_uint32()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
.test_unpack()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
.test_transpose()
@ -1619,6 +1684,7 @@ void test_hal_intrin_int32()
.test_mask()
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
.test_unpack()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
.test_float_cvt32()
@ -1635,8 +1701,12 @@ void test_hal_intrin_uint64()
TheTest<v_uint64>()
.test_loadstore()
.test_addsub()
#if CV_SIMD_64F
.test_cmp64()
#endif
.test_shift<1>().test_shift<8>()
.test_logic()
.test_reverse()
.test_extract<0>().test_extract<1>()
.test_rotate<0>().test_rotate<1>()
;
@ -1648,8 +1718,12 @@ void test_hal_intrin_int64()
TheTest<v_int64>()
.test_loadstore()
.test_addsub()
#if CV_SIMD_64F
.test_cmp64()
#endif
.test_shift<1>().test_shift<8>()
.test_logic()
.test_reverse()
.test_extract<0>().test_extract<1>()
.test_rotate<0>().test_rotate<1>()
.test_cvt64_double()
@ -1680,6 +1754,7 @@ void test_hal_intrin_float32()
.test_matmul()
.test_transpose()
.test_reduce_sum4()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
;
@ -1709,6 +1784,7 @@ void test_hal_intrin_float64()
.test_unpack()
.test_float_math()
.test_float_cvt32()
.test_reverse()
.test_extract<0>().test_extract<1>()
.test_rotate<0>().test_rotate<1>()
;

View File

@ -2025,4 +2025,17 @@ TEST(Core_Eigen, eigen2cv_check_Mat_type)
}
#endif // HAVE_EIGEN
TEST(Mat, regression_12943) // memory usage: ~4.5 Gb
{
applyTestTag(CV_TEST_TAG_MEMORY_6GB);
const int width = 0x8000;
const int height = 0x10001;
cv::Mat src(height, width, CV_8UC1, Scalar::all(128));
cv::Mat dst;
cv::flip(src, dst, 0);
}
}} // namespace

View File

@ -6,7 +6,7 @@
#define OPENCV_DNN_VERSION_HPP
/// Use with major OpenCV version only.
#define OPENCV_DNN_API_VERSION 20190902
#define OPENCV_DNN_API_VERSION 20191024
#if !defined CV_DOXYGEN && !defined CV_STATIC_ANALYSIS && !defined CV_DNN_DONT_ADD_INLINE_NS
#define CV__DNN_INLINE_NS __CV_CAT(dnn4_v, OPENCV_DNN_API_VERSION)

View File

@ -128,7 +128,7 @@ namespace cv {
void setConvolution(int kernel, int pad, int stride,
int filters_num, int channels_num, int use_batch_normalize, int use_relu)
int filters_num, int channels_num, int use_batch_normalize)
{
cv::dnn::LayerParams conv_param =
getParamConvolution(kernel, pad, stride, filters_num);
@ -168,27 +168,29 @@ namespace cv {
net->layers.push_back(lp);
}
if (use_relu)
{
cv::dnn::LayerParams activation_param;
activation_param.set<float>("negative_slope", 0.1f);
activation_param.name = "ReLU-name";
activation_param.type = "ReLU";
darknet::LayerParameter lp;
std::string layer_name = cv::format("relu_%d", layer_id);
lp.layer_name = layer_name;
lp.layer_type = activation_param.type;
lp.layerParams = activation_param;
lp.bottom_indexes.push_back(last_layer);
last_layer = layer_name;
net->layers.push_back(lp);
}
layer_id++;
fused_layer_names.push_back(last_layer);
}
void setReLU()
{
cv::dnn::LayerParams activation_param;
activation_param.set<float>("negative_slope", 0.1f);
activation_param.name = "ReLU-name";
activation_param.type = "ReLU";
darknet::LayerParameter lp;
std::string layer_name = cv::format("relu_%d", layer_id);
lp.layer_name = layer_name;
lp.layer_type = activation_param.type;
lp.layerParams = activation_param;
lp.bottom_indexes.push_back(last_layer);
last_layer = layer_name;
net->layers.push_back(lp);
fused_layer_names.back() = last_layer;
}
void setMaxpool(size_t kernel, size_t pad, size_t stride)
{
cv::dnn::LayerParams maxpool_param;
@ -409,12 +411,19 @@ namespace cv {
fused_layer_names.push_back(last_layer);
}
void setShortcut(int from)
void setShortcut(int from, float alpha)
{
cv::dnn::LayerParams shortcut_param;
shortcut_param.name = "Shortcut-name";
shortcut_param.type = "Eltwise";
if (alpha != 1)
{
std::vector<float> coeffs(2, 1);
coeffs[0] = alpha;
shortcut_param.set("coeff", DictValue::arrayReal<float*>(&coeffs[0], coeffs.size()));
}
shortcut_param.set<std::string>("op", "sum");
darknet::LayerParameter lp;
@ -422,8 +431,8 @@ namespace cv {
lp.layer_name = layer_name;
lp.layer_type = shortcut_param.type;
lp.layerParams = shortcut_param;
lp.bottom_indexes.push_back(fused_layer_names.at(from));
lp.bottom_indexes.push_back(last_layer);
lp.bottom_indexes.push_back(fused_layer_names.at(from));
last_layer = layer_name;
net->layers.push_back(lp);
@ -548,10 +557,7 @@ namespace cv {
int pad = getParam<int>(layer_params, "pad", 0);
int stride = getParam<int>(layer_params, "stride", 1);
int filters = getParam<int>(layer_params, "filters", -1);
std::string activation = getParam<std::string>(layer_params, "activation", "linear");
bool batch_normalize = getParam<int>(layer_params, "batch_normalize", 0) == 1;
if(activation != "linear" && activation != "leaky")
CV_Error(cv::Error::StsParseError, "Unsupported activation: " + activation);
int flipped = getParam<int>(layer_params, "flipped", 0);
if (flipped == 1)
CV_Error(cv::Error::StsNotImplemented, "Transpose the convolutional weights is not implemented");
@ -563,7 +569,7 @@ namespace cv {
CV_Assert(current_channels > 0);
setParams.setConvolution(kernel_size, pad, stride, filters, current_channels,
batch_normalize, activation == "leaky");
batch_normalize);
current_channels = filters;
}
@ -593,7 +599,7 @@ namespace cv {
current_channels = 0;
for (size_t k = 0; k < layers_vec.size(); ++k) {
layers_vec[k] = layers_vec[k] > 0 ? layers_vec[k] : (layers_vec[k] + layers_counter);
layers_vec[k] = layers_vec[k] >= 0 ? layers_vec[k] : (layers_vec[k] + layers_counter);
current_channels += net->out_channels_vec[layers_vec[k]];
}
@ -631,13 +637,15 @@ namespace cv {
else if (layer_type == "shortcut")
{
std::string bottom_layer = getParam<std::string>(layer_params, "from", "");
float alpha = getParam<float>(layer_params, "alpha", 1);
float beta = getParam<float>(layer_params, "beta", 0);
if (beta != 0)
CV_Error(Error::StsNotImplemented, "Non-zero beta");
CV_Assert(!bottom_layer.empty());
int from = std::atoi(bottom_layer.c_str());
from += layers_counter;
current_channels = net->out_channels_vec[from];
setParams.setShortcut(from);
from = from < 0 ? from + layers_counter : from;
setParams.setShortcut(from, alpha);
}
else if (layer_type == "upsample")
{
@ -667,6 +675,15 @@ namespace cv {
else {
CV_Error(cv::Error::StsParseError, "Unknown layer type: " + layer_type);
}
std::string activation = getParam<std::string>(layer_params, "activation", "linear");
if (activation == "leaky")
{
setParams.setReLU();
}
else if (activation != "linear")
CV_Error(cv::Error::StsParseError, "Unsupported activation: " + activation);
net->out_channels_vec[layers_counter] = current_channels;
}
@ -710,7 +727,6 @@ namespace cv {
{
int kernel_size = getParam<int>(layer_params, "size", -1);
int filters = getParam<int>(layer_params, "filters", -1);
std::string activation = getParam<std::string>(layer_params, "activation", "linear");
bool use_batch_normalize = getParam<int>(layer_params, "batch_normalize", 0) == 1;
CV_Assert(kernel_size > 0 && filters > 0);
@ -754,14 +770,16 @@ namespace cv {
bn_blobs.push_back(biasData_mat);
setParams.setLayerBlobs(cv_layers_counter, bn_blobs);
}
if(activation == "leaky")
++cv_layers_counter;
}
if (layer_type == "region" || layer_type == "yolo")
{
++cv_layers_counter; // For permute.
}
std::string activation = getParam<std::string>(layer_params, "activation", "linear");
if(activation == "leaky")
++cv_layers_counter; // For ReLU
current_channels = net->out_channels_vec[darknet_layers_counter];
}
return true;

View File

@ -103,6 +103,37 @@ public:
static BackendRegistry impl;
return impl;
}
static inline bool checkIETarget(int target)
{
#ifndef HAVE_INF_ENGINE
return false;
#else
cv::dnn::Net net;
cv::dnn::LayerParams lp;
lp.set("kernel_size", 1);
lp.set("num_output", 1);
lp.set("bias_term", false);
lp.type = "Convolution";
lp.name = "testLayer";
lp.blobs.push_back(Mat({1, 2, 1, 1}, CV_32F, Scalar(1)));
net.addLayerToPrev(lp.name, lp.type, lp);
net.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE);
net.setPreferableTarget(target);
static int inpDims[] = {1, 2, 3, 4};
net.setInput(cv::Mat(4, &inpDims[0], CV_32FC1, cv::Scalar(0)));
try
{
net.forward();
}
catch(...)
{
return false;
}
return true;
#endif
}
private:
BackendRegistry()
{
@ -154,35 +185,6 @@ private:
}
#endif
}
static inline bool checkIETarget(int target)
{
#ifndef HAVE_INF_ENGINE
return false;
#else
cv::dnn::Net net;
cv::dnn::LayerParams lp;
lp.set("kernel_size", 1);
lp.set("num_output", 1);
lp.set("bias_term", false);
lp.type = "Convolution";
lp.name = "testLayer";
lp.blobs.push_back(Mat({1, 2, 1, 1}, CV_32F, Scalar(1)));
net.addLayerToPrev(lp.name, lp.type, lp);
net.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE);
net.setPreferableTarget(target);
static int inpDims[] = {1, 2, 3, 4};
net.setInput(cv::Mat(4, &inpDims[0], CV_32FC1, cv::Scalar(0)));
try
{
net.forward();
}
catch(...)
{
return false;
}
return true;
#endif
}
BackendsList backends;
};
@ -1689,6 +1691,9 @@ struct Net::Impl
// backend. Split a whole model on several Inference Engine networks if
// some of layers are not implemented.
bool supportsCPUFallback = preferableTarget == DNN_TARGET_CPU ||
BackendRegistry::checkIETarget(DNN_TARGET_CPU);
// Set of all input and output blobs wrappers for current network.
std::map<LayerPin, Ptr<BackendWrapper> > netBlobsWrappers;
for (it = layers.begin(); it != layers.end(); ++it)
@ -1702,7 +1707,8 @@ struct Net::Impl
if (!fused && !layer->supportBackend(preferableBackend))
{
bool customizable = ld.id != 0 && ld.outputBlobs.size() == 1 &&
INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R2);
INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R2) &&
supportsCPUFallback;
// TODO: there is a bug in Myriad plugin with custom layers shape infer.
if (preferableTarget == DNN_TARGET_MYRIAD)
{

View File

@ -70,6 +70,7 @@ public:
MAX = 2,
} op;
std::vector<float> coeffs;
bool variableChannels;
EltwiseLayerImpl(const LayerParams& params)
{
@ -105,7 +106,7 @@ public:
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_CUDA ||
backendId == DNN_BACKEND_HALIDE ||
(backendId == DNN_BACKEND_INFERENCE_ENGINE &&
(backendId == DNN_BACKEND_INFERENCE_ENGINE && !variableChannels &&
(preferableTarget != DNN_TARGET_OPENCL || coeffs.empty()));
}
@ -115,33 +116,57 @@ public:
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() >= 2);
CV_Assert(inputs[0].size() >= 2);
CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
CV_Assert(op == SUM || coeffs.size() == 0);
int dims = inputs[0].size();
// Number of channels in output shape is determined by the first input tensor.
int numChannels = inputs[0][1];
for (int i = 1; i < inputs.size(); i++)
{
CV_Assert(inputs[0] == inputs[i]);
CV_Assert(inputs[0][0] == inputs[i][0]);
// It's allowed for channels axis to be different.
for (int j = 2; j < dims; j++)
CV_Assert(inputs[0][j] == inputs[i][j]);
}
outputs.assign(1, inputs[0]);
outputs[0][1] = numChannels;
return false;
}
void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
{
std::vector<Mat> inputs;
inputs_arr.getMatVector(inputs);
variableChannels = false;
for (int i = 1; i < inputs.size(); ++i)
{
if (inputs[i].size[1] != inputs[0].size[1])
{
variableChannels = true;
break;
}
}
}
class EltwiseInvoker : public ParallelLoopBody
{
public:
const Mat* srcs;
std::vector<const Mat*> srcs;
int nsrcs;
Mat* dst;
const std::vector<float>* coeffs;
std::vector<float> coeffs;
EltwiseOp op;
int nstripes;
const ActivationLayer* activ;
int channels;
size_t planeSize;
EltwiseInvoker() : srcs(0), nsrcs(0), dst(0), coeffs(0), op(PROD), nstripes(0), activ(0), channels(0), planeSize(0) {}
EltwiseInvoker() : nsrcs(0), dst(0), op(PROD), nstripes(0), activ(0), channels(0), planeSize(0) {}
static void run(const Mat* srcs, int nsrcs, Mat& dst,
const std::vector<float>& coeffs, EltwiseOp op,
@ -150,15 +175,23 @@ public:
CV_Check(dst.dims, 1 < dst.dims && dst.dims <= 5, ""); CV_CheckTypeEQ(dst.type(), CV_32FC1, ""); CV_Assert(dst.isContinuous());
CV_Assert(coeffs.empty() || coeffs.size() == (size_t)nsrcs);
EltwiseInvoker p;
p.srcs.resize(nsrcs);
p.coeffs = coeffs;
for( int i = 0; i < nsrcs; i++ )
{
CV_Assert(srcs[i].size == dst.size &&
srcs[i].type() == dst.type() &&
p.srcs[i] = srcs + i;
CV_Assert(srcs[i].type() == dst.type() &&
srcs[i].isContinuous());
// Sort srcs and coefficients in the order by number of channels
for( int j = i; j >= 1 && p.srcs[j - 1]->size[1] < p.srcs[j]->size[1]; j-- )
{
std::swap(p.srcs[j - 1], p.srcs[j]);
if (!p.coeffs.empty())
std::swap(p.coeffs[j - 1], p.coeffs[j]);
}
}
EltwiseInvoker p;
p.srcs = srcs;
p.nsrcs = nsrcs;
p.dst = &dst;
p.op = op;
@ -180,7 +213,8 @@ public:
break;
}
}
p.coeffs = simpleCoeffs ? 0 : &coeffs;
if (simpleCoeffs)
p.coeffs.clear();
p.activ = activ;
parallel_for_(Range(0, nstripes), p, nstripes);
@ -192,8 +226,8 @@ public:
size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = std::min(r.end*stripeSize, total);
int c, j, k, n = nsrcs;
const float* coeffsptr = coeffs && !coeffs->empty() ? &coeffs->at(0) : 0;
int c, j, k, n;
const float* coeffsptr = !coeffs.empty() ? &coeffs[0] : 0;
float* dstptr0 = dst->ptr<float>();
int blockSize0 = 1 << 12, blockSize;
@ -208,14 +242,35 @@ public:
for( c = 0; c < channels; c++ )
{
size_t globalDelta = delta + (sampleIdx*channels + c)*planeSize;
const float* srcptr0 = srcs[0].ptr<float>() + globalDelta;
const float* srcptr0 = srcs[0]->ptr<float>() + globalDelta;
float* dstptr = dstptr0 + globalDelta;
if( op == PROD )
// This code assumes that srcs are sorted in descending order by channels.
for (n = 1; n < nsrcs && c < srcs[n]->size[1]; ++n) {}
if (n == 1)
{
if( !coeffsptr )
{
for( j = 0; j < blockSize; j++ )
{
dstptr[j] = srcptr0[j];
}
}
else
{
float c0 = coeffsptr[0];
for( j = 0; j < blockSize; j++ )
{
dstptr[j] = c0*srcptr0[j];
}
}
}
else if( op == PROD )
{
for( k = 1; k < n; k++ )
{
const float* srcptr1 = srcs[k].ptr<float>() + globalDelta;
const float* srcptr1 = srcs[k]->ptr<float>() + globalDelta;
for( j = 0; j < blockSize; j++ )
{
dstptr[j] = srcptr0[j]*srcptr1[j];
@ -227,7 +282,7 @@ public:
{
for( k = 1; k < n; k++ )
{
const float* srcptr1 = srcs[k].ptr<float>() + globalDelta;
const float* srcptr1 = srcs[k]->ptr<float>() + globalDelta;
for( j = 0; j < blockSize; j++ )
{
dstptr[j] = std::max(srcptr0[j], srcptr1[j]);
@ -239,7 +294,7 @@ public:
{
for( k = 1; k < n; k++ )
{
const float* srcptr1 = srcs[k].ptr<float>() + globalDelta;
const float* srcptr1 = srcs[k]->ptr<float>() + globalDelta;
for( j = 0; j < blockSize; j++ )
{
dstptr[j] = srcptr0[j] + srcptr1[j];
@ -252,7 +307,7 @@ public:
float c0 = coeffsptr[0];
for( k = 1; k < n; k++ )
{
const float* srcptr1 = srcs[k].ptr<float>() + globalDelta;
const float* srcptr1 = srcs[k]->ptr<float>() + globalDelta;
float c1 = coeffsptr[k];
for( j = 0; j < blockSize; j++ )
{
@ -279,7 +334,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;
if (inputs_.depth() == CV_16S && op != SUM)
if ((inputs_.depth() == CV_16S && op != SUM) || variableChannels)
return false;
inputs_.getUMatVector(inputs);

View File

@ -390,12 +390,6 @@ TEST_P(Test_Darknet_nets, YOLOv3)
{
applyTestTag(CV_TEST_TAG_LONG, (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB));
#if defined(INF_ENGINE_RELEASE)
if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD
&& getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
#endif
// batchId, classId, confidence, left, top, right, bottom
Mat ref = (Mat_<float>(9, 7) << 0, 7, 0.952983f, 0.614622f, 0.150257f, 0.901369f, 0.289251f, // a truck
0, 1, 0.987908f, 0.150913f, 0.221933f, 0.742255f, 0.74626f, // a bicycle
@ -413,23 +407,35 @@ TEST_P(Test_Darknet_nets, YOLOv3)
std::string config_file = "yolov3.cfg";
std::string weights_file = "yolov3.weights";
#if defined(INF_ENGINE_RELEASE)
if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD &&
getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
{
scoreDiff = 0.04;
iouDiff = 0.2;
}
#endif
{
SCOPED_TRACE("batch size 1");
testDarknetModel(config_file, weights_file, ref.rowRange(0, 3), scoreDiff, iouDiff);
}
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2018050000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL) // Test with 'batch size 2' is disabled for DLIE/OpenCL target
#endif
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019020000)
#if defined(INF_ENGINE_RELEASE)
if (backend == DNN_BACKEND_INFERENCE_ENGINE)
{
if (target == DNN_TARGET_OPENCL)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_2019R2);
if (target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_2019R2);
if (INF_ENGINE_VER_MAJOR_LE(2018050000) && target == DNN_TARGET_OPENCL)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_2018R5);
else if (INF_ENGINE_VER_MAJOR_EQ(2019020000))
{
if (target == DNN_TARGET_OPENCL)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_2019R2);
if (target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_2019R2);
}
else if (target == DNN_TARGET_MYRIAD &&
getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
}
#endif
@ -444,6 +450,9 @@ INSTANTIATE_TEST_CASE_P(/**/, Test_Darknet_nets, dnnBackendsAndTargets());
TEST_P(Test_Darknet_layers, shortcut)
{
testDarknetLayer("shortcut");
testDarknetLayer("shortcut_leaky");
testDarknetLayer("shortcut_unequal");
testDarknetLayer("shortcut_unequal_2");
}
TEST_P(Test_Darknet_layers, upsample)

View File

@ -1493,4 +1493,62 @@ TEST(Layer_Test_Convolution, relu_fusion)
normAssert(input, output);
}
typedef testing::TestWithParam<tuple<bool, tuple<Backend, Target> > > Layer_Test_Eltwise_unequal;
TEST_P(Layer_Test_Eltwise_unequal, Accuracy)
{
bool weighted = get<0>(GetParam());
int backendId = get<0>(get<1>(GetParam()));
int targetId = get<1>(get<1>(GetParam()));
if (backendId == DNN_BACKEND_OPENCV && targetId == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
Net net;
LayerParams lp;
lp.type = "Eltwise";
lp.name = "testLayer";
const int inpShapes[][4] = {{1, 4, 2, 2}, {1, 5, 2, 2}, {1, 3, 2, 2}};
std::vector<String> inpNames(3);
std::vector<Mat> inputs(3);
size_t numOutValues = 1*4*2*2; // By the first input
std::vector<float> weights(3, 1);
if (weighted)
{
for (int i = 0; i < inputs.size(); ++i)
randu(Mat(1, 1, CV_32F, &weights[i]), -1, 1);
lp.set("coeff", DictValue::arrayReal<float*>(&weights[0], weights.size()));
}
int eltwiseId = net.addLayer(lp.name, lp.type, lp);
for (int i = 0; i < inputs.size(); ++i)
{
inputs[i].create(4, inpShapes[i], CV_32F);
randu(inputs[i], 0, 255);
inpNames[i] = format("input_%d", i);
net.connect(0, i, eltwiseId, i);
}
Mat ref(1, numOutValues, CV_32F, Scalar(0));
net.setInputsNames(inpNames);
for (int i = 0; i < inputs.size(); ++i)
{
net.setInput(inputs[i], inpNames[i]);
if (numOutValues >= inputs[i].total())
ref.colRange(0, inputs[i].total()) += weights[i] * inputs[i].reshape(1, 1);
else
ref += weights[i] * inputs[i].reshape(1, 1).colRange(0, numOutValues);
}
net.setPreferableBackend(backendId);
net.setPreferableTarget(targetId);
Mat out = net.forward();
normAssert(out.reshape(1, 1), ref);
}
INSTANTIATE_TEST_CASE_P(/**/, Layer_Test_Eltwise_unequal, Combine(
testing::Bool(),
dnnBackendsAndTargets()
));
}} // namespace

View File

@ -394,7 +394,9 @@ enum ConnectedComponentsTypes {
CC_STAT_WIDTH = 2, //!< The horizontal size of the bounding box
CC_STAT_HEIGHT = 3, //!< The vertical size of the bounding box
CC_STAT_AREA = 4, //!< The total area (in pixels) of the connected component
CC_STAT_MAX = 5
#ifndef CV_DOXYGEN
CC_STAT_MAX = 5 //!< Max enumeration value. Used internally only for memory allocation
#endif
};
//! connected components algorithm
@ -4008,7 +4010,23 @@ without self-intersections. Otherwise, the function output is undefined.
*/
CV_EXPORTS_W bool isContourConvex( InputArray contour );
//! finds intersection of two convex polygons
/** @example samples/cpp/intersectExample.cpp
Examples of how intersectConvexConvex works
*/
/** @brief Finds intersection of two convex polygons
@param _p1 First polygon
@param _p2 Second polygon
@param _p12 Output polygon describing the intersecting area
@param handleNested When true, an intersection is found if one of the polygons is fully enclosed in the other.
When false, no intersection is found. If the polygons share a side or the vertex of one polygon lies on an edge
of the other, they are not considered nested and an intersection will be found regardless of the value of handleNested.
@returns Absolute value of area of intersecting polygon
@note intersectConvexConvex doesn't confirm that both polygons are convex and will return invalid results if they aren't.
*/
CV_EXPORTS_W float intersectConvexConvex( InputArray _p1, InputArray _p2,
OutputArray _p12, bool handleNested = true );

View File

@ -2624,11 +2624,127 @@ void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn
v_dst10 = v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha);
v_dst11 = v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha);
v_store(dst + x, v_dst00);
v_store(dst + x + step, v_dst01);
v_store(dst + x , v_dst00);
v_store(dst + x + step , v_dst01);
v_store(dst + x + step * 2, v_dst10);
v_store(dst + x + step * 3, v_dst11);
}
} else {
const v_float32 zero = vx_setall_f32((float)0);
int size = len * cn;
if ( cn == 1 ){
for (; x <= size - cVectorWidth; x += cVectorWidth)
{
v_uint8 v_src = vx_load(src + x);
v_uint8 v_mask = vx_load(mask + x);
v_uint16 v_m0, v_m1;
v_expand(v_mask, v_m0, v_m1);
v_uint32 v_m00, v_m01, v_m10, v_m11;
v_expand(v_m0, v_m00, v_m01);
v_expand(v_m1, v_m10, v_m11);
v_float32 v_mf00, v_mf01, v_mf10, v_mf11;
v_mf00 = v_cvt_f32(v_reinterpret_as_s32(v_m00));
v_mf01 = v_cvt_f32(v_reinterpret_as_s32(v_m01));
v_mf10 = v_cvt_f32(v_reinterpret_as_s32(v_m10));
v_mf11 = v_cvt_f32(v_reinterpret_as_s32(v_m11));
v_uint16 v_src0, v_src1;
v_expand(v_src, v_src0, v_src1);
v_uint32 v_src00, v_src01, v_src10, v_src11;
v_expand(v_src0, v_src00, v_src01);
v_expand(v_src1, v_src10, v_src11);
v_float32 v_dst00 = vx_load(dst + x);
v_float32 v_dst01 = vx_load(dst + x + step);
v_float32 v_dst10 = vx_load(dst + x + step * 2);
v_float32 v_dst11 = vx_load(dst + x + step * 3);
v_mf00 = v_mf00 != zero;
v_mf01 = v_mf01 != zero;
v_mf10 = v_mf10 != zero;
v_mf11 = v_mf11 != zero;
v_dst00 = v_select(v_mf00, v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha), v_dst00);
v_dst01 = v_select(v_mf01, v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha), v_dst01);
v_dst10 = v_select(v_mf10, v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha), v_dst10);
v_dst11 = v_select(v_mf11, v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha), v_dst11);
v_store(dst + x , v_dst00);
v_store(dst + x + step , v_dst01);
v_store(dst + x + step * 2, v_dst10);
v_store(dst + x + step * 3, v_dst11);
}
} else if ( cn == 3 )
{
for (; x*cn <= size - cVectorWidth*cn; x += cVectorWidth )
{
v_uint8 v_src0, v_src1, v_src2;
v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
v_expand(v_src0, v_src00, v_src01);
v_expand(v_src1, v_src10, v_src11);
v_expand(v_src2, v_src20, v_src21);
v_uint32 v_src000, v_src001, v_src010, v_src011, v_src100, v_src101, v_src110, v_src111, v_src200, v_src201, v_src210, v_src211;
v_expand(v_src00, v_src000, v_src001);
v_expand(v_src01, v_src010, v_src011);
v_expand(v_src10, v_src100, v_src101);
v_expand(v_src11, v_src110, v_src111);
v_expand(v_src20, v_src200, v_src201);
v_expand(v_src21, v_src210, v_src211);
v_float32 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13;
v_float32 v_dst20, v_dst21, v_dst22, v_dst23;
v_load_deinterleave(dst + x * cn , v_dst00, v_dst10, v_dst20);
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
v_load_deinterleave(dst + (x + 2 * step) * cn, v_dst02, v_dst12, v_dst22);
v_load_deinterleave(dst + (x + 3 * step) * cn, v_dst03, v_dst13, v_dst23);
v_uint8 v_mask = vx_load(mask + x);
v_uint16 v_m0, v_m1;
v_expand(v_mask, v_m0, v_m1);
v_uint32 v_m00, v_m01, v_m10, v_m11;
v_expand(v_m0, v_m00, v_m01);
v_expand(v_m1, v_m10, v_m11);
v_float32 v_mf00, v_mf01, v_mf10, v_mf11;
v_mf00 = v_cvt_f32(v_reinterpret_as_s32(v_m00));
v_mf01 = v_cvt_f32(v_reinterpret_as_s32(v_m01));
v_mf10 = v_cvt_f32(v_reinterpret_as_s32(v_m10));
v_mf11 = v_cvt_f32(v_reinterpret_as_s32(v_m11));
v_mf00 = v_mf00 != zero;
v_mf01 = v_mf01 != zero;
v_mf10 = v_mf10 != zero;
v_mf11 = v_mf11 != zero;
v_dst00 = v_select(v_mf00, v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src000)) * v_alpha), v_dst00);
v_dst01 = v_select(v_mf01, v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src001)) * v_alpha), v_dst01);
v_dst02 = v_select(v_mf10, v_fma(v_dst02, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src010)) * v_alpha), v_dst02);
v_dst03 = v_select(v_mf11, v_fma(v_dst03, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src011)) * v_alpha), v_dst03);
v_dst10 = v_select(v_mf00, v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src100)) * v_alpha), v_dst10);
v_dst11 = v_select(v_mf01, v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src101)) * v_alpha), v_dst11);
v_dst12 = v_select(v_mf10, v_fma(v_dst12, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src110)) * v_alpha), v_dst12);
v_dst13 = v_select(v_mf11, v_fma(v_dst13, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src111)) * v_alpha), v_dst13);
v_dst20 = v_select(v_mf00, v_fma(v_dst20, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src200)) * v_alpha), v_dst20);
v_dst21 = v_select(v_mf01, v_fma(v_dst21, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src201)) * v_alpha), v_dst21);
v_dst22 = v_select(v_mf10, v_fma(v_dst22, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src210)) * v_alpha), v_dst22);
v_dst23 = v_select(v_mf11, v_fma(v_dst23, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src211)) * v_alpha), v_dst23);
v_store_interleave(dst + x * cn , v_dst00, v_dst10, v_dst20);
v_store_interleave(dst + ( x + step ) * cn, v_dst01, v_dst11, v_dst21);
v_store_interleave(dst + ( x + step * 2 ) * cn, v_dst02, v_dst12, v_dst22);
v_store_interleave(dst + ( x + step * 3 ) * cn, v_dst03, v_dst13, v_dst23);
}
}
}
#endif // CV_SIMD
accW_general_(src, dst, mask, len, cn, alpha, x);
@ -2657,9 +2773,81 @@ void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int c
v_dst0 = v_fma(v_dst0, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int0)) * v_alpha);
v_dst1 = v_fma(v_dst1, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int1)) * v_alpha);
v_store(dst + x, v_dst0);
v_store(dst + x , v_dst0);
v_store(dst + x + step, v_dst1);
}
} else {
const v_float32 zero = vx_setall_f32((float)0);
int size = len * cn;
if ( cn == 1 )
{
for (; x <= size - cVectorWidth; x += cVectorWidth)
{
v_uint16 v_src = vx_load(src + x);
v_uint16 v_mask = v_reinterpret_as_u16(vx_load_expand(mask + x));
v_uint32 v_m0, v_m1;
v_expand(v_mask, v_m0, v_m1);
v_float32 v_mf0, v_mf1;
v_mf0 = v_cvt_f32(v_reinterpret_as_s32(v_m0));
v_mf1 = v_cvt_f32(v_reinterpret_as_s32(v_m1));
v_uint32 v_src0, v_src1;
v_expand(v_src, v_src0, v_src1);
v_float32 v_dst0 = vx_load(dst + x);
v_float32 v_dst1 = vx_load(dst + x + step);
v_mf0 = v_mf0 != zero;
v_mf1 = v_mf1 != zero;
v_dst0 = v_select(v_mf0, v_fma(v_dst0, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src0)) * v_alpha), v_dst0);
v_dst1 = v_select(v_mf1, v_fma(v_dst1, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src1)) * v_alpha), v_dst1);
v_store(dst + x , v_dst0);
v_store(dst + x + step, v_dst1);
}
} else if ( cn == 3 )
{
for (; x*cn <= size - cVectorWidth*cn; x += cVectorWidth )
{
v_uint16 v_src0, v_src1, v_src2;
v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
v_uint16 v_mask = v_reinterpret_as_u16(vx_load_expand(mask + x));
v_uint32 v_m0, v_m1;
v_expand(v_mask, v_m0, v_m1);
v_uint32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
v_expand(v_src0, v_src00, v_src01);
v_expand(v_src1, v_src10, v_src11);
v_expand(v_src2, v_src20, v_src21);
v_float32 v_dst00, v_dst01, v_dst02, v_dst10, v_dst11, v_dst20, v_dst21;
v_load_deinterleave(dst + x * cn , v_dst00, v_dst10, v_dst20);
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
v_float32 v_mf0, v_mf1;
v_mf0 = v_cvt_f32(v_reinterpret_as_s32(v_m0));
v_mf1 = v_cvt_f32(v_reinterpret_as_s32(v_m1));
v_mf0 = v_mf0 != zero;
v_mf1 = v_mf1 != zero;
v_dst00 = v_select(v_mf0, v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha), v_dst00);
v_dst10 = v_select(v_mf0, v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha), v_dst10);
v_dst20 = v_select(v_mf0, v_fma(v_dst20, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src20)) * v_alpha), v_dst20);
v_dst01 = v_select(v_mf1, v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha), v_dst01);
v_dst11 = v_select(v_mf1, v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha), v_dst11);
v_dst21 = v_select(v_mf1, v_fma(v_dst21, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src21)) * v_alpha), v_dst21);
v_store_interleave(dst + x * cn , v_dst00, v_dst10, v_dst20);
v_store_interleave(dst + ( x + step ) * cn, v_dst01, v_dst11, v_dst21);
}
}
}
#endif // CV_SIMD
accW_general_(src, dst, mask, len, cn, alpha, x);

View File

@ -27,6 +27,13 @@ endif()
set(OPENCV_JAVADOC_DESTINATION "${OpenCV_BINARY_DIR}/doc/doxygen/html/javadoc" CACHE STRING "")
# Old Javadoc URL looks like this: https://docs.oracle.com/javase/6/docs/api/
# New Javadoc URL looks like this: https://docs.oracle.com/en/java/javase/11/docs/api/
set(OPENCV_JAVADOC_LINK_URL "" CACHE STRING "See details in modules/java/jar/CMakeLists.txt")
if(OPENCV_JAVADOC_LINK_URL)
set(CMAKE_CONFIG_OPENCV_JAVADOC_LINK "link=\"${OPENCV_JAVADOC_LINK_URL}\"")
endif()
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/build.xml.in" "${OPENCV_JAVA_DIR}/build.xml" @ONLY)
list(APPEND depends "${OPENCV_JAVA_DIR}/build.xml")

View File

@ -42,7 +42,7 @@
bottom="Generated on ${timestamp} / OpenCV @OPENCV_VCSVERSION@"
failonerror="true"
encoding="UTF-8" charset="UTF-8" docencoding="UTF-8"
link="https://docs.oracle.com/javase/6/docs/api/"
@CMAKE_CONFIG_OPENCV_JAVADOC_LINK@
additionalparam="--allow-script-in-comments"
>
<Header>

View File

@ -96,7 +96,7 @@ class SamplesFindFile(NewOpenCVTests):
def test_MissingFileException(self):
try:
res = cv.samples.findFile('non_existed.file', True)
_res = cv.samples.findFile('non_existed.file', True)
self.assertEqual("Dead code", 0)
except cv.error as _e:
pass

View File

@ -46,7 +46,8 @@ static std::vector<std::string>& getTestTagsSkipList()
#if OPENCV_32BIT_CONFIGURATION
testSkipWithTags.push_back(CV_TEST_TAG_MEMORY_2GB);
#else
testSkipWithTags.push_back(CV_TEST_TAG_MEMORY_6GB);
if (!cvtest::runBigDataTests)
testSkipWithTags.push_back(CV_TEST_TAG_MEMORY_6GB);
#endif
testSkipWithTags.push_back(CV_TEST_TAG_VERYLONG);
#if defined(_DEBUG)

View File

@ -0,0 +1,161 @@
/*
* Author: Steve Nicholson
*
* A program that illustrates intersectConvexConvex in various scenarios
*/
#include "opencv2/imgproc.hpp"
#include "opencv2/highgui.hpp"
using namespace cv;
using namespace std;
// Create a vector of points describing a rectangle with the given corners
static vector<Point> makeRectangle(Point topLeft, Point bottomRight)
{
vector<Point> rectangle;
rectangle.push_back(topLeft);
rectangle.push_back(Point(bottomRight.x, topLeft.y));
rectangle.push_back(bottomRight);
rectangle.push_back(Point(topLeft.x, bottomRight.y));
return rectangle;
}
static vector<Point> makeTriangle(Point point1, Point point2, Point point3)
{
vector<Point> triangle;
triangle.push_back(point1);
triangle.push_back(point2);
triangle.push_back(point3);
return triangle;
}
// Run intersectConvexConvex on two polygons then draw the polygons and their intersection (if there is one)
// Return the area of the intersection
static float drawIntersection(Mat &image, vector<Point> polygon1, vector<Point> polygon2, bool handleNested = true)
{
vector<Point> intersectionPolygon;
vector<vector<Point> > polygons;
polygons.push_back(polygon1);
polygons.push_back(polygon2);
float intersectArea = intersectConvexConvex(polygon1, polygon2, intersectionPolygon, handleNested);
if (intersectArea > 0)
{
Scalar fillColor(200, 200, 200);
// If the input is invalid, draw the intersection in red
if (!isContourConvex(polygon1) || !isContourConvex(polygon2))
{
fillColor = Scalar(0, 0, 255);
}
vector<vector<Point> > pp;
pp.push_back(intersectionPolygon);
fillPoly(image, pp, fillColor);
}
polylines(image, polygons, true, Scalar(0, 0, 0));
return intersectArea;
}
static void drawDescription(Mat &image, int intersectionArea, string description, Point origin)
{
const size_t bufSize=1024;
char caption[bufSize];
snprintf(caption, bufSize, "Intersection area: %d%s", intersectionArea, description.c_str());
putText(image, caption, origin, FONT_HERSHEY_SIMPLEX, 0.6, Scalar(0, 0, 0));
}
static void intersectConvexExample()
{
Mat image(610, 550, CV_8UC3, Scalar(255, 255, 255));
float intersectionArea;
intersectionArea = drawIntersection(image,
makeRectangle(Point(10, 10), Point(50, 50)),
makeRectangle(Point(20, 20), Point(60, 60)));
drawDescription(image, (int)intersectionArea, "", Point(70, 40));
intersectionArea = drawIntersection(image,
makeRectangle(Point(10, 70), Point(35, 95)),
makeRectangle(Point(35, 95), Point(60, 120)));
drawDescription(image, (int)intersectionArea, "", Point(70, 100));
intersectionArea = drawIntersection(image,
makeRectangle(Point(10, 130), Point(60, 180)),
makeRectangle(Point(20, 140), Point(50, 170)),
true);
drawDescription(image, (int)intersectionArea, " (handleNested true)", Point(70, 160));
intersectionArea = drawIntersection(image,
makeRectangle(Point(10, 190), Point(60, 240)),
makeRectangle(Point(20, 200), Point(50, 230)),
false);
drawDescription(image, (int)intersectionArea, " (handleNested false)", Point(70, 220));
intersectionArea = drawIntersection(image,
makeRectangle(Point(10, 250), Point(60, 300)),
makeRectangle(Point(20, 250), Point(50, 290)),
true);
drawDescription(image, (int)intersectionArea, " (handleNested true)", Point(70, 280));
// These rectangles share an edge so handleNested can be false and an intersection is still found
intersectionArea = drawIntersection(image,
makeRectangle(Point(10, 310), Point(60, 360)),
makeRectangle(Point(20, 310), Point(50, 350)),
false);
drawDescription(image, (int)intersectionArea, " (handleNested false)", Point(70, 340));
intersectionArea = drawIntersection(image,
makeRectangle(Point(10, 370), Point(60, 420)),
makeRectangle(Point(20, 371), Point(50, 410)),
false);
drawDescription(image, (int)intersectionArea, " (handleNested false)", Point(70, 400));
// A vertex of the triangle lies on an edge of the rectangle so handleNested can be false and an intersection is still found
intersectionArea = drawIntersection(image,
makeRectangle(Point(10, 430), Point(60, 480)),
makeTriangle(Point(35, 430), Point(20, 470), Point(50, 470)),
false);
drawDescription(image, (int)intersectionArea, " (handleNested false)", Point(70, 460));
// Show intersection of overlapping rectangle and triangle
intersectionArea = drawIntersection(image,
makeRectangle(Point(10, 490), Point(40, 540)),
makeTriangle(Point(25, 500), Point(25, 530), Point(60, 515)),
false);
drawDescription(image, (int)intersectionArea, "", Point(70, 520));
// This concave polygon is invalid input to intersectConvexConvex so it returns an invalid intersection
vector<Point> notConvex;
notConvex.push_back(Point(25, 560));
notConvex.push_back(Point(25, 590));
notConvex.push_back(Point(45, 580));
notConvex.push_back(Point(60, 600));
notConvex.push_back(Point(60, 550));
notConvex.push_back(Point(45, 570));
intersectionArea = drawIntersection(image,
makeRectangle(Point(10, 550), Point(50, 600)),
notConvex,
false);
drawDescription(image, (int)intersectionArea, " (invalid input: not convex)", Point(70, 580));
imshow("Intersections", image);
waitKey(0);
}
int main()
{
intersectConvexExample();
}

View File

@ -116,8 +116,10 @@ double compose_megapix = -1;
float conf_thresh = 1.f;
#ifdef HAVE_OPENCV_XFEATURES2D
string features_type = "surf";
float match_conf = 0.65f;
#else
string features_type = "orb";
float match_conf = 0.3f;
#endif
string matcher_type = "homography";
string estimator_type = "homography";
@ -132,7 +134,6 @@ int expos_comp_type = ExposureCompensator::GAIN_BLOCKS;
int expos_comp_nr_feeds = 1;
int expos_comp_nr_filtering = 2;
int expos_comp_block_size = 32;
float match_conf = 0.3f;
string seam_find_type = "gc_color";
int blend_type = Blender::MULTI_BAND;
int timelapse_type = Timelapser::AS_IS;
@ -196,7 +197,7 @@ static int parseCmdArgs(int argc, char** argv)
else if (string(argv[i]) == "--features")
{
features_type = argv[i + 1];
if (features_type == "orb")
if (string(features_type) == "orb")
match_conf = 0.3f;
i++;
}

View File

@ -14,9 +14,9 @@ using namespace cv;
const char* keys =
"{ help h| | Print help message. }"
"{ input1 | | Path to input image 1. }"
"{ input2 | | Path to input image 2. }"
"{ input3 | | Path to input image 3. }";
"{ @input1 | | Path to input image 1. }"
"{ @input2 | | Path to input image 2. }"
"{ @input3 | | Path to input image 3. }";
/**
* @function main

View File

@ -14,7 +14,7 @@ parser.add_argument('--median_filter', default=0, type=int, help='Kernel size of
args = parser.parse_args()
net = cv.dnn.readNetFromTorch(cv.samples.findFile(args.model))
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV);
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
if args.input:
cap = cv.VideoCapture(args.input)

View File

@ -27,7 +27,7 @@ args = parser.parse_args()
### Get OpenCV predictions #####################################################
net = cv.dnn.readNetFromTensorflow(cv.samples.findFile(args.weights), cv.samples.findFile(args.prototxt))
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV);
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
detections = []
for imgName in os.listdir(args.images):

View File

@ -134,7 +134,7 @@ def main():
for j in range(4):
p1 = (vertices[j][0], vertices[j][1])
p2 = (vertices[(j + 1) % 4][0], vertices[(j + 1) % 4][1])
cv.line(frame, p1, p2, (0, 255, 0), 1);
cv.line(frame, p1, p2, (0, 255, 0), 1)
# Put efficiency information
cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))

View File

@ -21,7 +21,7 @@ def tokenize(s):
elif token:
tokens.append(token)
token = ""
isString = (symbol == '\"' or symbol == '\'') ^ isString;
isString = (symbol == '\"' or symbol == '\'') ^ isString
elif symbol == '{' or symbol == '}' or symbol == '[' or symbol == ']':
if token:

View File

@ -122,7 +122,7 @@ def createSSDGraph(modelPath, configPath, outputPath):
print('Input image size: %dx%d' % (image_width, image_height))
# Read the graph.
inpNames = ['image_tensor']
_inpNames = ['image_tensor']
outNames = ['num_detections', 'detection_scores', 'detection_boxes', 'detection_classes']
writeTextGraph(modelPath, outputPath, outNames)

View File

@ -45,7 +45,7 @@ def main():
small = img
for i in xrange(3):
for _i in xrange(3):
small = cv.pyrDown(small)
def onmouse(event, x, y, flags, param):

View File

@ -97,7 +97,7 @@ def main():
obj_points.append(pattern_points)
# calculate camera distortion
rms, camera_matrix, dist_coefs, rvecs, tvecs = cv.calibrateCamera(obj_points, img_points, (w, h), None, None)
rms, camera_matrix, dist_coefs, _rvecs, _tvecs = cv.calibrateCamera(obj_points, img_points, (w, h), None, None)
print("\nRMS:", rms)
print("camera matrix:\n", camera_matrix)
@ -106,7 +106,7 @@ def main():
# undistort the image with the calibration
print('')
for fn in img_names if debug_dir else []:
path, name, ext = splitfn(fn)
_path, name, _ext = splitfn(fn)
img_found = os.path.join(debug_dir, name + '_chess.png')
outfile = os.path.join(debug_dir, name + '_undistorted.png')

View File

@ -184,7 +184,7 @@ def main():
extrinsics = fs.getNode('extrinsic_parameters').mat()
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import Axes3D # pylint: disable=unused-variable
fig = plt.figure()
ax = fig.gca(projection='3d')

View File

@ -46,7 +46,7 @@ class App():
cam = video.create_capture(fn, fallback='synth:bg=baboon.jpg:class=chess:noise=0.05')
while True:
flag, frame = cam.read()
_flag, frame = cam.read()
cv.imshow('camera', frame)
small = cv.pyrDown(frame)

View File

@ -38,7 +38,7 @@ def main():
cap = video.create_capture(fn)
while True:
flag, img = cap.read()
_flag, img = cap.read()
gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
thrs1 = cv.getTrackbarPos('thrs1', 'edge')
thrs2 = cv.getTrackbarPos('thrs2', 'edge')

View File

@ -48,7 +48,7 @@ def main():
cam = create_capture(video_src, fallback='synth:bg={}:noise=0.05'.format(cv.samples.findFile('samples/data/lena.jpg')))
while True:
ret, img = cam.read()
_ret, img = cam.read()
gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
gray = cv.equalizeHist(gray)

View File

@ -88,6 +88,7 @@ def main():
update()
ch = cv.waitKey(0)
if ch == ord('f'):
global cur_func_name
if PY3:
cur_func_name = next(dist_func_names)
else:

View File

@ -30,7 +30,7 @@ def main():
circles = cv.HoughCircles(img, cv.HOUGH_GRADIENT, 1, 10, np.array([]), 100, 30, 1, 30)
if circles is not None: # Check if circles have been found and only then iterate over these and add them to the image
a, b, c = circles.shape
_a, b, _c = circles.shape
for i in range(b):
cv.circle(cimg, (circles[0][i][0], circles[0][i][1]), circles[0][i][2], (0, 0, 255), 3, cv.LINE_AA)
cv.circle(cimg, (circles[0][i][0], circles[0][i][1]), 2, (0, 255, 0), 3, cv.LINE_AA) # draw center of circle

View File

@ -29,14 +29,14 @@ def main():
if True: # HoughLinesP
lines = cv.HoughLinesP(dst, 1, math.pi/180.0, 40, np.array([]), 50, 10)
a,b,c = lines.shape
a, b, _c = lines.shape
for i in range(a):
cv.line(cdst, (lines[i][0][0], lines[i][0][1]), (lines[i][0][2], lines[i][0][3]), (0, 0, 255), 3, cv.LINE_AA)
else: # HoughLines
lines = cv.HoughLines(dst, 1, math.pi/180.0, 50, np.array([]), 0, 0)
if lines is not None:
a,b,c = lines.shape
a, b, _c = lines.shape
for i in range(a):
rho = lines[i][0][0]
theta = lines[i][0][1]

View File

@ -33,7 +33,7 @@ def main():
points, _ = make_gaussians(cluster_n, img_size)
term_crit = (cv.TERM_CRITERIA_EPS, 30, 0.1)
ret, labels, centers = cv.kmeans(points, cluster_n, None, term_crit, 10, 0)
_ret, labels, _centers = cv.kmeans(points, cluster_n, None, term_crit, 10, 0)
img = np.zeros((img_size, img_size, 3), np.uint8)
for (x, y), label in zip(np.int32(points), labels.ravel()):

View File

@ -60,7 +60,7 @@ def main():
cv.createTrackbar('%d'%i, 'level control', 5, 50, nothing)
while True:
ret, frame = cap.read()
_ret, frame = cap.read()
pyr = build_lappyr(frame, leveln)
for i in xrange(leveln):

View File

@ -64,14 +64,14 @@ def main():
fn = 0
cam = video.create_capture(fn)
ret, prev = cam.read()
_ret, prev = cam.read()
prevgray = cv.cvtColor(prev, cv.COLOR_BGR2GRAY)
show_hsv = False
show_glitch = False
cur_glitch = prev.copy()
while True:
ret, img = cam.read()
_ret, img = cam.read()
gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
flow = cv.calcOpticalFlowFarneback(prevgray, gray, None, 0.5, 3, 15, 3, 5, 1.2, 0)
prevgray = gray

View File

@ -51,7 +51,7 @@ def main():
print('loading error')
continue
found, w = hog.detectMultiScale(img, winStride=(8,8), padding=(32,32), scale=1.05)
found, _w = hog.detectMultiScale(img, winStride=(8,8), padding=(32,32), scale=1.05)
found_filtered = []
for ri, r in enumerate(found):
for qi, q in enumerate(found):

View File

@ -69,8 +69,8 @@ def main():
out_points = points[mask]
out_colors = colors[mask]
out_fn = 'out.ply'
write_ply('out.ply', out_points, out_colors)
print('%s saved' % 'out.ply')
write_ply(out_fn, out_points, out_colors)
print('%s saved' % out_fn)
cv.imshow('left', imgL)
cv.imshow('disparity', (disp-min_disp)/num_disp)

View File

@ -32,7 +32,7 @@ def main():
w, h = 512, 512
args, args_list = getopt.getopt(sys.argv[1:], 'o:', [])
args, _args_list = getopt.getopt(sys.argv[1:], 'o:', [])
args = dict(args)
out = None
if '-o' in args:

View File

@ -25,13 +25,13 @@ def access_pixel():
y = 0
x = 0
## [Pixel access 1]
intensity = img[y,x]
_intensity = img[y,x]
## [Pixel access 1]
## [Pixel access 3]
blue = img[y,x,0]
green = img[y,x,1]
red = img[y,x,2]
_blue = img[y,x,0]
_green = img[y,x,1]
_red = img[y,x,2]
## [Pixel access 3]
## [Pixel access 5]
@ -42,12 +42,12 @@ def reference_counting():
# Memory management and reference counting
## [Reference counting 2]
img = cv.imread('image.jpg')
img1 = np.copy(img)
_img1 = np.copy(img)
## [Reference counting 2]
## [Reference counting 3]
img = cv.imread('image.jpg')
sobelx = cv.Sobel(img, cv.CV_32F, 1, 0);
_sobelx = cv.Sobel(img, cv.CV_32F, 1, 0)
## [Reference counting 3]
def primitive_operations():
@ -57,17 +57,17 @@ def primitive_operations():
## [Set image to black]
## [Select ROI]
smallImg = img[10:110,10:110]
_smallImg = img[10:110,10:110]
## [Select ROI]
## [BGR to Gray]
img = cv.imread('image.jpg')
grey = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
_grey = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
## [BGR to Gray]
src = np.ones((4,4), np.uint8)
## [Convert to CV_32F]
dst = src.astype(np.float32)
_dst = src.astype(np.float32)
## [Convert to CV_32F]
def visualize_images():

View File

@ -25,8 +25,8 @@ def gammaCorrection():
res = cv.LUT(img_original, lookUpTable)
## [changing-contrast-brightness-gamma-correction]
img_gamma_corrected = cv.hconcat([img_original, res]);
cv.imshow("Gamma correction", img_gamma_corrected);
img_gamma_corrected = cv.hconcat([img_original, res])
cv.imshow("Gamma correction", img_gamma_corrected)
def on_linear_transform_alpha_trackbar(val):
global alpha

View File

@ -85,13 +85,13 @@ contours, _ = cv.findContours(bw, cv.RETR_LIST, cv.CHAIN_APPROX_NONE)
for i, c in enumerate(contours):
# Calculate the area of each contour
area = cv.contourArea(c);
area = cv.contourArea(c)
# Ignore contours that are too small or too large
if area < 1e2 or 1e5 < area:
continue
# Draw each contour only for visualisation purposes
cv.drawContours(src, contours, i, (0, 0, 255), 2);
cv.drawContours(src, contours, i, (0, 0, 255), 2)
# Find the orientation of each shape
getOrientation(c, src)
## [contours]

View File

@ -70,7 +70,7 @@ def main():
draw_str(res, (20, 60), "frame interval : %.1f ms" % (frame_interval.value*1000))
cv.imshow('threaded video', res)
if len(pending) < threadn:
ret, frame = cap.read()
_ret, frame = cap.read()
t = clock()
frame_interval.update(t - last_frame_time)
last_frame_time = t

View File

@ -42,7 +42,7 @@ def main():
cv.createTrackbar("Focus", "Video", focus, 100, lambda v: cap.set(cv.CAP_PROP_FOCUS, v / 100))
while True:
status, img = cap.read()
_status, img = cap.read()
fourcc = decode_fourcc(cap.get(cv.CAP_PROP_FOURCC))