Merge remote-tracking branch 'upstream/3.4' into merge-3.4
@@ -1,225 +1,4 @@
Optical Flow {#tutorial_py_lucas_kanade}
============

Goal
----

In this chapter,
- We will understand the concepts of optical flow and its estimation using the Lucas-Kanade
  method.
- We will use functions like **cv.calcOpticalFlowPyrLK()** to track feature points in a
  video.

Optical Flow
------------

Optical flow is the pattern of apparent motion of image objects between two consecutive frames
caused by the movement of the object or the camera. It is a 2D vector field where each vector is a
displacement vector showing the movement of points from the first frame to the second. Consider the
image below (Image Courtesy: [Wikipedia article on Optical
Flow](http://en.wikipedia.org/wiki/Optical_flow)).



It shows a ball moving in 5 consecutive frames. The arrow shows its displacement vector. Optical
flow has many applications in areas like:

- Structure from Motion
- Video Compression
- Video Stabilization ...

Optical flow works on several assumptions:

-# The pixel intensities of an object do not change between consecutive frames.
2. Neighbouring pixels have similar motion.

Consider a pixel \f$I(x,y,t)\f$ in the first frame (note that a new dimension, time, is added here;
earlier we were working with images only, so there was no need for time). It moves by distance
\f$(dx,dy)\f$ in the next frame taken after \f$dt\f$ time. Since those pixels are the same and the
intensity does not change, we can say,

\f[I(x,y,t) = I(x+dx, y+dy, t+dt)\f]

Then take the Taylor series approximation of the right-hand side, remove the common terms and divide
by \f$dt\f$ to get the following equation:

\f[f_x u + f_y v + f_t = 0 \;\f]

where:

\f[f_x = \frac{\partial f}{\partial x} \; ; \; f_y = \frac{\partial f}{\partial y}\f]\f[u = \frac{dx}{dt} \; ; \; v = \frac{dy}{dt}\f]

The above equation is called the Optical Flow equation. In it, we can find \f$f_x\f$ and \f$f_y\f$:
they are the image gradients. Similarly, \f$f_t\f$ is the gradient along time. But \f$(u,v)\f$ is
unknown. We cannot solve this one equation with two unknown variables, so several methods are
provided to solve this problem, and one of them is Lucas-Kanade.

### Lucas-Kanade method

We have seen an assumption before, that all the neighbouring pixels will have similar motion. The
Lucas-Kanade method takes a 3x3 patch around the point, so all 9 points have the same motion. We
can find \f$(f_x, f_y, f_t)\f$ for these 9 points. Our problem now becomes solving 9 equations with
two unknown variables, which is over-determined. A better solution is obtained with the least
squares fit method. Below is the final solution, a two equation-two unknown problem: solve it to
get the answer.

\f[\begin{bmatrix} u \\ v \end{bmatrix} =
\begin{bmatrix}
    \sum_{i}{f_{x_i}}^2  &  \sum_{i}{f_{x_i} f_{y_i} } \\
    \sum_{i}{f_{x_i} f_{y_i}} & \sum_{i}{f_{y_i}}^2
\end{bmatrix}^{-1}
\begin{bmatrix}
    - \sum_{i}{f_{x_i} f_{t_i}} \\
    - \sum_{i}{f_{y_i} f_{t_i}}
\end{bmatrix}\f]

(Check the similarity of the inverse matrix with the Harris corner detector. It denotes that
corners are better points to be tracked.)
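To make the least-squares step concrete, here is a minimal NumPy sketch (an illustration added
alongside this tutorial, not OpenCV's implementation; the helper name, the use of `np.gradient`
for the derivatives, and the 3x3 patch layout are assumptions):
@code{.py}
import numpy as np

def lk_patch_flow(prev, curr, x, y, half=1):
    # Spatial gradients from the first frame; np.gradient returns (d/drow, d/dcol)
    fy, fx = np.gradient(prev.astype(np.float64))
    # Temporal gradient between the two frames
    ft = curr.astype(np.float64) - prev.astype(np.float64)

    # The 9 equations from the (2*half+1)x(2*half+1) patch around (x, y)
    sl = (slice(y - half, y + half + 1), slice(x - half, x + half + 1))
    A = np.stack([fx[sl].ravel(), fy[sl].ravel()], axis=1)  # 9x2 matrix of (f_x, f_y)
    b = -ft[sl].ravel()                                     # 9-vector of -f_t

    # Normal equations (A^T A) [u, v]^T = A^T b -- the 2x2 system shown above
    return np.linalg.solve(A.T @ A, A.T @ b)
@endcode
For a textured, corner-like patch \f$A^T A\f$ is well conditioned and invertible; on a flat patch
it is nearly singular, which is exactly why corners are the good points to track.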
So from the user's point of view, the idea is simple: we give some points to track, and we receive
the optical flow vectors of those points. But again there are some problems. Until now, we were
dealing with small motions, so this fails when there is a large motion. To deal with this we use
pyramids. When we go up in the pyramid, small motions are removed and large motions become small
motions. So by applying Lucas-Kanade there, we get optical flow along with the scale.

Lucas-Kanade Optical Flow in OpenCV
-----------------------------------

OpenCV provides all this in a single function, **cv.calcOpticalFlowPyrLK()**. Here, we create a
simple application which tracks some points in a video. To decide the points, we use
**cv.goodFeaturesToTrack()**. We take the first frame, detect some Shi-Tomasi corner points in it,
then we iteratively track those points using Lucas-Kanade optical flow. To the function
**cv.calcOpticalFlowPyrLK()** we pass the previous frame, previous points and next frame. It
returns the next points along with some status numbers which have a value of 1 if the next point
is found, else zero. We iteratively pass these next points as previous points in the next step. See
the code below:
@code{.py}
import numpy as np
import cv2 as cv

cap = cv.VideoCapture('slow.flv')

# params for ShiTomasi corner detection
feature_params = dict( maxCorners = 100,
                       qualityLevel = 0.3,
                       minDistance = 7,
                       blockSize = 7 )

# Parameters for lucas kanade optical flow
lk_params = dict( winSize  = (15,15),
                  maxLevel = 2,
                  criteria = (cv.TERM_CRITERIA_EPS | cv.TERM_CRITERIA_COUNT, 10, 0.03))

# Create some random colors
color = np.random.randint(0,255,(100,3))

# Take first frame and find corners in it
ret, old_frame = cap.read()
old_gray = cv.cvtColor(old_frame, cv.COLOR_BGR2GRAY)
p0 = cv.goodFeaturesToTrack(old_gray, mask = None, **feature_params)

# Create a mask image for drawing purposes
mask = np.zeros_like(old_frame)

while(1):
    ret,frame = cap.read()
    frame_gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)

    # calculate optical flow
    p1, st, err = cv.calcOpticalFlowPyrLK(old_gray, frame_gray, p0, None, **lk_params)

    # Select good points
    good_new = p1[st==1]
    good_old = p0[st==1]

    # draw the tracks
    for i,(new,old) in enumerate(zip(good_new,good_old)):
        a,b = new.ravel()
        c,d = old.ravel()
        mask = cv.line(mask, (a,b),(c,d), color[i].tolist(), 2)
        frame = cv.circle(frame,(a,b),5,color[i].tolist(),-1)
    img = cv.add(frame,mask)

    cv.imshow('frame',img)
    k = cv.waitKey(30) & 0xff
    if k == 27:
        break

    # Now update the previous frame and previous points
    old_gray = frame_gray.copy()
    p0 = good_new.reshape(-1,1,2)

cv.destroyAllWindows()
cap.release()
@endcode
(This code doesn't check how correct the next keypoints are. So even if a feature point disappears
from the image, there is a chance that optical flow finds a next point which merely looks close to
it. For robust tracking, corner points should actually be re-detected at particular intervals.
OpenCV comes with such a sample, which re-detects feature points every 5 frames. It also runs a
backward check on the optical flow points to select only the good ones. Check
samples/python/lk_track.py).
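A minimal sketch of such a backward check (a hypothetical helper, not the actual lk_track.py code):
track the points forward, track the results backward, and keep only the points that return close to
where they started.
@code{.py}
import cv2 as cv

def fb_check(old_gray, frame_gray, p0, lk_params, max_err=1.0):
    # Forward pass: old frame -> new frame
    p1, _st, _err = cv.calcOpticalFlowPyrLK(old_gray, frame_gray, p0, None, **lk_params)
    # Backward pass: new frame -> old frame
    p0r, _st, _err = cv.calcOpticalFlowPyrLK(frame_gray, old_gray, p1, None, **lk_params)
    # Keep a point only if its round trip lands within max_err pixels of the start
    d = abs(p0 - p0r).reshape(-1, 2).max(-1)
    good = d < max_err
    return p1[good], good
@endcode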
See the results we got:



Dense Optical Flow in OpenCV
----------------------------

The Lucas-Kanade method computes optical flow for a sparse feature set (in our example, corners
detected using the Shi-Tomasi algorithm). OpenCV provides another algorithm to find the dense
optical flow. It computes the optical flow for all the points in the frame. It is based on Gunnar
Farneback's algorithm, which is explained in "Two-Frame Motion Estimation Based on Polynomial
Expansion" by Gunnar Farneback in 2003.

The sample below shows how to find the dense optical flow using the above algorithm. We get a
2-channel array with optical flow vectors, \f$(u,v)\f$. We find their magnitude and direction. We
color code the result for better visualization. Direction corresponds to the Hue value of the
image. Magnitude corresponds to the Value plane. See the code below:
@code{.py}
import cv2 as cv
import numpy as np
cap = cv.VideoCapture("vtest.avi")

ret, frame1 = cap.read()
prvs = cv.cvtColor(frame1,cv.COLOR_BGR2GRAY)
hsv = np.zeros_like(frame1)
hsv[...,1] = 255

while(1):
    ret, frame2 = cap.read()
    next = cv.cvtColor(frame2,cv.COLOR_BGR2GRAY)

    flow = cv.calcOpticalFlowFarneback(prvs,next, None, 0.5, 3, 15, 3, 5, 1.2, 0)

    mag, ang = cv.cartToPolar(flow[...,0], flow[...,1])
    hsv[...,0] = ang*180/np.pi/2
    hsv[...,2] = cv.normalize(mag,None,0,255,cv.NORM_MINMAX)
    bgr = cv.cvtColor(hsv,cv.COLOR_HSV2BGR)

    cv.imshow('frame2',bgr)
    k = cv.waitKey(30) & 0xff
    if k == 27:
        break
    elif k == ord('s'):
        cv.imwrite('opticalfb.png',frame2)
        cv.imwrite('opticalhsv.png',bgr)
    prvs = next

cap.release()
cv.destroyAllWindows()
@endcode
See the result below:



OpenCV comes with a more advanced sample on dense optical flow, please see
samples/python/opt_flow.py.

Additional Resources
--------------------

Exercises
---------

-# Check the code in samples/python/lk_track.py. Try to understand the code.
2. Check the code in samples/python/opt_flow.py. Try to understand the code.

Tutorial content has been moved: @ref tutorial_optical_flow
@@ -1,185 +1,4 @@
Meanshift and Camshift {#tutorial_py_meanshift}
======================

Goal
----

In this chapter,

- We will learn about the Meanshift and Camshift algorithms to find and track objects in videos.

Meanshift
---------

The intuition behind meanshift is simple. Consider you have a set of points. (It can be a pixel
distribution like a histogram backprojection.) You are given a small window (maybe a circle) and
you have to move that window to the area of maximum pixel density (or maximum number of points).
It is illustrated in the simple image given below:



The initial window is shown as the blue circle named "C1". Its original center is marked by the
blue rectangle named "C1_o". But if you find the centroid of the points inside that window, you
will get the point "C1_r" (marked by the small blue circle), which is the real centroid of the
window. Surely they don't match. So move your window such that the circle of the new window matches
the previous centroid. Again find the new centroid. Most probably, it won't match. So move it
again, and continue the iterations until the center of the window and its centroid fall on the same
location (or within a small desired error). So finally what you obtain is a window with maximum
pixel distribution. It is marked with the green circle named "C2". As you can see in the image, it
has the maximum number of points. The whole process is demonstrated on a static image below:



So we normally pass the histogram backprojected image and the initial target location. When the
object moves, obviously the movement is reflected in the histogram backprojected image. As a
result, the meanshift algorithm moves our window to the new location with maximum density.
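As an illustration of this iteration, here is a minimal NumPy sketch (added for clarity, not the
tutorial's code; all names are illustrative) that shifts a window over a backprojection-like weight
image until the window center and the centroid coincide:
@code{.py}
import numpy as np

def mean_shift_window(backproj, x, y, w, h, max_iter=10, eps=1.0):
    ys, xs = np.mgrid[0:h, 0:w]
    for _ in range(max_iter):
        win = backproj[y:y+h, x:x+w].astype(np.float64)
        total = win.sum()
        if total == 0:
            break  # empty window: no centroid to move to
        # Centroid of the weights inside the window (moments M10/M00 and M01/M00)
        cx = (xs * win).sum() / total
        cy = (ys * win).sum() / total
        # Shift so the window center moves onto the centroid (clamped to the image)
        dx, dy = cx - (w - 1) / 2.0, cy - (h - 1) / 2.0
        x = min(max(int(round(x + dx)), 0), backproj.shape[1] - w)
        y = min(max(int(round(y + dy)), 0), backproj.shape[0] - h)
        if abs(dx) < eps and abs(dy) < eps:
            break  # center and centroid fall on (almost) the same location
    return x, y, w, h
@endcode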
### Meanshift in OpenCV

To use meanshift in OpenCV, first we need to set up the target and find its histogram, so that we
can backproject the target on each frame for the calculation of meanshift. We also need to provide
the initial location of the window. For the histogram, only Hue is considered here. Also, to avoid
false values due to low light, low light values are discarded using the **cv.inRange()** function.
@code{.py}
import numpy as np
import cv2 as cv

cap = cv.VideoCapture('slow.flv')

# take first frame of the video
ret,frame = cap.read()

# setup initial location of window
r,h,c,w = 250,90,400,125  # simply hardcoded the values
track_window = (c,r,w,h)

# set up the ROI for tracking
roi = frame[r:r+h, c:c+w]
hsv_roi = cv.cvtColor(roi, cv.COLOR_BGR2HSV)
mask = cv.inRange(hsv_roi, np.array((0., 60.,32.)), np.array((180.,255.,255.)))
roi_hist = cv.calcHist([hsv_roi],[0],mask,[180],[0,180])
cv.normalize(roi_hist,roi_hist,0,255,cv.NORM_MINMAX)

# Setup the termination criteria, either 10 iterations or move by at least 1 pt
term_crit = ( cv.TERM_CRITERIA_EPS | cv.TERM_CRITERIA_COUNT, 10, 1 )

while(1):
    ret ,frame = cap.read()

    if ret == True:
        hsv = cv.cvtColor(frame, cv.COLOR_BGR2HSV)
        dst = cv.calcBackProject([hsv],[0],roi_hist,[0,180],1)

        # apply meanshift to get the new location
        ret, track_window = cv.meanShift(dst, track_window, term_crit)

        # Draw it on image
        x,y,w,h = track_window
        img2 = cv.rectangle(frame, (x,y), (x+w,y+h), 255,2)
        cv.imshow('img2',img2)

        k = cv.waitKey(60) & 0xff
        if k == 27:
            break
        else:
            cv.imwrite(chr(k)+".jpg",img2)

    else:
        break

cv.destroyAllWindows()
cap.release()
@endcode
Three frames of the video I used are given below:



Camshift
--------

Did you closely watch the last result? There is a problem. Our window always has the same size,
whether the car is far away or very close to the camera. That is not good. We need to adapt the
window size to the size and rotation of the target. Once again, the solution came from "OpenCV
Labs" and it is called CAMshift (Continuously Adaptive Meanshift), published by Gary Bradsky in his
paper "Computer Vision Face Tracking for Use in a Perceptual User Interface" in 1998.

It applies meanshift first. Once meanshift converges, it updates the size of the window as
\f$s = 2 \times \sqrt{\frac{M_{00}}{256}}\f$. It also calculates the orientation of the best
fitting ellipse to it. Again it applies meanshift with the newly scaled search window and the
previous window location. The process is continued until the required accuracy is met.
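A tiny sketch of that size update (an added illustration of the formula above, not OpenCV's
internal CAMshift code):
@code{.py}
import numpy as np

def camshift_window_side(backproj, track_window):
    x, y, w, h = track_window
    # M00 is the zeroth moment: the sum of backprojection values inside the window
    m00 = float(backproj[y:y+h, x:x+w].sum())
    # s = 2 * sqrt(M00 / 256), as in the formula above (backprojection values are 0..255)
    return int(round(2.0 * np.sqrt(m00 / 256.0)))
@endcode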


### Camshift in OpenCV

It is almost the same as meanshift, but it returns a rotated rectangle (that is our result) and the
box parameters (to be passed as the search window in the next iteration). See the code below:
@code{.py}
import numpy as np
import cv2 as cv

cap = cv.VideoCapture('slow.flv')

# take first frame of the video
ret,frame = cap.read()

# setup initial location of window
r,h,c,w = 250,90,400,125  # simply hardcoded the values
track_window = (c,r,w,h)

# set up the ROI for tracking
roi = frame[r:r+h, c:c+w]
hsv_roi = cv.cvtColor(roi, cv.COLOR_BGR2HSV)
mask = cv.inRange(hsv_roi, np.array((0., 60.,32.)), np.array((180.,255.,255.)))
roi_hist = cv.calcHist([hsv_roi],[0],mask,[180],[0,180])
cv.normalize(roi_hist,roi_hist,0,255,cv.NORM_MINMAX)

# Setup the termination criteria, either 10 iterations or move by at least 1 pt
term_crit = ( cv.TERM_CRITERIA_EPS | cv.TERM_CRITERIA_COUNT, 10, 1 )

while(1):
    ret ,frame = cap.read()

    if ret == True:
        hsv = cv.cvtColor(frame, cv.COLOR_BGR2HSV)
        dst = cv.calcBackProject([hsv],[0],roi_hist,[0,180],1)

        # apply camshift to get the new location
        ret, track_window = cv.CamShift(dst, track_window, term_crit)

        # Draw it on image
        pts = cv.boxPoints(ret)
        pts = np.int0(pts)
        img2 = cv.polylines(frame,[pts],True, 255,2)
        cv.imshow('img2',img2)

        k = cv.waitKey(60) & 0xff
        if k == 27:
            break
        else:
            cv.imwrite(chr(k)+".jpg",img2)

    else:
        break

cv.destroyAllWindows()
cap.release()
@endcode
Three frames of the result are shown below:



Additional Resources
--------------------

-# French Wikipedia page on [Camshift](http://fr.wikipedia.org/wiki/Camshift). (The two animations
   are taken from there.)
2. Bradski, G.R., "Real time face and object tracking as a component of a perceptual user
   interface," Applications of Computer Vision, 1998. WACV '98. Proceedings., Fourth IEEE Workshop
   on, vol., no., pp. 214-219, 19-21 Oct 1998.

Exercises
---------

-# OpenCV comes with a Python sample for an interactive demo of camshift. Use it, hack it,
   understand it.

Tutorial content has been moved: @ref tutorial_meanshift
@@ -1,13 +1,13 @@
Video Analysis {#tutorial_py_table_of_contents_video}
==============

- @subpage tutorial_py_meanshift
- @ref tutorial_meanshift

    We have already seen an example of color-based tracking. It is simpler. This time, we see
    significantly better algorithms like "Meanshift", and its upgraded version, "Camshift", to
    find and track objects.

- @subpage tutorial_py_lucas_kanade
- @ref tutorial_optical_flow

    Now let's discuss an important concept, "Optical Flow", which is related to videos and has
    many applications.
(image assets moved unchanged)

doc/tutorials/video/meanshift/meanshift.markdown (new file, 118 lines)
@@ -0,0 +1,118 @@
Meanshift and Camshift {#tutorial_meanshift}
======================

Goal
----

In this chapter,

- We will learn about the Meanshift and Camshift algorithms to track objects in videos.

Meanshift
---------

The intuition behind meanshift is simple. Consider you have a set of points. (It can be a pixel
distribution like a histogram backprojection.) You are given a small window (maybe a circle) and
you have to move that window to the area of maximum pixel density (or maximum number of points).
It is illustrated in the simple image given below:



The initial window is shown as the blue circle named "C1". Its original center is marked by the
blue rectangle named "C1_o". But if you find the centroid of the points inside that window, you
will get the point "C1_r" (marked by the small blue circle), which is the real centroid of the
window. Surely they don't match. So move your window such that the circle of the new window matches
the previous centroid. Again find the new centroid. Most probably, it won't match. So move it
again, and continue the iterations until the center of the window and its centroid fall on the same
location (or within a small desired error). So finally what you obtain is a window with maximum
pixel distribution. It is marked with a green circle, named "C2". As you can see in the image, it
has the maximum number of points. The whole process is demonstrated on a static image below:



So we normally pass the histogram backprojected image and the initial target location. When the
object moves, obviously the movement is reflected in the histogram backprojected image. As a
result, the meanshift algorithm moves our window to the new location with maximum density.

### Meanshift in OpenCV

To use meanshift in OpenCV, first we need to set up the target and find its histogram, so that we
can backproject the target on each frame for the calculation of meanshift. We also need to provide
the initial location of the window. For the histogram, only Hue is considered here. Also, to avoid
false values due to low light, low light values are discarded using the **cv.inRange()** function.
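Before the full samples below, here is a condensed sketch of those steps (the video file name and
the hard-coded window are illustrative assumptions; the complete, tested versions follow in the
toggles):
@code{.py}
import numpy as np
import cv2 as cv

cap = cv.VideoCapture('slow_traffic_small.mp4')  # assumed input video
ret, frame = cap.read()

# Hard-coded initial window and its Hue histogram (low-light pixels masked out)
x, y, w, h = 300, 200, 100, 50
roi = frame[y:y+h, x:x+w]
hsv_roi = cv.cvtColor(roi, cv.COLOR_BGR2HSV)
mask = cv.inRange(hsv_roi, np.array((0., 60., 32.)), np.array((180., 255., 255.)))
roi_hist = cv.calcHist([hsv_roi], [0], mask, [180], [0, 180])
cv.normalize(roi_hist, roi_hist, 0, 255, cv.NORM_MINMAX)

track_window = (x, y, w, h)
term_crit = (cv.TERM_CRITERIA_EPS | cv.TERM_CRITERIA_COUNT, 10, 1)
while True:
    ret, frame = cap.read()
    if not ret:
        break
    hsv = cv.cvtColor(frame, cv.COLOR_BGR2HSV)
    dst = cv.calcBackProject([hsv], [0], roi_hist, [0, 180], 1)
    # meanshift moves the window toward the density peak of the backprojection
    ret, track_window = cv.meanShift(dst, track_window, term_crit)
@endcode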
@add_toggle_cpp
-   **Downloadable code**: Click
    [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/video/meanshift/meanshift.cpp)

-   **Code at a glance:**
    @include samples/cpp/tutorial_code/video/meanshift/meanshift.cpp
@end_toggle

@add_toggle_python
-   **Downloadable code**: Click
    [here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/video/meanshift/meanshift.py)

-   **Code at a glance:**
    @include samples/python/tutorial_code/video/meanshift/meanshift.py
@end_toggle

Three frames of the video I used are given below:



Camshift
--------

Did you closely watch the last result? There is a problem. Our window always has the same size,
whether the car is very far or very close to the camera. That is not good. We need to adapt the
window size to the size and rotation of the target. Once again, the solution came from "OpenCV
Labs" and it is called CAMshift (Continuously Adaptive Meanshift), published by Gary Bradsky in his
paper "Computer Vision Face Tracking for Use in a Perceptual User Interface" in 1998 @cite Bradski98.

It applies meanshift first. Once meanshift converges, it updates the size of the window as
\f$s = 2 \times \sqrt{\frac{M_{00}}{256}}\f$. It also calculates the orientation of the best
fitting ellipse to it. Again it applies meanshift with the newly scaled search window and the
previous window location. The process continues until the required accuracy is met.



### Camshift in OpenCV

It is similar to meanshift, but it returns a rotated rectangle (that is our result) and the box
parameters (to be passed as the search window in the next iteration). See the code below:

@add_toggle_cpp
-   **Downloadable code**: Click
    [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/video/meanshift/camshift.cpp)

-   **Code at a glance:**
    @include samples/cpp/tutorial_code/video/meanshift/camshift.cpp
@end_toggle

@add_toggle_python
-   **Downloadable code**: Click
    [here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/video/meanshift/camshift.py)

-   **Code at a glance:**
    @include samples/python/tutorial_code/video/meanshift/camshift.py
@end_toggle

Three frames of the result are shown below:



Additional Resources
--------------------

-# French Wikipedia page on [Camshift](http://fr.wikipedia.org/wiki/Camshift). (The two animations
   are taken from there.)
2. Bradski, G.R., "Real time face and object tracking as a component of a perceptual user
   interface," Applications of Computer Vision, 1998. WACV '98. Proceedings., Fourth IEEE Workshop
   on, vol., no., pp. 214-219, 19-21 Oct 1998.

Exercises
---------

-# OpenCV comes with a Python [sample](https://github.com/opencv/opencv/blob/master/samples/python/camshift.py) for an interactive demo of camshift. Use it, hack it,
   understand it.
(image assets moved unchanged)

doc/tutorials/video/optical_flow/optical_flow.markdown (new file, 156 lines)
@@ -0,0 +1,156 @@
Optical Flow {#tutorial_optical_flow}
============

Goal
----

In this chapter,
- We will understand the concepts of optical flow and its estimation using the Lucas-Kanade
  method.
- We will use functions like **cv.calcOpticalFlowPyrLK()** to track feature points in a
  video.
- We will create a dense optical flow field using the **cv.calcOpticalFlowFarneback()** method.

Optical Flow
------------

Optical flow is the pattern of apparent motion of image objects between two consecutive frames
caused by the movement of the object or the camera. It is a 2D vector field where each vector is a
displacement vector showing the movement of points from the first frame to the second. Consider the
image below (Image Courtesy: [Wikipedia article on Optical Flow](http://en.wikipedia.org/wiki/Optical_flow)).



It shows a ball moving in 5 consecutive frames. The arrow shows its displacement vector. Optical
flow has many applications in areas like:

- Structure from Motion
- Video Compression
- Video Stabilization ...

Optical flow works on several assumptions:

-# The pixel intensities of an object do not change between consecutive frames.
2. Neighbouring pixels have similar motion.

Consider a pixel \f$I(x,y,t)\f$ in the first frame (note that a new dimension, time, is added here;
earlier we were working with images only, so there was no need for time). It moves by distance
\f$(dx,dy)\f$ in the next frame taken after \f$dt\f$ time. Since those pixels are the same and the
intensity does not change, we can say,

\f[I(x,y,t) = I(x+dx, y+dy, t+dt)\f]

Then take the Taylor series approximation of the right-hand side:

\f[I(x+dx, y+dy, t+dt) \approx I(x,y,t) + \frac{\partial I}{\partial x}dx + \frac{\partial I}{\partial y}dy + \frac{\partial I}{\partial t}dt\f]

Cancel the common term \f$I(x,y,t)\f$ on both sides and divide by \f$dt\f$ to get the following
equation:

\f[f_x u + f_y v + f_t = 0 \;\f]

where:

\f[f_x = \frac{\partial f}{\partial x} \; ; \; f_y = \frac{\partial f}{\partial y}\f]\f[u = \frac{dx}{dt} \; ; \; v = \frac{dy}{dt}\f]

The above equation is called the Optical Flow equation. In it, we can find \f$f_x\f$ and \f$f_y\f$:
they are the image gradients. Similarly, \f$f_t\f$ is the gradient along time. But \f$(u,v)\f$ is
unknown. We cannot solve this one equation with two unknown variables, so several methods are
provided to solve this problem, and one of them is Lucas-Kanade.

### Lucas-Kanade method

We have seen an assumption before, that all the neighbouring pixels will have similar motion. The
Lucas-Kanade method takes a 3x3 patch around the point, so all 9 points have the same motion. We
can find \f$(f_x, f_y, f_t)\f$ for these 9 points. Our problem now becomes solving 9 equations with
two unknown variables, which is over-determined. A better solution is obtained with the least
squares fit method. Below is the final solution, a two equation-two unknown problem: solve it to
get the answer.

\f[\begin{bmatrix} u \\ v \end{bmatrix} =
\begin{bmatrix}
    \sum_{i}{f_{x_i}}^2  &  \sum_{i}{f_{x_i} f_{y_i} } \\
    \sum_{i}{f_{x_i} f_{y_i}} & \sum_{i}{f_{y_i}}^2
\end{bmatrix}^{-1}
\begin{bmatrix}
    - \sum_{i}{f_{x_i} f_{t_i}} \\
    - \sum_{i}{f_{y_i} f_{t_i}}
\end{bmatrix}\f]

(Check the similarity of the inverse matrix with the Harris corner detector. It denotes that
corners are better points to be tracked.)

So from the user's point of view, the idea is simple: we give some points to track, and we receive
the optical flow vectors of those points. But again there are some problems. Until now, we were
dealing with small motions, so this fails when there is a large motion. To deal with this we use
pyramids. When we go up in the pyramid, small motions are removed and large motions become small
motions. So by applying Lucas-Kanade there, we get optical flow along with the scale.
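As a minimal sketch of how the pyramid appears in the API (dummy inputs for illustration;
`maxLevel` counts the pyramid levels above the full-resolution image):
@code{.py}
import numpy as np
import cv2 as cv

# Two consecutive grayscale frames and points to track (dummy data for illustration)
old_gray = np.zeros((480, 640), np.uint8)
new_gray = np.zeros((480, 640), np.uint8)
p0 = np.array([[[100., 120.]], [[300., 240.]]], np.float32)  # shape (N, 1, 2)

# maxLevel=2 builds a 3-level pyramid: large motions at the top become small motions
lk_params = dict(winSize=(15, 15), maxLevel=2,
                 criteria=(cv.TERM_CRITERIA_EPS | cv.TERM_CRITERIA_COUNT, 10, 0.03))
p1, status, err = cv.calcOpticalFlowPyrLK(old_gray, new_gray, p0, None, **lk_params)
# status[i] == 1 where the flow for point i was found
@endcode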
Lucas-Kanade Optical Flow in OpenCV
-----------------------------------

OpenCV provides all this in a single function, **cv.calcOpticalFlowPyrLK()**. Here, we create a
simple application which tracks some points in a video. To decide the points, we use
**cv.goodFeaturesToTrack()**. We take the first frame, detect some Shi-Tomasi corner points in it,
then we iteratively track those points using Lucas-Kanade optical flow. To the function
**cv.calcOpticalFlowPyrLK()** we pass the previous frame, previous points and next frame. It
returns the next points along with some status numbers which have a value of 1 if the next point
is found, else zero. We iteratively pass these next points as previous points in the next step. See
the code below:

@add_toggle_cpp
-   **Downloadable code**: Click
    [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/video/optical_flow/optical_flow.cpp)

-   **Code at a glance:**
    @include samples/cpp/tutorial_code/video/optical_flow/optical_flow.cpp
@end_toggle

@add_toggle_python
-   **Downloadable code**: Click
    [here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/video/optical_flow/optical_flow.py)

-   **Code at a glance:**
    @include samples/python/tutorial_code/video/optical_flow/optical_flow.py
@end_toggle

(This code doesn't check how correct the next keypoints are. So even if a feature point disappears
from the image, there is a chance that optical flow finds a next point which merely looks close to
it. For robust tracking, corner points should actually be re-detected at particular intervals.
OpenCV comes with such a sample, which re-detects feature points every 5 frames. It also runs a
backward check on the optical flow points to select only the good ones. Check
samples/python/lk_track.py).

See the results we got:



Dense Optical Flow in OpenCV
----------------------------

The Lucas-Kanade method computes optical flow for a sparse feature set (in our example, corners
detected using the Shi-Tomasi algorithm). OpenCV provides another algorithm to find the dense
optical flow. It computes the optical flow for all the points in the frame. It is based on Gunnar
Farneback's algorithm, which is explained in "Two-Frame Motion Estimation Based on Polynomial
Expansion" by Gunnar Farneback in 2003.

The sample below shows how to find the dense optical flow using the above algorithm. We get a
2-channel array with optical flow vectors, \f$(u,v)\f$. We find their magnitude and direction. We
color code the result for better visualization. Direction corresponds to the Hue value of the
image. Magnitude corresponds to the Value plane. See the code below:

@add_toggle_cpp
-   **Downloadable code**: Click
    [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/video/optical_flow/optical_flow_dense.cpp)

-   **Code at a glance:**
    @include samples/cpp/tutorial_code/video/optical_flow/optical_flow_dense.cpp
@end_toggle

@add_toggle_python
-   **Downloadable code**: Click
    [here](https://github.com/opencv/opencv/tree/master/samples/python/tutorial_code/video/optical_flow/optical_flow_dense.py)

-   **Code at a glance:**
    @include samples/python/tutorial_code/video/optical_flow/optical_flow_dense.py
@end_toggle
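The color-coding step on its own, as a small sketch (assuming `flow` is the 2-channel float array
returned by **cv.calcOpticalFlowFarneback()** and `shape` is the shape of the BGR frame):
@code{.py}
import numpy as np
import cv2 as cv

def flow_to_bgr(flow, shape):
    hsv = np.zeros(shape, np.uint8)      # same shape as the original BGR frame
    hsv[..., 1] = 255                    # full saturation
    mag, ang = cv.cartToPolar(flow[..., 0], flow[..., 1])
    hsv[..., 0] = ang * 180 / np.pi / 2  # direction -> Hue (OpenCV hue range is 0..180)
    hsv[..., 2] = cv.normalize(mag, None, 0, 255, cv.NORM_MINMAX)  # magnitude -> Value
    return cv.cvtColor(hsv, cv.COLOR_HSV2BGR)
@endcode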
See the result below:


@@ -14,3 +14,15 @@ tracking and foreground extractions.

    We will learn how to extract foreground masks from both videos and sequences of images and
    to show them.

- @subpage tutorial_meanshift

    *Languages:* C++, Python

    Learn how to use the Meanshift and Camshift algorithms to track objects in videos.

- @subpage tutorial_optical_flow

    *Languages:* C++, Python

    We will learn how to use optical flow methods to track sparse features or to create a dense representation.
@@ -80,11 +80,8 @@ namespace cv
class LMSolverImpl CV_FINAL : public LMSolver
{
public:
    LMSolverImpl() : maxIters(100) { init(); }
    LMSolverImpl(const Ptr<LMSolver::Callback>& _cb, int _maxIters) : cb(_cb), epsx(FLT_EPSILON), epsf(FLT_EPSILON), maxIters(_maxIters) { init(); }
    LMSolverImpl(const Ptr<LMSolver::Callback>& _cb, int _maxIters, double _eps) : cb(_cb), epsx(_eps), epsf(_eps), maxIters(_maxIters) { init(); }

    void init()
    LMSolverImpl(const Ptr<LMSolver::Callback>& _cb, int _maxIters, double _eps = FLT_EPSILON)
        : cb(_cb), epsx(_eps), epsf(_eps), maxIters(_maxIters)
    {
        printInterval = 0;
    }
@@ -1130,7 +1130,7 @@ public:
            CV_Error( Error::StsOutOfRange, "SADWindowSize must be odd, be within 5..255 and be not larger than image width or height" );

        if( params.numDisparities <= 0 || params.numDisparities % 16 != 0 )
            CV_Error( Error::StsOutOfRange, "numDisparities must be positive and divisble by 16" );
            CV_Error( Error::StsOutOfRange, "numDisparities must be positive and divisible by 16" );

        if( params.textureThreshold < 0 )
            CV_Error( Error::StsOutOfRange, "texture threshold must be non-negative" );
@@ -1015,6 +1015,34 @@ OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd
////////// Reduce and mask /////////

/** Reduce **/
inline unsigned v_reduce_sum(const v_uint8x32& a)
{
    __m256i half = _mm256_sad_epu8(a.val, _mm256_setzero_si256());
    __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline int v_reduce_sum(const v_int8x32& a)
{
    __m256i half = _mm256_sad_epu8(_mm256_xor_si256(a.val, _mm256_set1_epi8((schar)-128)), _mm256_setzero_si256());
    __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))) - 4096;
}
#define OPENCV_HAL_IMPL_AVX_REDUCE_32(_Tpvec, sctype, func, intrin) \
    inline sctype v_reduce_##func(const _Tpvec& a) \
    { \
        __m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val)); \
        val = intrin(val, _mm_srli_si128(val,8)); \
        val = intrin(val, _mm_srli_si128(val,4)); \
        val = intrin(val, _mm_srli_si128(val,2)); \
        val = intrin(val, _mm_srli_si128(val,1)); \
        return (sctype)_mm_cvtsi128_si32(val); \
    }

OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, min, _mm_min_epu8)
OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32, schar, min, _mm_min_epi8)
OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, max, _mm_max_epu8)
OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32, schar, max, _mm_max_epi8)

#define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \
    inline sctype v_reduce_##func(const _Tpvec& a) \
    { \
@@ -1062,31 +1090,6 @@ OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, max, _mm_max_epi32)
OPENCV_HAL_IMPL_AVX_REDUCE_FLT(min, _mm_min_ps)
OPENCV_HAL_IMPL_AVX_REDUCE_FLT(max, _mm_max_ps)

inline ushort v_reduce_sum(const v_uint16x16& a)
{
    __m128i a0 = _v256_extract_low(a.val);
    __m128i a1 = _v256_extract_high(a.val);

    __m128i s0 = _mm_adds_epu16(a0, a1);
    s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
    s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
    s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 2));

    return (ushort)_mm_cvtsi128_si32(s0);
}

inline short v_reduce_sum(const v_int16x16& a)
{
    __m256i s0 = _mm256_hadds_epi16(a.val, a.val);
    s0 = _mm256_hadds_epi16(s0, s0);
    s0 = _mm256_hadds_epi16(s0, s0);

    __m128i s1 = _v256_extract_high(s0);
    s1 = _mm_adds_epi16(_v256_extract_low(s0), s1);

    return (short)_mm_cvtsi128_si32(s1);
}

inline int v_reduce_sum(const v_int32x8& a)
{
    __m256i s0 = _mm256_hadd_epi32(a.val, a.val);
@@ -1101,6 +1104,11 @@ inline int v_reduce_sum(const v_int32x8& a)
inline unsigned v_reduce_sum(const v_uint32x8& a)
{ return v_reduce_sum(v_reinterpret_as_s32(a)); }

inline int v_reduce_sum(const v_int16x16& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline unsigned v_reduce_sum(const v_uint16x16& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }

inline float v_reduce_sum(const v_float32x8& a)
{
    __m256 s0 = _mm256_hadd_ps(a.val, a.val);
@@ -1112,6 +1120,18 @@ inline float v_reduce_sum(const v_float32x8& a)
    return _mm_cvtss_f32(s1);
}

inline uint64 v_reduce_sum(const v_uint64x4& a)
{
    uint64 CV_DECL_ALIGNED(32) idx[2];
    _mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
    return idx[0] + idx[1];
}
inline int64 v_reduce_sum(const v_int64x4& a)
{
    int64 CV_DECL_ALIGNED(32) idx[2];
    _mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
    return idx[0] + idx[1];
}
inline double v_reduce_sum(const v_float64x4& a)
{
    __m256d s0 = _mm256_hadd_pd(a.val, a.val);
@@ -1166,26 +1186,39 @@ inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
}

/** Popcount **/
#define OPENCV_HAL_IMPL_AVX_POPCOUNT(_Tpvec) \
    inline v_uint32x8 v_popcount(const _Tpvec& a) \
    { \
        const v_uint32x8 m1 = v256_setall_u32(0x55555555); \
        const v_uint32x8 m2 = v256_setall_u32(0x33333333); \
        const v_uint32x8 m4 = v256_setall_u32(0x0f0f0f0f); \
        v_uint32x8 p = v_reinterpret_as_u32(a); \
        p = ((p >> 1) & m1) + (p & m1); \
        p = ((p >> 2) & m2) + (p & m2); \
        p = ((p >> 4) & m4) + (p & m4); \
        p.val = _mm256_sad_epu8(p.val, _mm256_setzero_si256()); \
        return p; \
inline v_uint8x32 v_popcount(const v_uint8x32& a)
{
    __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
                                             0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
    __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
    return v_uint8x32(_mm256_add_epi8(_mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256( a.val , _popcnt_mask)),
                                      _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_mm256_srli_epi16(a.val, 4), _popcnt_mask))));
}

OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint8x32)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int8x32)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint16x16)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int16x16)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint32x8)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int32x8)
inline v_uint16x16 v_popcount(const v_uint16x16& a)
{
    v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
    p += v_rotate_right<1>(p);
    return v_reinterpret_as_u16(p) & v256_setall_u16(0x00ff);
}
inline v_uint32x8 v_popcount(const v_uint32x8& a)
{
    v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
    p += v_rotate_right<1>(p);
    p += v_rotate_right<2>(p);
    return v_reinterpret_as_u32(p) & v256_setall_u32(0x000000ff);
}
inline v_uint64x4 v_popcount(const v_uint64x4& a)
{
    return v_uint64x4(_mm256_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm256_setzero_si256()));
}
inline v_uint8x32 v_popcount(const v_int8x32& a)
{ return v_popcount(v_reinterpret_as_u8(a)); }
inline v_uint16x16 v_popcount(const v_int16x16& a)
{ return v_popcount(v_reinterpret_as_u16(a)); }
inline v_uint32x8 v_popcount(const v_int32x8& a)
{ return v_popcount(v_reinterpret_as_u32(a)); }
inline v_uint64x4 v_popcount(const v_int64x4& a)
{ return v_popcount(v_reinterpret_as_u64(a)); }

/** Mask **/
inline int v_signmask(const v_int8x32& a)
@@ -603,27 +603,20 @@ static const unsigned char popCountTable[] =
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};
/** @brief Count the 1 bits in the vector and return 4 values
/** @brief Count the 1 bits in the vector lanes and return result as corresponding unsigned type

Scheme:
@code
{A1 A2 A3 ...} => popcount(A1)
{A1 A2 A3 ...} => {popcount(A1), popcount(A2), popcount(A3), ...}
@endcode
Any types but result will be in v_uint32x4*/
template<typename _Tp, int n> inline v_uint32x4 v_popcount(const v_reg<_Tp, n>& a)
For all integer types. */
template<typename _Tp, int n>
inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_popcount(const v_reg<_Tp, n>& a)
{
    v_uint8x16 b;
    b = v_reinterpret_as_u8(a);
    for( int i = 0; i < v_uint8x16::nlanes; i++ )
    {
        b.s[i] = popCountTable[b.s[i]];
    }
    v_uint32x4 c;
    for( int i = 0; i < v_uint32x4::nlanes; i++ )
    {
        c.s[i] = b.s[i*4] + b.s[i*4+1] + b.s[i*4+2] + b.s[i*4+3];
    }
    return c;
    v_reg<typename V_TypeTraits<_Tp>::abs_type, n> b = v_reg<typename V_TypeTraits<_Tp>::abs_type, n>::zero();
    for (int i = 0; i < (int)(n*sizeof(_Tp)); i++)
        b.s[i/sizeof(_Tp)] += popCountTable[v_reinterpret_as_u8(a).s[i]];
    return b;
}

@@ -910,6 +910,31 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
#endif

inline unsigned v_reduce_sum(const v_uint8x16& a)
{
    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(a.val));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline int v_reduce_sum(const v_int8x16& a)
{
    int32x4_t t0 = vpaddlq_s16(vpaddlq_s8(a.val));
    int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
    return vget_lane_s32(vpadd_s32(t1, t1), 0);
}
inline unsigned v_reduce_sum(const v_uint16x8& a)
{
    uint32x4_t t0 = vpaddlq_u16(a.val);
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline int v_reduce_sum(const v_int16x8& a)
{
    int32x4_t t0 = vpaddlq_s16(a.val);
    int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
    return vget_lane_s32(vpadd_s32(t1, t1), 0);
}

#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
@@ -918,12 +943,10 @@ inline scalartype v_reduce_##func(const _Tpvec& a) \
    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
}

OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, sum, add, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, max, max, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, min, min, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, sum, add, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned int, max, max, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned int, min, min, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, int, max, max, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, int, min, min, s16)

#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
@@ -942,6 +965,10 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)

inline uint64 v_reduce_sum(const v_uint64x2& a)
{ return vget_lane_u64(vadd_u64(vget_low_u64(a.val), vget_high_u64(a.val)),0); }
inline int64 v_reduce_sum(const v_int64x2& a)
{ return vget_lane_s64(vadd_s64(vget_low_s64(a.val), vget_high_s64(a.val)),0); }
#if CV_SIMD128_64F
inline double v_reduce_sum(const v_float64x2& a)
{
@@ -1007,21 +1034,22 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
    return vget_lane_f32(vpadd_f32(t1, t1), 0);
}

#define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
    uint8x16_t t = vcntq_u8(cast(a.val)); \
    uint16x8_t t0 = vpaddlq_u8(t);  /* 16 -> 8 */ \
    uint32x4_t t1 = vpaddlq_u16(t0); /* 8 -> 4 */ \
    return v_uint32x4(t1); \
}

OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint8x16, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint16x8, vreinterpretq_u8_u16)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint32x4, vreinterpretq_u8_u32)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int8x16, vreinterpretq_u8_s8)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int16x8, vreinterpretq_u8_s16)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int32x4, vreinterpretq_u8_s32)
inline v_uint8x16 v_popcount(const v_uint8x16& a)
{ return v_uint8x16(vcntq_u8(a.val)); }
inline v_uint8x16 v_popcount(const v_int8x16& a)
{ return v_uint8x16(vcntq_u8(vreinterpretq_u8_s8(a.val))); }
inline v_uint16x8 v_popcount(const v_uint16x8& a)
{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u16(a.val)))); }
inline v_uint16x8 v_popcount(const v_int16x8& a)
{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s16(a.val)))); }
inline v_uint32x4 v_popcount(const v_uint32x4& a)
{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u32(a.val))))); }
inline v_uint32x4 v_popcount(const v_int32x4& a)
{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s32(a.val))))); }
inline v_uint64x2 v_popcount(const v_uint64x2& a)
{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(a.val)))))); }
inline v_uint64x2 v_popcount(const v_int64x2& a)
{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s64(a.val)))))); }

inline int v_signmask(const v_uint8x16& a)
{
@@ -302,8 +302,8 @@ inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
{ return _Tpvec(cast(a.val)); }

OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, schar, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, schar, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
@@ -1393,6 +1393,41 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)

inline unsigned v_reduce_sum(const v_uint8x16& a)
{
    __m128i half = _mm_sad_epu8(a.val, _mm_setzero_si128());
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
}
inline int v_reduce_sum(const v_int8x16& a)
{
    __m128i half = _mm_set1_epi8((schar)-128);
    half = _mm_sad_epu8(_mm_xor_si128(a.val, half), _mm_setzero_si128());
    return _mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half))) - 2048;
}
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(func) \
inline schar v_reduce_##func(const v_int8x16& a) \
{ \
    __m128i val = a.val; \
    __m128i smask = _mm_set1_epi8((schar)-128); \
    val = _mm_xor_si128(val, smask); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
    return (schar)_mm_cvtsi128_si32(val) ^ (schar)-128; \
} \
inline uchar v_reduce_##func(const v_uint8x16& a) \
{ \
    __m128i val = a.val; \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
    return (uchar)_mm_cvtsi128_si32(val); \
}
OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(min)

#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
{ \
@@ -1412,26 +1447,8 @@ inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
    return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \
}
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(_Tpvec, scalartype, suffix) \
inline scalartype v_reduce_sum(const v_##_Tpvec& a) \
{ \
    __m128i val = a.val; \
    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 8)); \
    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 4)); \
    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 2)); \
    return (scalartype)_mm_cvtsi128_si32(val); \
} \
inline unsigned scalartype v_reduce_sum(const v_u##_Tpvec& a) \
{ \
    __m128i val = a.val; \
    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 8)); \
    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 4)); \
    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 2)); \
    return (unsigned scalartype)_mm_cvtsi128_si32(val); \
}
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(int16x8, short, 16)

#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
@@ -1456,6 +1473,23 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)

inline int v_reduce_sum(const v_int16x8& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline unsigned v_reduce_sum(const v_uint16x8& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }

inline uint64 v_reduce_sum(const v_uint64x2& a)
{
    uint64 CV_DECL_ALIGNED(32) idx[2];
    v_store_aligned(idx, a);
    return idx[0] + idx[1];
}
inline int64 v_reduce_sum(const v_int64x2& a)
{
    int64 CV_DECL_ALIGNED(32) idx[2];
    v_store_aligned(idx, a);
    return idx[0] + idx[1];
}
inline double v_reduce_sum(const v_float64x2& a)
{
    double CV_DECL_ALIGNED(32) idx[2];
@@ -1520,27 +1554,42 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
    return v_reduce_sum(v_absdiff(a, b));
}

#define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
    __m128i m1 = _mm_set1_epi32(0x55555555); \
    __m128i m2 = _mm_set1_epi32(0x33333333); \
    __m128i m4 = _mm_set1_epi32(0x0f0f0f0f); \
    __m128i p = a.val; \
    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1)); \
    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2)); \
    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4)); \
    p = _mm_adds_epi8(p, _mm_srli_si128(p, 1)); \
    p = _mm_adds_epi8(p, _mm_srli_si128(p, 2)); \
    return v_uint32x4(_mm_and_si128(p, _mm_set1_epi32(0x000000ff))); \
inline v_uint8x16 v_popcount(const v_uint8x16& a)
{
    __m128i m1 = _mm_set1_epi32(0x55555555);
    __m128i m2 = _mm_set1_epi32(0x33333333);
    __m128i m4 = _mm_set1_epi32(0x0f0f0f0f);
    __m128i p = a.val;
    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1));
    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2));
    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4));
    return v_uint8x16(p);
}

OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint8x16)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint16x8)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint32x4)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int8x16)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int16x8)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int32x4)
inline v_uint16x8 v_popcount(const v_uint16x8& a)
{
    v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
    p += v_rotate_right<1>(p);
    return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
}
inline v_uint32x4 v_popcount(const v_uint32x4& a)
{
    v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
    p += v_rotate_right<1>(p);
    p += v_rotate_right<2>(p);
    return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
}
inline v_uint64x2 v_popcount(const v_uint64x2& a)
{
    return v_uint64x2(_mm_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm_setzero_si128()));
}
inline v_uint8x16 v_popcount(const v_int8x16& a)
{ return v_popcount(v_reinterpret_as_u8(a)); }
inline v_uint16x8 v_popcount(const v_int16x8& a)
{ return v_popcount(v_reinterpret_as_u16(a)); }
inline v_uint32x4 v_popcount(const v_int32x4& a)
{ return v_popcount(v_reinterpret_as_u32(a)); }
inline v_uint64x2 v_popcount(const v_int64x2& a)
{ return v_popcount(v_reinterpret_as_u64(a)); }

#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
inline int v_signmask(const _Tpvec& a) \
@@ -692,15 +692,27 @@ inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
////////// Reduce and mask /////////

/** Reduce **/
inline short v_reduce_sum(const v_int16x8& a)
inline uint v_reduce_sum(const v_uint8x16& a)
{
    const vec_uint4 zero4 = vec_uint4_z;
    vec_uint4 sum4 = vec_sum4s(a.val, zero4);
    return (uint)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
}
inline int v_reduce_sum(const v_int8x16& a)
{
    const vec_int4 zero4 = vec_int4_z;
    vec_int4 sum4 = vec_sum4s(a.val, zero4);
    return (int)vec_extract(vec_sums(sum4, zero4), 3);
}
inline int v_reduce_sum(const v_int16x8& a)
{
    const vec_int4 zero = vec_int4_z;
    return saturate_cast<short>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
    return saturate_cast<int>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
}
inline ushort v_reduce_sum(const v_uint16x8& a)
inline uint v_reduce_sum(const v_uint16x8& a)
{
    const vec_int4 v4 = vec_int4_c(vec_unpackhu(vec_adds(a.val, vec_sld(a.val, a.val, 8))));
    return saturate_cast<ushort>(vec_extract(vec_sums(v4, vec_int4_z), 3));
    return saturate_cast<uint>(vec_extract(vec_sums(v4, vec_int4_z), 3));
}

#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(_Tpvec, _Tpvec2, scalartype, suffix, func) \
@ -719,6 +731,14 @@ OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, sum, vec_add)
|
||||
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, max, vec_max)
|
||||
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, min, vec_min)
|
||||
|
||||
inline uint64 v_reduce_sum(const v_uint64x2& a)
|
||||
{
|
||||
return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
|
||||
}
|
||||
inline int64 v_reduce_sum(const v_int64x2& a)
|
||||
{
|
||||
return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
|
||||
}
|
||||
inline double v_reduce_sum(const v_float64x2& a)
|
||||
{
|
||||
return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
|
||||
@ -736,6 +756,19 @@ OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, min, vec_min)
|
||||
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, max, vec_max)
|
||||
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, min, vec_min)
|
||||
|
||||
#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(_Tpvec, _Tpvec2, scalartype, suffix, func) \
|
||||
inline scalartype v_reduce_##suffix(const _Tpvec& a) \
|
||||
{ \
|
||||
_Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
|
||||
rs = func(rs, vec_sld(rs, rs, 4)); \
|
||||
rs = func(rs, vec_sld(rs, rs, 2)); \
|
||||
return vec_extract(func(rs, vec_sld(rs, rs, 1)), 0); \
|
||||
}
|
||||
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint8x16, vec_uchar16, uchar, max, vec_max)
|
||||
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint8x16, vec_uchar16, uchar, min, vec_min)
|
||||
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int8x16, vec_char16, schar, max, vec_max)
|
||||
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int8x16, vec_char16, schar, min, vec_min)
|
||||
|
||||
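
The 16-lane reduce macro above folds the vector onto a copy of itself shifted by 8, 4, 2 and 1 bytes; the same log2 reduction over a plain array, as an illustrative sketch (not part of the patch):

// Illustrative scalar equivalent of the vec_sld folding above (max case).
static inline unsigned char reduce_max_16(unsigned char v[16])
{
    for (int step = 8; step >= 1; step /= 2)   // 8, 4, 2, 1 -- four folds
        for (int i = 0; i < step; ++i)
            v[i] = v[i] > v[i + step] ? v[i] : v[i + step];
    return v[0];
}
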
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
@@ -792,9 +825,22 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
}

/** Popcount **/
template<typename _Tpvec>
inline v_uint32x4 v_popcount(const _Tpvec& a)
{ return v_uint32x4(vec_popcntu(vec_uint4_c(a.val))); }
inline v_uint8x16 v_popcount(const v_uint8x16& a)
{ return v_uint8x16(vec_popcntu(a.val)); }
inline v_uint8x16 v_popcount(const v_int8x16& a)
{ return v_uint8x16(vec_popcntu(a.val)); }
inline v_uint16x8 v_popcount(const v_uint16x8& a)
{ return v_uint16x8(vec_popcntu(a.val)); }
inline v_uint16x8 v_popcount(const v_int16x8& a)
{ return v_uint16x8(vec_popcntu(a.val)); }
inline v_uint32x4 v_popcount(const v_uint32x4& a)
{ return v_uint32x4(vec_popcntu(a.val)); }
inline v_uint32x4 v_popcount(const v_int32x4& a)
{ return v_uint32x4(vec_popcntu(a.val)); }
inline v_uint64x2 v_popcount(const v_uint64x2& a)
{ return v_uint64x2(vec_popcntu(a.val)); }
inline v_uint64x2 v_popcount(const v_int64x2& a)
{ return v_uint64x2(vec_popcntu(a.val)); }

/** Mask **/
inline int v_signmask(const v_uint8x16& a)
@@ -32,28 +32,15 @@ int normHamming(const uchar* a, int n)

    int i = 0;
    int result = 0;
#if CV_AVX2
#if CV_SIMD && CV_SIMD_WIDTH > 16

    {
        __m256i _r0 = _mm256_setzero_si256();
        __m256i _0 = _mm256_setzero_si256();
        __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
                                                 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
        __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);

        for(; i <= n - 32; i+= 32)
        {
            __m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));

            __m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_a0, _popcnt_mask));
            __m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
                                                 _mm256_and_si256(_mm256_srli_epi16(_a0, 4), _popcnt_mask));

            _r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
        v_uint64 t = vx_setzero_u64();
        for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i)));
        result = (int)v_reduce_sum(t);
        }
        _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
        result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
    }
#endif // CV_AVX2
#endif

#if CV_POPCNT
    {
@@ -68,18 +55,14 @@ int normHamming(const uchar* a, int n)
            result += CV_POPCNT_U32(*(uint*)(a + i));
        }
    }
#endif // CV_POPCNT

#if CV_SIMD128
#elif CV_SIMD
    {
        v_uint32x4 t = v_setzero_u32();
        v_uint64x2 t = v_setzero_u64();
        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
        {
            t += v_popcount(v_load(a + i));
            t += v_popcount(v_reinterpret_as_u64(v_load(a + i)));
        result += (int)v_reduce_sum(t);
        }
        result += v_reduce_sum(t);
    }
#endif // CV_SIMD128
#endif
#if CV_ENABLE_UNROLLED
    for(; i <= n - 4; i += 4)
    {
@@ -100,31 +83,15 @@ int normHamming(const uchar* a, const uchar* b, int n)

    int i = 0;
    int result = 0;
#if CV_AVX2

#if CV_SIMD && CV_SIMD_WIDTH > 16
    {
        __m256i _r0 = _mm256_setzero_si256();
        __m256i _0 = _mm256_setzero_si256();
        __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
                                                 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
        __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);

        for(; i <= n - 32; i+= 32)
        {
            __m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
            __m256i _b0 = _mm256_loadu_si256((const __m256i*)(b + i));

            __m256i _xor = _mm256_xor_si256(_a0, _b0);

            __m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_xor, _popcnt_mask));
            __m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
                                                 _mm256_and_si256(_mm256_srli_epi16(_xor, 4), _popcnt_mask));

            _r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
        v_uint64 t = vx_setzero_u64();
        for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i) ^ vx_load(b + i)));
        result += (int)v_reduce_sum(t);
        }
        _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
        result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
    }
#endif // CV_AVX2
#endif

#if CV_POPCNT
    {
@@ -139,18 +106,14 @@ int normHamming(const uchar* a, const uchar* b, int n)
            result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
        }
    }
#endif // CV_POPCNT

#if CV_SIMD128
#elif CV_SIMD
    {
        v_uint32x4 t = v_setzero_u32();
        v_uint64x2 t = v_setzero_u64();
        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
        {
            t += v_popcount(v_load(a + i) ^ v_load(b + i));
            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i) ^ vx_load(b + i)));
        result += (int)v_reduce_sum(t);
        }
        result += v_reduce_sum(t);
    }
#endif // CV_SIMD128
#endif
#if CV_ENABLE_UNROLLED
    for(; i <= n - 4; i += 4)
    {
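
Both normHamming overloads now share one width-agnostic pattern: accumulate v_popcount into 64-bit lanes inside the loop and reduce once at the end. A minimal sketch of that pattern with a scalar tail, assuming only the universal-intrinsics header (a hypothetical helper, not the actual OpenCV implementation):

#include "opencv2/core/hal/intrin.hpp"

static int hammingWeight(const uchar* a, int n) // hypothetical helper
{
    int i = 0, result = 0;
#if CV_SIMD
    cv::v_uint64 t = cv::vx_setzero_u64();
    for (; i <= n - cv::v_uint8::nlanes; i += cv::v_uint8::nlanes)
        t += cv::v_popcount(cv::v_reinterpret_as_u64(cv::vx_load(a + i)));
    result = (int)cv::v_reduce_sum(t);
#endif
    for (; i < n; ++i)                                // scalar tail
        for (uchar v = a[i]; v; v &= (uchar)(v - 1))  // Kernighan's trick
            ++result;
    return result;
}
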
@@ -1195,7 +1195,7 @@ CV_IMPL const char* cvErrorStr( int status )
    case CV_BadDepth : return "Input image depth is not supported by function";
    case CV_StsUnmatchedFormats : return "Formats of input arguments do not match";
    case CV_StsUnmatchedSizes : return "Sizes of input arguments do not match";
    case CV_StsOutOfRange : return "One of arguments\' values is out of range";
    case CV_StsOutOfRange : return "One of the arguments\' values is out of range";
    case CV_StsUnsupportedFormat : return "Unsupported format or combination of formats";
    case CV_BadCOI : return "Input COI is not supported";
    case CV_BadNumChannels : return "Bad number of channels";
@@ -686,18 +686,24 @@ template<typename R> struct TheTest

    TheTest & test_popcount()
    {
        typedef typename V_RegTraits<R>::u_reg Ru;
        static unsigned popcountTable[] = {
            0, 1, 2, 4, 5, 7, 9, 12, 13, 15, 17, 20, 22, 25, 28, 32, 33,
            35, 37, 40, 42, 45, 48, 52, 54, 57, 60, 64, 67, 71, 75, 80, 81,
            83, 85, 88, 90, 93, 96, 100, 102, 105, 108, 112, 115, 119, 123,
            128, 130, 133, 136, 140, 143, 147, 151, 156, 159, 163, 167, 172,
            176, 181, 186, 192, 193
            0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, //0x00-0x0f
            1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, //0x10-0x1f
            1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, //0x20-0x2f
            2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, //0x30-0x3f
            1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, //0x40-0x4f
            2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, //0x50-0x5f
            2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, //0x60-0x6f
            3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, //0x70-0x7f
            1 //0x80
        };
        Data<R> dataA;
        R a = dataA;

        unsigned resB = (unsigned)v_reduce_sum(v_popcount(a));
        EXPECT_EQ(popcountTable[R::nlanes], resB);
        Data<Ru> resB = v_popcount(a);
        for (int i = 0; i < Ru::nlanes; ++i)
            EXPECT_EQ(popcountTable[i + 1], resB[i]);

        return *this;
    }
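
The rewritten table is indexed by lane value rather than by cumulative lane count: entry v is simply the number of set bits in v. A short sketch that regenerates it, for illustration only:

// Illustrative generator for the per-value table above (0x00..0x80).
unsigned popcountTable[129];
popcountTable[0] = 0;
for (unsigned v = 1; v <= 128; ++v)
    popcountTable[v] = popcountTable[v >> 1] + (v & 1);
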
@@ -2794,9 +2794,6 @@ AsyncMat Net::forwardAsync(const String& outputName)
{
    CV_TRACE_FUNCTION();
#ifdef CV_CXX11
    if (impl->preferableBackend != DNN_BACKEND_INFERENCE_ENGINE)
        CV_Error(Error::StsNotImplemented, "Asynchronous forward for backend which is different from DNN_BACKEND_INFERENCE_ENGINE");

    String layerName = outputName;

    if (layerName.empty())
@@ -2805,6 +2802,9 @@ AsyncMat Net::forwardAsync(const String& outputName)
    std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
    impl->setUpNet(pins);

    if (impl->preferableBackend != DNN_BACKEND_INFERENCE_ENGINE)
        CV_Error(Error::StsNotImplemented, "Asynchronous forward for backend which is different from DNN_BACKEND_INFERENCE_ENGINE");

    impl->isAsync = true;
    impl->forwardToLayer(impl->getLayerData(layerName));
    impl->isAsync = false;
@@ -312,15 +312,13 @@ public:
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;
        outs.getUMatVector(outputs);

        bool use_half = (inps.depth() == CV_16S);
        if (use_half)
        {
            std::vector<UMat> orig_inputs;
            std::vector<UMat> orig_outputs;

            inps.getUMatVector(orig_inputs);
            outs.getUMatVector(orig_outputs);

            inputs.resize(orig_inputs.size());
            for (size_t i = 0; i < orig_inputs.size(); i++)
@@ -329,7 +327,6 @@ public:
        else
        {
            inps.getUMatVector(inputs);
            outs.getUMatVector(outputs);
        }

        std::vector<LabelBBox> allDecodedBBoxes;
@@ -362,19 +359,17 @@ public:

        if (numKept == 0)
        {
            // Set confidences to zeros.
            Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)};
            if (use_half)
            {
                std::vector<UMat> orig_outputs;
                outs.getUMatVector(orig_outputs);
                orig_outputs[0](ranges).setTo(0);
            } else
                outputs[0](ranges).setTo(0);
            outputs[0].setTo(0);
            return true;
        }
        int outputShape[] = {1, 1, (int)numKept, 7};
        UMat umat = UMat(4, outputShape, CV_32F);

        UMat umat = use_half ? UMat::zeros(4, outputs[0].size, CV_32F) : outputs[0];

        if (!use_half)
            umat.setTo(0);

        // If there are valid detections
        if (numKept > 0)
        {
            Mat mat = umat.getMat(ACCESS_WRITE);
            float* outputsData = mat.ptr<float>();
@@ -393,16 +388,7 @@ public:
        {
            UMat half_umat;
            convertFp16(umat, half_umat);

            std::vector<UMat> orig_outputs;
            outs.getUMatVector(orig_outputs);
            orig_outputs.clear();
            orig_outputs.push_back(half_umat);
            outs.assign(orig_outputs);
        } else {
            outputs.clear();
            outputs.push_back(umat);
            outs.assign(outputs);
            outs.assign(std::vector<UMat>(1, half_umat));
        }

        return true;
@@ -484,15 +470,12 @@ public:
            numKept += processDetections_(allDecodedBBoxes[i], allConfidenceScores[i], allIndices);
        }

        outputs[0].setTo(0);

        // If there are no detections
        if (numKept == 0)
        {
            // Set confidences to zeros.
            Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)};
            outputs[0](ranges).setTo(0);
            return;
        }
        int outputShape[] = {1, 1, (int)numKept, 7};
        outputs[0].create(4, outputShape, CV_32F);

        float* outputsData = outputs[0].ptr<float>();

        size_t count = 0;
@@ -703,8 +686,6 @@ public:
            prior_width += 1.0f;
            prior_height += 1.0f;
        }
        CV_Assert(prior_width > 0);
        CV_Assert(prior_height > 0);
        float prior_center_x = prior_bbox.xmin + prior_width * .5;
        float prior_center_y = prior_bbox.ymin + prior_height * .5;

@@ -131,6 +131,9 @@ public:
        CV_Assert(layerInternals.empty());
        internals.push_back(layerOutputs[0]);

        // Detections layer.
        internals.push_back(shape(1, 1, keepTopAfterNMS, 7));

        outputs.resize(2);
        outputs[0] = shape(keepTopAfterNMS, 5);
        outputs[1] = shape(keepTopAfterNMS, 1);
@@ -176,13 +179,14 @@ public:
        internals_.getUMatVector(internals);

        CV_Assert(inputs.size() == 3);
        CV_Assert(internals.size() == 3);
        CV_Assert(internals.size() == 4);
        const UMat& scores = inputs[0];
        const UMat& bboxDeltas = inputs[1];
        const UMat& imInfo = inputs[2];
        UMat& priorBoxes = internals[0];
        UMat& permuttedScores = internals[1];
        UMat& permuttedDeltas = internals[2];
        UMat& detections = internals[3];

        CV_Assert(imInfo.total() >= 2);
        // We've chosen the smallest data type because we need just a shape from it.
@@ -217,7 +221,7 @@ public:
        layerInputs[2] = priorBoxes;
        layerInputs[3] = umat_fakeImageBlob;

        layerOutputs[0] = UMat();
        layerOutputs[0] = detections;
        detectionOutputLayer->forward(layerInputs, layerOutputs, internals);

        // DetectionOutputLayer produces 1x1xNx7 output where N might be less or
@@ -237,10 +241,6 @@ public:
        dst = outputs[1].rowRange(0, numDets);
        layerOutputs[0].col(2).copyTo(dst);

        if (numDets < keepTopAfterNMS)
            for (int i = 0; i < 2; ++i)
                outputs[i].rowRange(numDets, keepTopAfterNMS).setTo(0);

        return true;
    }
#endif
@@ -266,13 +266,14 @@ public:
        internals_arr.getMatVector(internals);

        CV_Assert(inputs.size() == 3);
        CV_Assert(internals.size() == 3);
        CV_Assert(internals.size() == 4);
        const Mat& scores = inputs[0];
        const Mat& bboxDeltas = inputs[1];
        const Mat& imInfo = inputs[2];
        Mat& priorBoxes = internals[0];
        Mat& permuttedScores = internals[1];
        Mat& permuttedDeltas = internals[2];
        Mat& detections = internals[3];

        CV_Assert(imInfo.total() >= 2);
        // We've chosen the smallest data type because we need just a shape from it.
@@ -302,7 +303,7 @@ public:
        layerInputs[2] = priorBoxes;
        layerInputs[3] = fakeImageBlob;

        layerOutputs[0] = Mat();
        layerOutputs[0] = detections;
        detectionOutputLayer->forward(layerInputs, layerOutputs, internals);

        // DetectionOutputLayer produces 1x1xNx7 output where N might be less or
@@ -319,10 +320,6 @@ public:
        // The scores.
        dst = outputs[1].rowRange(0, numDets);
        layerOutputs[0].col(2).copyTo(dst);

        if (numDets < keepTopAfterNMS)
            for (int i = 0; i < 2; ++i)
                outputs[i].rowRange(numDets, keepTopAfterNMS).setTo(0);
    }

    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
@@ -172,7 +172,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe)
    Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
    float diffScores = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 1.5e-2 : 0.0;
    float diffSquares = (target == DNN_TARGET_MYRIAD) ? 0.063 : 0.0;
    float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.252 : 0.0;
    float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.252 : FLT_MIN;
    processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt",
               inp, "detection_out", "", diffScores, diffSquares, detectionConfThresh);
    expectNoFallbacksFromIE(net);

@@ -204,7 +204,7 @@ TEST(Reproducibility_SSD, Accuracy)
    Mat out = net.forward("detection_out");

    Mat ref = blobFromNPY(_tf("ssd_out.npy"));
    normAssertDetections(ref, out);
    normAssertDetections(ref, out, "", FLT_MIN);
}

typedef testing::TestWithParam<tuple<Backend, Target> > Reproducibility_MobileNet_SSD;
@@ -225,6 +225,8 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
    net.setInput(inp);
    Mat out = net.forward().clone();

    ASSERT_EQ(out.size[2], 100);

    const float scores_diff = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? 1.5e-2 : 1e-5;
    const float boxes_iou_diff = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? 6.3e-2 : 1e-4;
    Mat ref = blobFromNPY(_tf("mobilenet_ssd_caffe_out.npy"));
@@ -341,11 +341,22 @@ public class JavaCamera2View extends CameraBridgeViewBase {


            if (chromaPixelStride == 2) { // Chroma channels are interleaved
                assert(planes[0].getPixelStride() == 1);
                assert(planes[2].getPixelStride() == 2);
                ByteBuffer y_plane = planes[0].getBuffer();
                ByteBuffer uv_plane = planes[1].getBuffer();
                ByteBuffer uv_plane1 = planes[1].getBuffer();
                ByteBuffer uv_plane2 = planes[2].getBuffer();
                Mat y_mat = new Mat(h, w, CvType.CV_8UC1, y_plane);
                Mat uv_mat = new Mat(h / 2, w / 2, CvType.CV_8UC2, uv_plane);
                Imgproc.cvtColorTwoPlane(y_mat, uv_mat, mRgba, Imgproc.COLOR_YUV2RGBA_NV21);
                Mat uv_mat1 = new Mat(h / 2, w / 2, CvType.CV_8UC2, uv_plane1);
                Mat uv_mat2 = new Mat(h / 2, w / 2, CvType.CV_8UC2, uv_plane2);
                long addr_diff = uv_mat2.dataAddr() - uv_mat1.dataAddr();
                if (addr_diff > 0) {
                    assert(addr_diff == 1);
                    Imgproc.cvtColorTwoPlane(y_mat, uv_mat1, mRgba, Imgproc.COLOR_YUV2RGBA_NV12);
                } else {
                    assert(addr_diff == -1);
                    Imgproc.cvtColorTwoPlane(y_mat, uv_mat2, mRgba, Imgproc.COLOR_YUV2RGBA_NV21);
                }
                return mRgba;
            } else { // Chroma channels are not interleaved
                byte[] yuv_bytes = new byte[w*(h+h/2)];
@@ -39,4 +39,4 @@ message(STATUS "  include path: ${OpenCV_INCLUDE_DIRS}")
add_executable(${EXAMPLE_NAME} "${EXAMPLE_FILE}")

# Link your application with OpenCV libraries
target_link_libraries(${EXAMPLE_NAME} ${OpenCV_LIBS})
target_link_libraries(${EXAMPLE_NAME} LINK_PRIVATE ${OpenCV_LIBS})
@@ -1,32 +1,3 @@
# Utility function: adds sample executable target with name "example_<group>_<file_name>"
# Usage:
#   ocv_define_sample(<output target> <relative filename> <group>)
function(ocv_define_sample out_target source sub)
  get_filename_component(name "${source}" NAME_WE)
  set(the_target "example_${sub}_${name}")
  add_executable(${the_target} "${source}")
  set_target_properties(${the_target} PROPERTIES PROJECT_LABEL "(sample) ${name}")
  if(ENABLE_SOLUTION_FOLDERS)
    set_target_properties(${the_target} PROPERTIES FOLDER "samples/${sub}")
  endif()
  if(WIN32 AND MSVC AND NOT BUILD_SHARED_LIBS)
    set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:atlthunk.lib /NODEFAULTLIB:atlsd.lib /DEBUG")
  endif()
  if(WIN32)
    install(TARGETS ${the_target} RUNTIME DESTINATION "samples/${sub}" COMPONENT samples)
  endif()
  # Add single target to build all samples in the group: 'make opencv_samples_cpp'
  set(parent_target opencv_samples_${sub})
  if(NOT TARGET ${parent_target})
    add_custom_target(${parent_target})
    if(TARGET opencv_samples)
      add_dependencies(opencv_samples ${parent_target})
    endif()
  endif()
  add_dependencies(${parent_target} ${the_target})
  set(${out_target} ${the_target} PARENT_SCOPE)
endfunction()

if(NOT CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_LIST_DIR)
#===================================================================================================
#
@@ -34,6 +5,8 @@ if(NOT CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_LIST_DIR)
#
#===================================================================================================

include("${CMAKE_CURRENT_LIST_DIR}/samples_utils.cmake")

function(ocv_install_example_src relpath)
  if(INSTALL_C_EXAMPLES)
    file(GLOB files ${ARGN})
@@ -43,6 +16,10 @@ function(ocv_install_example_src relpath)
  endif()
endfunction()

if((TARGET Threads::Threads OR HAVE_PTHREAD OR MSVC OR APPLE) AND NOT OPENCV_EXAMPLES_DISABLE_THREADS)
  add_definitions(-DHAVE_THREADS=1)
endif()

add_subdirectory(cpp)
add_subdirectory(java/tutorial_code)
add_subdirectory(dnn)
@@ -98,6 +75,8 @@ option(BUILD_EXAMPLES "Build samples" ON)
# │   ├── cpp/
find_package(OpenCV REQUIRED PATHS "..")

include("${CMAKE_CURRENT_LIST_DIR}/samples_utils.cmake")

function(ocv_install_example_src)
  # not used in this branch
endfunction()
@@ -129,6 +108,17 @@ endif()

add_definitions(-DDISABLE_OPENCV_24_COMPATIBILITY=1)  # Avoid C-like legacy API

if(OPENCV_EXAMPLES_DISABLE_THREADS)
  # nothing
elseif(MSVC OR APPLE)
  set(HAVE_THREADS 1)
else()
  find_package(Threads)
endif()
if((TARGET Threads::Threads OR HAVE_THREADS) AND NOT OPENCV_EXAMPLES_DISABLE_THREADS)
  add_definitions(-DHAVE_THREADS=1)
endif()

add_subdirectory(cpp)
if(WIN32)
  add_subdirectory(directx)
@@ -35,12 +35,12 @@ foreach(sample_filename ${cpp_samples})
    set(package "tutorial")
  endif()
  ocv_define_sample(tgt ${sample_filename} ${package})
  ocv_target_link_libraries(${tgt} ${OPENCV_LINKER_LIBS} ${OPENCV_CPP_SAMPLES_REQUIRED_DEPS})
  ocv_target_link_libraries(${tgt} LINK_PRIVATE ${OPENCV_LINKER_LIBS} ${OPENCV_CPP_SAMPLES_REQUIRED_DEPS})
  if(sample_filename MATCHES "/gpu/" AND HAVE_opencv_cudaarithm AND HAVE_opencv_cudafilters)
    ocv_target_link_libraries(${tgt} opencv_cudaarithm opencv_cudafilters)
    ocv_target_link_libraries(${tgt} LINK_PRIVATE opencv_cudaarithm opencv_cudafilters)
  endif()
  if(sample_filename MATCHES "/viz/")
    ocv_target_link_libraries(${tgt} ${VTK_LIBRARIES})
    ocv_target_link_libraries(${tgt} LINK_PRIVATE ${VTK_LIBRARIES})
    target_compile_definitions(${tgt} PRIVATE -DUSE_VTK)
  endif()
  if(HAVE_OPENGL AND sample_filename MATCHES "detect_mser")

@@ -27,4 +27,4 @@ message(STATUS "  include path: ${OpenCV_INCLUDE_DIRS}")
add_executable(opencv_example example.cpp)

# Link your application with OpenCV libraries
target_link_libraries(opencv_example ${OpenCV_LIBS})
target_link_libraries(opencv_example LINK_PRIVATE ${OpenCV_LIBS})

@@ -17,5 +17,5 @@ ocv_include_modules_recurse(${OPENCV_CPP_SAMPLES_REQUIRED_DEPS})
add_executable( ${target}pnp_registration ${sample_dir}main_registration.cpp ${sample_pnplib} )
add_executable( ${target}pnp_detection ${sample_dir}main_detection.cpp ${sample_pnplib} )

ocv_target_link_libraries( ${target}pnp_registration ${OPENCV_LINKER_LIBS} ${OPENCV_CPP_SAMPLES_REQUIRED_DEPS} )
ocv_target_link_libraries( ${target}pnp_detection ${OPENCV_LINKER_LIBS} ${OPENCV_CPP_SAMPLES_REQUIRED_DEPS} )
ocv_target_link_libraries(${target}pnp_registration LINK_PRIVATE ${OPENCV_LINKER_LIBS} ${OPENCV_CPP_SAMPLES_REQUIRED_DEPS})
ocv_target_link_libraries(${target}pnp_detection LINK_PRIVATE ${OPENCV_LINKER_LIBS} ${OPENCV_CPP_SAMPLES_REQUIRED_DEPS})
samples/cpp/tutorial_code/video/meanshift/camshift.cpp (new file, 86 lines)
@@ -0,0 +1,86 @@
#include <iostream>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/videoio.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/video.hpp>

using namespace cv;
using namespace std;

int main(int argc, char **argv)
{
    const string about =
        "This sample demonstrates the camshift algorithm.\n"
        "The example file can be downloaded from:\n"
        "  https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4";
    const string keys =
        "{ h help | | print this help message }"
        "{ @image |<none>| path to image file }";
    CommandLineParser parser(argc, argv, keys);
    parser.about(about);
    if (parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }
    string filename = parser.get<string>("@image");
    if (!parser.check())
    {
        parser.printErrors();
        return 0;
    }

    VideoCapture capture(filename);
    if (!capture.isOpened()){
        //error in opening the video input
        cerr << "Unable to open file!" << endl;
        return 0;
    }

    Mat frame, roi, hsv_roi, mask;
    // take first frame of the video
    capture >> frame;

    // setup initial location of window
    Rect track_window(300, 200, 100, 50); // simply hardcoded the values

    // set up the ROI for tracking
    roi = frame(track_window);
    cvtColor(roi, hsv_roi, COLOR_BGR2HSV);
    inRange(hsv_roi, Scalar(0, 60, 32), Scalar(180, 255, 255), mask);

    float range_[] = {0, 180};
    const float* range[] = {range_};
    Mat roi_hist;
    int histSize[] = {180};
    int channels[] = {0};
    calcHist(&hsv_roi, 1, channels, mask, roi_hist, 1, histSize, range);
    normalize(roi_hist, roi_hist, 0, 255, NORM_MINMAX);

    // Set up the termination criteria: either 10 iterations or move by at least 1 pt
    TermCriteria term_crit(TermCriteria::EPS | TermCriteria::COUNT, 10, 1);

    while(true){
        Mat hsv, dst;
        capture >> frame;
        if (frame.empty())
            break;
        cvtColor(frame, hsv, COLOR_BGR2HSV);
        calcBackProject(&hsv, 1, channels, roi_hist, dst, range);

        // apply camshift to get the new location
        RotatedRect rot_rect = CamShift(dst, track_window, term_crit);

        // Draw it on image
        Point2f points[4];
        rot_rect.points(points);
        for (int i = 0; i < 4; i++)
            line(frame, points[i], points[(i+1)%4], 255, 2);
        imshow("img2", frame);

        int keyboard = waitKey(30);
        if (keyboard == 'q' || keyboard == 27)
            break;
    }
}
samples/cpp/tutorial_code/video/meanshift/meanshift.cpp (new file, 83 lines)
@@ -0,0 +1,83 @@
#include <iostream>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/videoio.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/video.hpp>

using namespace cv;
using namespace std;

int main(int argc, char **argv)
{
    const string about =
        "This sample demonstrates the meanshift algorithm.\n"
        "The example file can be downloaded from:\n"
        "  https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4";
    const string keys =
        "{ h help | | print this help message }"
        "{ @image |<none>| path to image file }";
    CommandLineParser parser(argc, argv, keys);
    parser.about(about);
    if (parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }
    string filename = parser.get<string>("@image");
    if (!parser.check())
    {
        parser.printErrors();
        return 0;
    }

    VideoCapture capture(filename);
    if (!capture.isOpened()){
        //error in opening the video input
        cerr << "Unable to open file!" << endl;
        return 0;
    }

    Mat frame, roi, hsv_roi, mask;
    // take first frame of the video
    capture >> frame;

    // setup initial location of window
    Rect track_window(300, 200, 100, 50); // simply hardcoded the values

    // set up the ROI for tracking
    roi = frame(track_window);
    cvtColor(roi, hsv_roi, COLOR_BGR2HSV);
    inRange(hsv_roi, Scalar(0, 60, 32), Scalar(180, 255, 255), mask);

    float range_[] = {0, 180};
    const float* range[] = {range_};
    Mat roi_hist;
    int histSize[] = {180};
    int channels[] = {0};
    calcHist(&hsv_roi, 1, channels, mask, roi_hist, 1, histSize, range);
    normalize(roi_hist, roi_hist, 0, 255, NORM_MINMAX);

    // Set up the termination criteria: either 10 iterations or move by at least 1 pt
    TermCriteria term_crit(TermCriteria::EPS | TermCriteria::COUNT, 10, 1);

    while(true){
        Mat hsv, dst;
        capture >> frame;
        if (frame.empty())
            break;
        cvtColor(frame, hsv, COLOR_BGR2HSV);
        calcBackProject(&hsv, 1, channels, roi_hist, dst, range);

        // apply meanshift to get the new location
        meanShift(dst, track_window, term_crit);

        // Draw it on image
        rectangle(frame, track_window, 255, 2);
        imshow("img2", frame);

        int keyboard = waitKey(30);
        if (keyboard == 'q' || keyboard == 27)
            break;
    }
}
samples/cpp/tutorial_code/video/optical_flow/optical_flow.cpp (new file, 101 lines)
@@ -0,0 +1,101 @@
#include <iostream>
#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/videoio.hpp>
#include <opencv2/video.hpp>

using namespace cv;
using namespace std;

int main(int argc, char **argv)
{
    const string about =
        "This sample demonstrates Lucas-Kanade Optical Flow calculation.\n"
        "The example file can be downloaded from:\n"
        "  https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4";
    const string keys =
        "{ h help | | print this help message }"
        "{ @image |<none>| path to image file }";
    CommandLineParser parser(argc, argv, keys);
    parser.about(about);
    if (parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }
    string filename = parser.get<string>("@image");
    if (!parser.check())
    {
        parser.printErrors();
        return 0;
    }

    VideoCapture capture(filename);
    if (!capture.isOpened()){
        //error in opening the video input
        cerr << "Unable to open file!" << endl;
        return 0;
    }

    // Create some random colors
    vector<Scalar> colors;
    RNG rng;
    for(int i = 0; i < 100; i++)
    {
        int r = rng.uniform(0, 256);
        int g = rng.uniform(0, 256);
        int b = rng.uniform(0, 256);
        colors.push_back(Scalar(r,g,b));
    }

    Mat old_frame, old_gray;
    vector<Point2f> p0, p1;

    // Take first frame and find corners in it
    capture >> old_frame;
    cvtColor(old_frame, old_gray, COLOR_BGR2GRAY);
    goodFeaturesToTrack(old_gray, p0, 100, 0.3, 7, Mat(), 7, false, 0.04);

    // Create a mask image for drawing purposes
    Mat mask = Mat::zeros(old_frame.size(), old_frame.type());

    while(true){
        Mat frame, frame_gray;

        capture >> frame;
        if (frame.empty())
            break;
        cvtColor(frame, frame_gray, COLOR_BGR2GRAY);

        // calculate optical flow
        vector<uchar> status;
        vector<float> err;
        TermCriteria criteria = TermCriteria((TermCriteria::COUNT) + (TermCriteria::EPS), 10, 0.03);
        calcOpticalFlowPyrLK(old_gray, frame_gray, p0, p1, status, err, Size(15,15), 2, criteria);

        vector<Point2f> good_new;
        for(uint i = 0; i < p0.size(); i++)
        {
            // Select good points
            if(status[i] == 1) {
                good_new.push_back(p1[i]);
                // draw the tracks
                line(mask, p1[i], p0[i], colors[i], 2);
                circle(frame, p1[i], 5, colors[i], -1);
            }
        }
        Mat img;
        add(frame, mask, img);

        imshow("Frame", img);

        int keyboard = waitKey(30);
        if (keyboard == 'q' || keyboard == 27)
            break;

        // Now update the previous frame and previous points
        old_gray = frame_gray.clone();
        p0 = good_new;
    }
}
@@ -0,0 +1,59 @@
#include <iostream>
#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/videoio.hpp>
#include <opencv2/video.hpp>

using namespace cv;
using namespace std;

int main()
{
    VideoCapture capture(samples::findFile("vtest.avi"));
    if (!capture.isOpened()){
        //error in opening the video input
        cerr << "Unable to open file!" << endl;
        return 0;
    }

    Mat frame1, prvs;
    capture >> frame1;
    cvtColor(frame1, prvs, COLOR_BGR2GRAY);

    while(true){
        Mat frame2, next;
        capture >> frame2;
        if (frame2.empty())
            break;
        cvtColor(frame2, next, COLOR_BGR2GRAY);

        Mat flow(prvs.size(), CV_32FC2);
        calcOpticalFlowFarneback(prvs, next, flow, 0.5, 3, 15, 3, 5, 1.2, 0);

        // visualization
        Mat flow_parts[2];
        split(flow, flow_parts);
        Mat magnitude, angle, magn_norm;
        cartToPolar(flow_parts[0], flow_parts[1], magnitude, angle, true);
        normalize(magnitude, magn_norm, 0.0f, 1.0f, NORM_MINMAX);
        angle *= ((1.f / 360.f) * (180.f / 255.f));

        //build hsv image
        Mat _hsv[3], hsv, hsv8, bgr;
        _hsv[0] = angle;
        _hsv[1] = Mat::ones(angle.size(), CV_32F);
        _hsv[2] = magn_norm;
        merge(_hsv, 3, hsv);
        hsv.convertTo(hsv8, CV_8U, 255.0);
        cvtColor(hsv8, bgr, COLOR_HSV2BGR);

        imshow("frame2", bgr);

        int keyboard = waitKey(30);
        if (keyboard == 'q' || keyboard == 27)
            break;

        prvs = next;
    }
}
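
The angle scaling in the visualization above is easy to misread; the factor composes with the later convertTo(..., 255.0) so that hue lands in OpenCV's 8-bit range. A worked check, not part of the sample:

// Worked check of the scaling above (hypothetical helper, illustrative only).
static float hueFor(float angle_deg)
{
    const float f = (1.f / 360.f) * (180.f / 255.f); // factor applied to 'angle'
    return angle_deg * f * 255.f;                    // convertTo(..., 255.0) step
}   // hueFor(360.f) == 180.f, i.e. the usual 0..180 hue range for 8-bit HSV
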
@@ -17,5 +17,5 @@ ocv_include_modules_recurse(${tgt} ${OPENCV_DIRECTX_SAMPLES_REQUIRED_DEPS})
file(GLOB all_samples RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
foreach(sample_filename ${all_samples})
  ocv_define_sample(tgt ${sample_filename} directx)
  ocv_target_link_libraries(${tgt} ${OPENCV_LINKER_LIBS} ${OPENCV_DIRECTX_SAMPLES_REQUIRED_DEPS})
  ocv_target_link_libraries(${tgt} LINK_PRIVATE ${OPENCV_LINKER_LIBS} ${OPENCV_DIRECTX_SAMPLES_REQUIRED_DEPS})
endforeach()

@@ -18,5 +18,5 @@ ocv_include_modules_recurse(${OPENCV_DNN_SAMPLES_REQUIRED_DEPS})
file(GLOB_RECURSE dnn_samples RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
foreach(sample_filename ${dnn_samples})
  ocv_define_sample(tgt ${sample_filename} dnn)
  ocv_target_link_libraries(${tgt} ${OPENCV_LINKER_LIBS} ${OPENCV_DNN_SAMPLES_REQUIRED_DEPS})
  ocv_target_link_libraries(${tgt} LINK_PRIVATE ${OPENCV_LINKER_LIBS} ${OPENCV_DNN_SAMPLES_REQUIRED_DEPS})
endforeach()
@@ -5,6 +5,11 @@
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

#ifdef CV_CXX11
#include <thread>
#include <queue>
#endif

#include "common.hpp"

std::string keys =
@@ -26,8 +31,9 @@ std::string keys =
    "0: CPU target (by default), "
    "1: OpenCL, "
    "2: OpenCL fp16 (half-float precision), "
    "3: VPU }";

    "3: VPU }"
    "{ async | 0 | Number of asynchronous forwards at the same time. "
    "Choose 0 for synchronous mode }";

using namespace cv;
using namespace dnn;
@@ -35,13 +41,66 @@ using namespace dnn;
float confThreshold, nmsThreshold;
std::vector<std::string> classes;

inline void preprocess(const Mat& frame, Net& net, Size inpSize, float scale,
                       const Scalar& mean, bool swapRB);

void postprocess(Mat& frame, const std::vector<Mat>& out, Net& net);

void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame);

void callback(int pos, void* userdata);

std::vector<String> getOutputsNames(const Net& net);
#ifdef CV_CXX11
template <typename T>
class QueueFPS : public std::queue<T>
{
public:
    QueueFPS() : counter(0) {}

    void push(const T& entry)
    {
        std::lock_guard<std::mutex> lock(mutex);

        std::queue<T>::push(entry);
        counter += 1;
        if (counter == 1)
        {
            // Start counting from a second frame (warmup).
            tm.reset();
            tm.start();
        }
    }

    T get()
    {
        std::lock_guard<std::mutex> lock(mutex);
        T entry = this->front();
        this->pop();
        return entry;
    }

    float getFPS()
    {
        tm.stop();
        double fps = counter / tm.getTimeSec();
        tm.start();
        return static_cast<float>(fps);
    }

    void clear()
    {
        std::lock_guard<std::mutex> lock(mutex);
        while (!this->empty())
            this->pop();
    }

    unsigned int counter;

private:
    TickMeter tm;
    std::mutex mutex;
};
#endif  // CV_CXX11

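QueueFPS is just std::queue plus a mutex and a TickMeter; a hypothetical standalone use, assuming the class exactly as defined above (illustrative only):

// Hypothetical standalone use of QueueFPS (not part of the sample).
QueueFPS<int> q;
q.push(1);              // first push resets and starts the internal timer
q.push(2);
int front = q.get();    // locks, pops and returns the front element (1)
float fps = q.getFPS(); // pushes per second as measured by the TickMeter
(void)front; (void)fps;
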
int main(int argc, char** argv)
{
@@ -67,6 +126,7 @@ int main(int argc, char** argv)
    bool swapRB = parser.get<bool>("rgb");
    int inpWidth = parser.get<int>("width");
    int inpHeight = parser.get<int>("height");
    size_t async = parser.get<int>("async");
    CV_Assert(parser.has("model"));
    std::string modelPath = findFile(parser.get<String>("model"));
    std::string configPath = findFile(parser.get<String>("config"));
@@ -104,6 +164,108 @@ int main(int argc, char** argv)
    else
        cap.open(parser.get<int>("device"));

#ifdef CV_CXX11
    bool process = true;

    // Frames capturing thread
    QueueFPS<Mat> framesQueue;
    std::thread framesThread([&](){
        Mat frame;
        while (process)
        {
            cap >> frame;
            if (!frame.empty())
                framesQueue.push(frame.clone());
            else
                break;
        }
    });

    // Frames processing thread
    QueueFPS<Mat> processedFramesQueue;
    QueueFPS<std::vector<Mat> > predictionsQueue;
    std::thread processingThread([&](){
        std::queue<std::future<Mat> > futureOutputs;
        Mat blob;
        while (process)
        {
            // Get the next frame
            Mat frame;
            {
                if (!framesQueue.empty())
                {
                    frame = framesQueue.get();
                    if (async)
                    {
                        if (futureOutputs.size() == async)
                            frame = Mat();
                    }
                    else
                        framesQueue.clear();  // Skip the rest of the frames
                }
            }

            // Process the frame
            if (!frame.empty())
            {
                preprocess(frame, net, Size(inpWidth, inpHeight), scale, mean, swapRB);
                processedFramesQueue.push(frame);

                if (async)
                {
                    futureOutputs.push(net.forwardAsync());
                }
                else
                {
                    std::vector<Mat> outs;
                    net.forward(outs, outNames);
                    predictionsQueue.push(outs);
                }
            }

            while (!futureOutputs.empty() &&
                   futureOutputs.front().wait_for(std::chrono::seconds(0)) == std::future_status::ready)
            {
                Mat out = futureOutputs.front().get();
                predictionsQueue.push({out});
                futureOutputs.pop();
            }
        }
    });

    // Postprocessing and rendering loop
    while (waitKey(1) < 0)
    {
        if (predictionsQueue.empty())
            continue;

        std::vector<Mat> outs = predictionsQueue.get();
        Mat frame = processedFramesQueue.get();

        postprocess(frame, outs, net);

        if (predictionsQueue.counter > 1)
        {
            std::string label = format("Camera: %.2f FPS", framesQueue.getFPS());
            putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));

            label = format("Network: %.2f FPS", predictionsQueue.getFPS());
            putText(frame, label, Point(0, 30), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));

            label = format("Skipped frames: %d", framesQueue.counter - predictionsQueue.counter);
            putText(frame, label, Point(0, 45), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
        }
        imshow(kWinName, frame);
    }

    process = false;
    framesThread.join();
    processingThread.join();

#else  // CV_CXX11
    if (async)
        CV_Error(Error::StsNotImplemented, "Asynchronous forward is supported only with Inference Engine backend.");

    // Process frames.
    Mat frame, blob;
    while (waitKey(1) < 0)
@@ -115,19 +277,8 @@ int main(int argc, char** argv)
            break;
        }

        // Create a 4D blob from a frame.
        Size inpSize(inpWidth > 0 ? inpWidth : frame.cols,
                     inpHeight > 0 ? inpHeight : frame.rows);
        blobFromImage(frame, blob, scale, inpSize, mean, swapRB, false);
        preprocess(frame, net, Size(inpWidth, inpHeight), scale, mean, swapRB);

        // Run a model.
        net.setInput(blob);
        if (net.getLayer(0)->outputNameToIndex("im_info") != -1)  // Faster-RCNN or R-FCN
        {
            resize(frame, frame, inpSize);
            Mat imInfo = (Mat_<float>(1, 3) << inpSize.height, inpSize.width, 1.6f);
            net.setInput(imInfo, "im_info");
        }
        std::vector<Mat> outs;
        net.forward(outs, outNames);

@@ -142,9 +293,29 @@ int main(int argc, char** argv)

        imshow(kWinName, frame);
    }
#endif  // CV_CXX11
    return 0;
}

inline void preprocess(const Mat& frame, Net& net, Size inpSize, float scale,
                       const Scalar& mean, bool swapRB)
{
    static Mat blob;
    // Create a 4D blob from a frame.
    if (inpSize.width <= 0) inpSize.width = frame.cols;
    if (inpSize.height <= 0) inpSize.height = frame.rows;
    blobFromImage(frame, blob, 1.0, inpSize, Scalar(), swapRB, false, CV_8U);

    // Run a model.
    net.setInput(blob, "", scale, mean);
    if (net.getLayer(0)->outputNameToIndex("im_info") != -1)  // Faster-RCNN or R-FCN
    {
        resize(frame, frame, inpSize);
        Mat imInfo = (Mat_<float>(1, 3) << inpSize.height, inpSize.width, 1.6f);
        net.setInput(imInfo, "im_info");
    }
}

void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net)
{
    static std::vector<int> outLayers = net.getUnconnectedOutLayers();
@@ -1,6 +1,13 @@
import cv2 as cv
import argparse
import numpy as np
import sys
import time
from threading import Thread
if sys.version_info[0] == 2:
    import Queue as queue
else:
    import queue

from common import *
from tf_text_graph_common import readTextMessage
@@ -35,6 +42,9 @@ parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU,
                         '%d: OpenCL, '
                         '%d: OpenCL fp16 (half-float precision), '
                         '%d: VPU' % targets)
parser.add_argument('--async', type=int, default=0,
                    help='Number of asynchronous forwards at the same time. '
                         'Choose 0 for synchronous mode')
args, _ = parser.parse_known_args()
add_preproc_args(args.zoo, parser, 'object_detection')
parser = argparse.ArgumentParser(parents=[parser],
@@ -173,32 +183,125 @@ def callback(pos):
cv.createTrackbar('Confidence threshold, %', winName, int(confThreshold * 100), 99, callback)

cap = cv.VideoCapture(cv.samples.findFileOrKeep(args.input) if args.input else 0)
while cv.waitKey(1) < 0:

class QueueFPS(queue.Queue):
    def __init__(self):
        queue.Queue.__init__(self)
        self.startTime = 0
        self.counter = 0

    def put(self, v):
        queue.Queue.put(self, v)
        self.counter += 1
        if self.counter == 1:
            self.startTime = time.time()

    def getFPS(self):
        return self.counter / (time.time() - self.startTime)


process = True

#
# Frames capturing thread
#
framesQueue = QueueFPS()
def framesThreadBody():
    global framesQueue, process

    while process:
        hasFrame, frame = cap.read()
        if not hasFrame:
            cv.waitKey()
            break
        framesQueue.put(frame)


#
# Frames processing thread
#
processedFramesQueue = queue.Queue()
predictionsQueue = QueueFPS()
def processingThreadBody():
    global processedFramesQueue, predictionsQueue, args, process

    futureOutputs = []
    while process:
        # Get the next frame
        frame = None
        try:
            frame = framesQueue.get_nowait()

            if args.async:
                if len(futureOutputs) == args.async:
                    frame = None  # Skip the frame
            else:
                framesQueue.queue.clear()  # Skip the rest of the frames
        except queue.Empty:
            pass


        if frame is not None:
            frameHeight = frame.shape[0]
            frameWidth = frame.shape[1]

            # Create a 4D blob from a frame.
            inpWidth = args.width if args.width else frameWidth
            inpHeight = args.height if args.height else frameHeight
            blob = cv.dnn.blobFromImage(frame, args.scale, (inpWidth, inpHeight), args.mean, args.rgb, crop=False)
            blob = cv.dnn.blobFromImage(frame, size=(inpWidth, inpHeight), swapRB=args.rgb, ddepth=cv.CV_8U)
            processedFramesQueue.put(frame)

            # Run a model
            net.setInput(blob)
            net.setInput(blob, scalefactor=args.scale, mean=args.mean)
            if net.getLayer(0).outputNameToIndex('im_info') != -1:  # Faster-RCNN or R-FCN
                frame = cv.resize(frame, (inpWidth, inpHeight))
                net.setInput(np.array([[inpHeight, inpWidth, 1.6]], dtype=np.float32), 'im_info')

            if args.async:
                futureOutputs.append(net.forwardAsync())
            else:
                outs = net.forward(outNames)
                predictionsQueue.put(np.copy(outs))

            while futureOutputs and futureOutputs[0].wait_for(0) == 0:
                out = futureOutputs[0].get()
                predictionsQueue.put(np.copy([out]))

                del futureOutputs[0]


framesThread = Thread(target=framesThreadBody)
framesThread.start()

processingThread = Thread(target=processingThreadBody)
processingThread.start()

#
# Postprocessing and rendering loop
#
while cv.waitKey(1) < 0:
    try:
        # Request predictions first because they are put into the queue after the frames
        outs = predictionsQueue.get_nowait()
        frame = processedFramesQueue.get_nowait()

        postprocess(frame, outs)

        # Put efficiency information.
        t, _ = net.getPerfProfile()
        label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency())
        if predictionsQueue.counter > 1:
            label = 'Camera: %.2f FPS' % (framesQueue.getFPS())
            cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))

            label = 'Network: %.2f FPS' % (predictionsQueue.getFPS())
            cv.putText(frame, label, (0, 30), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))

            label = 'Skipped frames: %d' % (framesQueue.counter - predictionsQueue.counter)
            cv.putText(frame, label, (0, 45), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))

        cv.imshow(winName, frame)
    except queue.Empty:
        pass


process = False
framesThread.join()
processingThread.join()
@@ -51,11 +51,11 @@ endif()
file(GLOB all_samples RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
foreach(sample_filename ${all_samples})
  ocv_define_sample(tgt ${sample_filename} gpu)
  ocv_target_link_libraries(${tgt} ${OPENCV_LINKER_LIBS} ${OPENCV_CUDA_SAMPLES_REQUIRED_DEPS})
  ocv_target_link_libraries(${tgt} LINK_PRIVATE ${OPENCV_LINKER_LIBS} ${OPENCV_CUDA_SAMPLES_REQUIRED_DEPS})
  if(HAVE_opencv_xfeatures2d)
    ocv_target_link_libraries(${tgt} opencv_xfeatures2d)
    ocv_target_link_libraries(${tgt} LINK_PRIVATE opencv_xfeatures2d)
  endif()
  if(HAVE_opencv_cudacodec)
    ocv_target_link_libraries(${tgt} opencv_cudacodec)
    ocv_target_link_libraries(${tgt} LINK_PRIVATE opencv_cudacodec)
  endif()
endforeach()

@@ -31,7 +31,7 @@ ocv_include_directories(${OpenCL_INCLUDE_DIR})
file(GLOB all_samples RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
foreach(sample_filename ${all_samples})
  ocv_define_sample(tgt ${sample_filename} opencl)
  ocv_target_link_libraries(${tgt}
  ocv_target_link_libraries(${tgt} LINK_PRIVATE
      ${OPENCV_LINKER_LIBS}
      ${OPENCV_OPENCL_SAMPLES_REQUIRED_DEPS}
      ${OpenCL_LIBRARY})

@@ -23,9 +23,9 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)
  endif()
  foreach(sample_filename ${all_samples})
    ocv_define_sample(tgt ${sample_filename} opengl)
    ocv_target_link_libraries(${tgt} ${OPENCV_LINKER_LIBS} ${OPENCV_OPENGL_SAMPLES_REQUIRED_DEPS})
    ocv_target_link_libraries(${tgt} LINK_PRIVATE ${OPENCV_LINKER_LIBS} ${OPENCV_OPENGL_SAMPLES_REQUIRED_DEPS})
    if(sample_filename STREQUAL "opengl_interop.cpp")
      ocv_target_link_libraries(${tgt} ${X11_LIBRARIES})
      ocv_target_link_libraries(${tgt} LINK_PRIVATE ${X11_LIBRARIES})
      ocv_target_include_directories(${tgt} ${X11_INCLUDE_DIR})
    endif()
  endforeach()

@@ -21,5 +21,5 @@ add_definitions(-DIVX_HIDE_INFO_WARNINGS)
file(GLOB_RECURSE cpp_samples RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
foreach(sample_filename ${cpp_samples})
  ocv_define_sample(tgt ${sample_filename} openvx)
  ocv_target_link_libraries(${tgt} ${OPENCV_LINKER_LIBS} ${OPENCV_OPENVX_SAMPLE_REQUIRED_DEPS})
  ocv_target_link_libraries(${tgt} LINK_PRIVATE ${OPENCV_LINKER_LIBS} ${OPENCV_OPENVX_SAMPLE_REQUIRED_DEPS})
endforeach()
samples/python/tutorial_code/video/meanshift/camshift.py (new file, 50 lines)
@@ -0,0 +1,50 @@
import numpy as np
import cv2 as cv
import argparse

parser = argparse.ArgumentParser(description='This sample demonstrates the camshift algorithm. \
                                              The example file can be downloaded from: \
                                              https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4')
parser.add_argument('image', type=str, help='path to image file')
args = parser.parse_args()

cap = cv.VideoCapture(args.image)

# take first frame of the video
ret,frame = cap.read()

# setup initial location of window
x, y, w, h = 300, 200, 100, 50 # simply hardcoded the values
track_window = (x, y, w, h)

# set up the ROI for tracking
roi = frame[y:y+h, x:x+w]
hsv_roi = cv.cvtColor(roi, cv.COLOR_BGR2HSV)
mask = cv.inRange(hsv_roi, np.array((0., 60.,32.)), np.array((180.,255.,255.)))
roi_hist = cv.calcHist([hsv_roi],[0],mask,[180],[0,180])
cv.normalize(roi_hist,roi_hist,0,255,cv.NORM_MINMAX)

# Set up the termination criteria: either 10 iterations or move by at least 1 pt
term_crit = ( cv.TERM_CRITERIA_EPS | cv.TERM_CRITERIA_COUNT, 10, 1 )

while(1):
    ret, frame = cap.read()

    if ret == True:
        hsv = cv.cvtColor(frame, cv.COLOR_BGR2HSV)
        dst = cv.calcBackProject([hsv],[0],roi_hist,[0,180],1)

        # apply camshift to get the new location
        ret, track_window = cv.CamShift(dst, track_window, term_crit)

        # Draw it on image
        pts = cv.boxPoints(ret)
        pts = np.int0(pts)
        img2 = cv.polylines(frame,[pts],True, 255,2)
        cv.imshow('img2',img2)

        k = cv.waitKey(30) & 0xff
        if k == 27:
            break
    else:
        break
samples/python/tutorial_code/video/meanshift/meanshift.py (new file, 49 lines)
@@ -0,0 +1,49 @@
import numpy as np
import cv2 as cv
import argparse

parser = argparse.ArgumentParser(description='This sample demonstrates the meanshift algorithm. \
                                              The example file can be downloaded from: \
                                              https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4')
parser.add_argument('image', type=str, help='path to image file')
args = parser.parse_args()

cap = cv.VideoCapture(args.image)

# take first frame of the video
ret,frame = cap.read()

# setup initial location of window
x, y, w, h = 300, 200, 100, 50 # simply hardcoded the values
track_window = (x, y, w, h)

# set up the ROI for tracking
roi = frame[y:y+h, x:x+w]
hsv_roi = cv.cvtColor(roi, cv.COLOR_BGR2HSV)
mask = cv.inRange(hsv_roi, np.array((0., 60.,32.)), np.array((180.,255.,255.)))
roi_hist = cv.calcHist([hsv_roi],[0],mask,[180],[0,180])
cv.normalize(roi_hist,roi_hist,0,255,cv.NORM_MINMAX)

# Set up the termination criteria: either 10 iterations or move by at least 1 pt
term_crit = ( cv.TERM_CRITERIA_EPS | cv.TERM_CRITERIA_COUNT, 10, 1 )

while(1):
    ret, frame = cap.read()

    if ret == True:
        hsv = cv.cvtColor(frame, cv.COLOR_BGR2HSV)
        dst = cv.calcBackProject([hsv],[0],roi_hist,[0,180],1)

        # apply meanshift to get the new location
        ret, track_window = cv.meanShift(dst, track_window, term_crit)

        # Draw it on image
        x,y,w,h = track_window
        img2 = cv.rectangle(frame, (x,y), (x+w,y+h), 255,2)
        cv.imshow('img2',img2)

        k = cv.waitKey(30) & 0xff
        if k == 27:
            break
    else:
        break
@ -0,0 +1,61 @@
import numpy as np
import cv2 as cv
import argparse

parser = argparse.ArgumentParser(description='This sample demonstrates Lucas-Kanade Optical Flow calculation. \
                                              The example file can be downloaded from: \
                                              https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4')
parser.add_argument('image', type=str, help='path to image file')
args = parser.parse_args()

cap = cv.VideoCapture(args.image)

# params for Shi-Tomasi corner detection
feature_params = dict( maxCorners = 100,
                       qualityLevel = 0.3,
                       minDistance = 7,
                       blockSize = 7 )

# Parameters for Lucas-Kanade optical flow
lk_params = dict( winSize  = (15, 15),
                  maxLevel = 2,
                  criteria = (cv.TERM_CRITERIA_EPS | cv.TERM_CRITERIA_COUNT, 10, 0.03))

# Create some random colors
color = np.random.randint(0, 255, (100, 3))

# Take first frame and find corners in it
ret, old_frame = cap.read()
old_gray = cv.cvtColor(old_frame, cv.COLOR_BGR2GRAY)
p0 = cv.goodFeaturesToTrack(old_gray, mask = None, **feature_params)

# Create a mask image for drawing purposes
mask = np.zeros_like(old_frame)

while(1):
    ret, frame = cap.read()
    if not ret:
        break # end of the video

    frame_gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)

    # calculate optical flow
    p1, st, err = cv.calcOpticalFlowPyrLK(old_gray, frame_gray, p0, None, **lk_params)

    # Select good points (status flag st is 1 for successfully tracked points)
    good_new = p1[st == 1]
    good_old = p0[st == 1]

    # draw the tracks (drawing functions need integer coordinates)
    for i, (new, old) in enumerate(zip(good_new, good_old)):
        a, b = new.ravel()
        c, d = old.ravel()
        mask = cv.line(mask, (int(a), int(b)), (int(c), int(d)), color[i].tolist(), 2)
        frame = cv.circle(frame, (int(a), int(b)), 5, color[i].tolist(), -1)
    img = cv.add(frame, mask)

    cv.imshow('frame', img)
    k = cv.waitKey(30) & 0xff
    if k == 27:
        break

    # Now update the previous frame and previous points
    old_gray = frame_gray.copy()
    p0 = good_new.reshape(-1, 1, 2)
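As written, the tracker only ever loses points: entries with status 0 are dropped each frame and nothing replenishes them. A minimal sketch of one common remedy, re-detecting corners when too few tracks survive (the threshold 10 is illustrative; this would sit at the end of the loop body above):

    # sketch: re-seed tracking points when most have been lost
    if len(good_new) < 10:                  # illustrative threshold
        fresh = cv.goodFeaturesToTrack(frame_gray, mask=None, **feature_params)
        if fresh is not None:
            p0 = fresh                      # restart from fresh corners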
@ -0,0 +1,23 @@
import numpy as np
import cv2 as cv

cap = cv.VideoCapture(cv.samples.findFile("vtest.avi"))
ret, frame1 = cap.read()
prvs = cv.cvtColor(frame1, cv.COLOR_BGR2GRAY)
hsv = np.zeros_like(frame1)
hsv[..., 1] = 255

while(1):
    ret, frame2 = cap.read()
    if not ret:
        break # end of the video

    next = cv.cvtColor(frame2, cv.COLOR_BGR2GRAY)
    flow = cv.calcOpticalFlowFarneback(prvs, next, None, 0.5, 3, 15, 3, 5, 1.2, 0)

    # encode flow direction as hue and magnitude as value
    mag, ang = cv.cartToPolar(flow[..., 0], flow[..., 1])
    hsv[..., 0] = ang*180/np.pi/2
    hsv[..., 2] = cv.normalize(mag, None, 0, 255, cv.NORM_MINMAX)
    bgr = cv.cvtColor(hsv, cv.COLOR_HSV2BGR)

    cv.imshow('frame2', bgr)
    k = cv.waitKey(30) & 0xff
    if k == 27:
        break
    elif k == ord('s'):
        cv.imwrite('opticalfb.png', frame2)
        cv.imwrite('opticalhsv.png', bgr)
    prvs = next
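cv.calcOpticalFlowFarneback returns a two-channel float32 array of per-pixel displacements, so flow[y, x] holds the (dx, dy) motion of pixel (x, y); the HSV mapping above turns direction into hue and magnitude into brightness. A minimal sketch of reading the motion at a single location (the coordinates are illustrative):

    # sketch: sample the dense flow field computed in the loop above
    dx, dy = flow[100, 200]                 # displacement of pixel (x=200, y=100)
    print('pixel (200, 100) moved by (%.2f, %.2f)' % (dx, dy))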
samples/samples_utils.cmake
@ -0,0 +1,31 @@
# Utility function: adds sample executable target with name "example_<group>_<file_name>"
# Usage:
#   ocv_define_sample(<output target> <relative filename> <group>)
function(ocv_define_sample out_target source sub)
  get_filename_component(name "${source}" NAME_WE)
  set(the_target "example_${sub}_${name}")
  add_executable(${the_target} "${source}")
  if(TARGET Threads::Threads AND NOT OPENCV_EXAMPLES_DISABLE_THREADS)
    target_link_libraries(${the_target} LINK_PRIVATE Threads::Threads)
  endif()
  set_target_properties(${the_target} PROPERTIES PROJECT_LABEL "(sample) ${name}")
  if(ENABLE_SOLUTION_FOLDERS)
    set_target_properties(${the_target} PROPERTIES FOLDER "samples/${sub}")
  endif()
  if(WIN32 AND MSVC AND NOT BUILD_SHARED_LIBS)
    set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:atlthunk.lib /NODEFAULTLIB:atlsd.lib /DEBUG")
  endif()
  if(WIN32)
    install(TARGETS ${the_target} RUNTIME DESTINATION "samples/${sub}" COMPONENT samples)
  endif()
  # Add single target to build all samples in the group: 'make opencv_samples_cpp'
  set(parent_target opencv_samples_${sub})
  if(NOT TARGET ${parent_target})
    add_custom_target(${parent_target})
    if(TARGET opencv_samples)
      add_dependencies(opencv_samples ${parent_target})
    endif()
  endif()
  add_dependencies(${parent_target} ${the_target})
  set(${out_target} ${the_target} PARENT_SCOPE)
endfunction()
@ -22,5 +22,5 @@ ocv_include_modules_recurse(${OPENCV_TAPI_SAMPLES_REQUIRED_DEPS})
file(GLOB all_samples RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
foreach(sample_filename ${all_samples})
  ocv_define_sample(tgt ${sample_filename} tapi)
-  ocv_target_link_libraries(${tgt} ${OPENCV_LINKER_LIBS} ${OPENCV_TAPI_SAMPLES_REQUIRED_DEPS})
+  ocv_target_link_libraries(${tgt} LINK_PRIVATE ${OPENCV_LINKER_LIBS} ${OPENCV_TAPI_SAMPLES_REQUIRED_DEPS})
endforeach()
@ -17,5 +17,5 @@ ocv_include_modules_recurse(${OPENCV_VA_INTEL_SAMPLES_REQUIRED_DEPS})
file(GLOB all_samples RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
foreach(sample_filename ${all_samples})
  ocv_define_sample(tgt ${sample_filename} va_intel)
-  ocv_target_link_libraries(${tgt} ${OPENCV_LINKER_LIBS} ${OPENCV_VA_INTEL_SAMPLES_REQUIRED_DEPS} ${VA_LIBRARIES} ${VA_INTEL_LIBRARIES})
+  ocv_target_link_libraries(${tgt} LINK_PRIVATE ${OPENCV_LINKER_LIBS} ${OPENCV_VA_INTEL_SAMPLES_REQUIRED_DEPS} ${VA_LIBRARIES} ${VA_INTEL_LIBRARIES})
endforeach()