diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt
index 0c9f709b15..26bf624c66 100644
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -21,7 +21,7 @@ source_group("Src\\Cuda"      FILES ${lib_cuda} ${lib_cuda_hdrs})
 source_group("Device"         FILES ${lib_device_hdrs})
 source_group("Device\\Detail" FILES ${lib_device_hdrs_detail})
 
-if (HAVE_CUDA)
+if(HAVE_CUDA)
   file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp" "src/nvidia/*.h*")
   file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu")
   set(ncv_files ${ncv_srcs} ${ncv_cuda})
@@ -104,3 +104,7 @@ ocv_add_accuracy_tests(FILES "Include" ${test_hdrs}
                        FILES "Src" ${test_srcs}
                        ${nvidia})
 ocv_add_perf_tests()
+
+if(HAVE_CUDA)
+  add_subdirectory(perf4au)
+endif()
diff --git a/modules/gpu/doc/data_structures.rst b/modules/gpu/doc/data_structures.rst
index 68e702a793..1291cf9bb6 100644
--- a/modules/gpu/doc/data_structures.rst
+++ b/modules/gpu/doc/data_structures.rst
@@ -271,41 +271,37 @@ This class encapsulates a queue of asynchronous calls. Some functions have overl
     class CV_EXPORTS Stream
     {
     public:
-            Stream();
-            ~Stream();
+        Stream();
+        ~Stream();
 
-            Stream(const Stream&);
-            Stream& operator=(const Stream&);
+        Stream(const Stream&);
+        Stream& operator=(const Stream&);
 
-            bool queryIfComplete();
-            void waitForCompletion();
+        bool queryIfComplete();
+        void waitForCompletion();
 
-            //! downloads asynchronously.
-            // Warning! cv::Mat must point to page locked memory
-                     (i.e. to CudaMem data or to its subMat)
-            void enqueueDownload(const GpuMat& src, CudaMem& dst);
-            void enqueueDownload(const GpuMat& src, Mat& dst);
+        void enqueueDownload(const GpuMat& src, CudaMem& dst);
+        void enqueueDownload(const GpuMat& src, Mat& dst);
 
-            //! uploads asynchronously.
-            // Warning! cv::Mat must point to page locked memory
-                     (i.e. to CudaMem data or to its ROI)
-            void enqueueUpload(const CudaMem& src, GpuMat& dst);
-            void enqueueUpload(const Mat& src, GpuMat& dst);
+        void enqueueUpload(const CudaMem& src, GpuMat& dst);
+        void enqueueUpload(const Mat& src, GpuMat& dst);
 
-            void enqueueCopy(const GpuMat& src, GpuMat& dst);
+        void enqueueCopy(const GpuMat& src, GpuMat& dst);
 
-            void enqueueMemSet(const GpuMat& src, Scalar val);
-            void enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask);
+        void enqueueMemSet(GpuMat& src, Scalar val);
+        void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask);
 
-            // converts matrix type, ex from float to uchar depending on type
-            void enqueueConvert(const GpuMat& src, GpuMat& dst, int type,
-                    double a = 1, double b = 0);
+        void enqueueConvert(const GpuMat& src, GpuMat& dst, int type,
+                            double a = 1, double b = 0);
+
+        typedef void (*StreamCallback)(Stream& stream, int status, void* userData);
+        void enqueueHostCallback(StreamCallback callback, void* userData);
     };
 
 
 
 gpu::Stream::queryIfComplete
---------------------------------
+----------------------------
 Returns ``true`` if the current stream queue is finished. Otherwise, it returns false.
 
 .. ocv:function:: bool gpu::Stream::queryIfComplete()
@@ -313,13 +309,73 @@ Returns ``true`` if the current stream queue is finished. Otherwise, it returns
 
 
 gpu::Stream::waitForCompletion
-----------------------------------
+------------------------------
 Blocks the current CPU thread until all operations in the stream are complete.
 
 .. ocv:function:: void gpu::Stream::waitForCompletion()
 
 
 
+gpu::Stream::enqueueDownload
+----------------------------
+Copies data from device to host.
+
+.. ocv:function:: void gpu::Stream::enqueueDownload(const GpuMat& src, CudaMem& dst)
+
+.. ocv:function:: void gpu::Stream::enqueueDownload(const GpuMat& src, Mat& dst)
+
+.. note:: ``cv::Mat`` must point to page locked memory (i.e. to ``CudaMem`` data or to its subMat) or must be registered with :ocv:func:`gpu::registerPageLocked` .
+
+
+
+gpu::Stream::enqueueUpload
+--------------------------
+Copies data from host to device.
+
+.. ocv:function:: void gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst)
+
+.. ocv:function:: void gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst)
+
+.. note:: ``cv::Mat`` must point to page locked memory (i.e. to ``CudaMem`` data or to its subMat) or must be registered with :ocv:func:`gpu::registerPageLocked` .
+
+
+
+gpu::Stream::enqueueCopy
+------------------------
+Copies data from device to device.
+
+.. ocv:function:: void gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst)
+
+
+
+gpu::Stream::enqueueMemSet
+--------------------------
+Initializes or sets device memory to a value.
+
+.. ocv:function:: void gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val)
+
+.. ocv:function:: void gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
+
+
+
+gpu::Stream::enqueueConvert
+---------------------------
+Converts matrix type (for example, from float to uchar) depending on ``type``.
+
+.. ocv:function:: void gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0)
+
+
+
+gpu::Stream::enqueueHostCallback
+--------------------------------
+Adds a callback to be called on the host after all currently enqueued items in the stream have completed.
+
+.. ocv:function:: void gpu::Stream::enqueueHostCallback(StreamCallback callback, void* userData)
+
+.. note:: Callbacks must not make any CUDA API calls. Callbacks must not perform any synchronization that may depend on outstanding device work or other callbacks that are not mandated to run earlier.  Callbacks without a mandated order (in independent streams) execute in undefined order and may be serialized.
+
+
+
 gpu::StreamAccessor
 -------------------
 .. ocv:struct:: gpu::StreamAccessor
diff --git a/modules/gpu/doc/matrix_reductions.rst b/modules/gpu/doc/matrix_reductions.rst
index 538267eb7a..e9229f8a81 100644
--- a/modules/gpu/doc/matrix_reductions.rst
+++ b/modules/gpu/doc/matrix_reductions.rst
@@ -32,6 +32,8 @@ Returns the norm of a matrix (or difference of two matrices).
 
 .. ocv:function:: double gpu::norm(const GpuMat& src1, int normType, GpuMat& buf)
 
+.. ocv:function:: double gpu::norm(const GpuMat& src1, int normType, const GpuMat& mask, GpuMat& buf)
+
 .. ocv:function:: double gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2)
 
     :param src1: Source matrix. Any matrices except 64F are supported.
@@ -40,6 +42,8 @@ Returns the norm of a matrix (or difference of two matrices).
 
     :param normType: Norm type.  ``NORM_L1`` ,  ``NORM_L2`` , and  ``NORM_INF``  are supported for now.
 
+    :param mask: optional operation mask; it must have the same size as ``src1`` and ``CV_8UC1`` type.
+
     :param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
 
 .. seealso:: :ocv:func:`norm`
@@ -54,8 +58,12 @@ Returns the sum of matrix elements.
 
 .. ocv:function:: Scalar gpu::sum(const GpuMat& src, GpuMat& buf)
 
+.. ocv:function:: Scalar gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
+
     :param src: Source image of any depth except for ``CV_64F`` .
 
+    :param mask: optional operation mask; it must have the same size as ``src`` and ``CV_8UC1`` type.
+
     :param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
 
 .. seealso:: :ocv:func:`sum`
@@ -70,8 +78,12 @@ Returns the sum of absolute values for matrix elements.
 
 .. ocv:function:: Scalar gpu::absSum(const GpuMat& src, GpuMat& buf)
 
+.. ocv:function:: Scalar gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
+
     :param src: Source image of any depth except for ``CV_64F`` .
 
+    :param mask: optional operation mask; it must have the same size as ``src`` and ``CV_8UC1`` type.
+
     :param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
 
 
@@ -84,8 +96,12 @@ Returns the squared sum of matrix elements.
 
 .. ocv:function:: Scalar gpu::sqrSum(const GpuMat& src, GpuMat& buf)
 
+.. ocv:function:: Scalar gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
+
     :param src: Source image of any depth except for ``CV_64F`` .
 
+    :param mask: optional operation mask; it must have the same size as ``src`` and ``CV_8UC1`` type.
+
     :param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
 
 
diff --git a/modules/gpu/doc/operations_on_matrices.rst b/modules/gpu/doc/operations_on_matrices.rst
index 7f586a1b02..d1762f442a 100644
--- a/modules/gpu/doc/operations_on_matrices.rst
+++ b/modules/gpu/doc/operations_on_matrices.rst
@@ -242,3 +242,33 @@ Converts polar coordinates into Cartesian.
     :param stream: Stream for the asynchronous version.
 
 .. seealso:: :ocv:func:`polarToCart`
+
+
+
+gpu::normalize
+--------------
+Normalizes the norm or value range of an array.
+
+.. ocv:function:: void gpu::normalize(const GpuMat& src, GpuMat& dst, double alpha = 1, double beta = 0, int norm_type = NORM_L2, int dtype = -1, const GpuMat& mask = GpuMat())
+
+.. ocv:function:: void gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf)
+
+    :param src: input array.
+
+    :param dst: output array of the same size as  ``src`` .
+
+    :param alpha: norm value to normalize to or the lower range boundary in case of the range normalization.
+
+    :param beta: upper range boundary in case of the range normalization; it is not used for the norm normalization.
+
+    :param norm_type: normalization type (see :ocv:func:`normalize` for details).
+
+    :param dtype: when negative, the output array has the same type as ``src``; otherwise, it has the same number of channels as  ``src`` and the depth ``=CV_MAT_DEPTH(dtype)``.
+
+    :param mask: optional operation mask.
+
+    :param norm_buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
+
+    :param cvt_buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
+
+.. seealso:: :ocv:func:`normalize`
diff --git a/modules/gpu/doc/per_element_operations.rst b/modules/gpu/doc/per_element_operations.rst
index a59875e646..2670ba3233 100644
--- a/modules/gpu/doc/per_element_operations.rst
+++ b/modules/gpu/doc/per_element_operations.rst
@@ -276,6 +276,8 @@ Compares elements of two matrices.
 
 .. ocv:function:: void gpu::compare( const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream=Stream::Null() )
 
+.. ocv:function:: void gpu::compare(const GpuMat& a, Scalar sc, GpuMat& c, int cmpop, Stream& stream = Stream::Null())
+
     :param a: First source matrix.
 
     :param b: Second source matrix with the same size and type as ``a`` .
diff --git a/modules/gpu/include/opencv2/gpu/device/warp.hpp b/modules/gpu/include/opencv2/gpu/device/warp.hpp
index d4b0b8d8f7..0f1dc794ab 100644
--- a/modules/gpu/include/opencv2/gpu/device/warp.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/warp.hpp
@@ -97,6 +97,25 @@ namespace cv { namespace gpu { namespace device
             return out;
         }
 
+        template <class T, class BinOp> // combines the 32 values ptr[0..31] pairwise with op and returns ptr[0]
+        static __device__ __forceinline__ T reduce(volatile T *ptr, BinOp op)
+        {
+            const unsigned int lane = laneId(); // NOTE(review): presumably this thread's index within its warp -- confirm against laneId()
+
+            if (lane < 16) // only lanes 0..15 do the combining; the highest index read is ptr[15 + 16] = ptr[31]
+            {
+                T partial = ptr[lane];
+
+                ptr[lane] = partial = op(partial, ptr[lane + 16]); // the partner offset halves each step: 16, 8, 4, 2, 1
+                ptr[lane] = partial = op(partial, ptr[lane + 8]); // the running value is written back to ptr[lane] after every step
+                ptr[lane] = partial = op(partial, ptr[lane + 4]);
+                ptr[lane] = partial = op(partial, ptr[lane + 2]);
+                ptr[lane] = partial = op(partial, ptr[lane + 1]);
+            }
+
+            return *ptr; // all callers read ptr[0]; NOTE(review): no barrier is issued here, this appears to rely on volatile accesses plus lock-step warp execution -- confirm
+        }
+
         template<typename OutIt, typename T>
         static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
         {
@@ -109,4 +128,4 @@ namespace cv { namespace gpu { namespace device
     };
 }}} // namespace cv { namespace gpu { namespace device
 
-#endif /* __OPENCV_GPU_DEVICE_WARP_HPP__ */
\ No newline at end of file
+#endif /* __OPENCV_GPU_DEVICE_WARP_HPP__ */
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 60cff99f6c..7cc57e49af 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -145,43 +145,49 @@ public:
     ~Stream();
 
     Stream(const Stream&);
-    Stream& operator=(const Stream&);
+    Stream& operator =(const Stream&);
 
     bool queryIfComplete();
     void waitForCompletion();
 
-    //! downloads asynchronously.
+    //! downloads asynchronously
     // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its subMat)
     void enqueueDownload(const GpuMat& src, CudaMem& dst);
     void enqueueDownload(const GpuMat& src, Mat& dst);
 
-    //! uploads asynchronously.
+    //! uploads asynchronously
     // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its ROI)
     void enqueueUpload(const CudaMem& src, GpuMat& dst);
     void enqueueUpload(const Mat& src, GpuMat& dst);
 
+    //! copy asynchronously
     void enqueueCopy(const GpuMat& src, GpuMat& dst);
 
+    //! memory set asynchronously
     void enqueueMemSet(GpuMat& src, Scalar val);
     void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask);
 
-    // converts matrix type, ex from float to uchar depending on type
-    void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0);
+    //! converts matrix type (e.g. from float to uchar) depending on dtype
+    void enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, double a = 1, double b = 0);
+
+    //! adds a callback to be called on the host after all currently enqueued items in the stream have completed
+    typedef void (*StreamCallback)(Stream& stream, int status, void* userData);
+    void enqueueHostCallback(StreamCallback callback, void* userData);
 
     static Stream& Null();
 
     operator bool() const;
 
 private:
+    struct Impl;
+
+    explicit Stream(Impl* impl);
     void create();
     void release();
 
-    struct Impl;
     Impl *impl;
 
     friend struct StreamAccessor;
-
-    explicit Stream(Impl* impl);
 };
 
 
@@ -459,6 +465,12 @@ CV_EXPORTS void cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& magnitude,
 //! supports only floating-point source
 CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees = false, Stream& stream = Stream::Null());
 
+//! scales and shifts array elements so that either the specified norm (alpha) or the minimum (alpha) and maximum (beta) array values get the specified values
+CV_EXPORTS void normalize(const GpuMat& src, GpuMat& dst, double alpha = 1, double beta = 0,
+                          int norm_type = NORM_L2, int dtype = -1, const GpuMat& mask = GpuMat());
+CV_EXPORTS void normalize(const GpuMat& src, GpuMat& dst, double a, double b,
+                          int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf);
+
 
 //////////////////////////// Per-element operations ////////////////////////////////////
 
@@ -527,6 +539,7 @@ CV_EXPORTS void pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream
 
 //! compares elements of two arrays (c = a <cmpop> b)
 CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream = Stream::Null());
+CV_EXPORTS void compare(const GpuMat& a, Scalar sc, GpuMat& c, int cmpop, Stream& stream = Stream::Null());
 
 //! performs per-elements bit-wise inversion
 CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
@@ -854,6 +867,11 @@ CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, float rho, float th
 CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, HoughLinesBuf& buf, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096);
 CV_EXPORTS void HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines, OutputArray h_votes = noArray());
 
+//! HoughLinesP
+
+//! finds line segments in the black-n-white image using probabilistic Hough transform
+CV_EXPORTS void HoughLinesP(const GpuMat& image, GpuMat& lines, HoughLinesBuf& buf, float rho, float theta, int minLineLength, int maxLineGap, int maxLines = 4096);
+
 //! HoughCircles
 
 struct HoughCirclesBuf
@@ -912,11 +930,8 @@ CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev, GpuM
 //! supports NORM_INF, NORM_L1, NORM_L2
 //! supports all matrices except 64F
 CV_EXPORTS double norm(const GpuMat& src1, int normType=NORM_L2);
-
-//! computes norm of array
-//! supports NORM_INF, NORM_L1, NORM_L2
-//! supports all matrices except 64F
 CV_EXPORTS double norm(const GpuMat& src1, int normType, GpuMat& buf);
+CV_EXPORTS double norm(const GpuMat& src1, int normType, const GpuMat& mask, GpuMat& buf);
 
 //! computes norm of the difference between two arrays
 //! supports NORM_INF, NORM_L1, NORM_L2
@@ -926,45 +941,33 @@ CV_EXPORTS double norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM
 //! computes sum of array elements
 //! supports only single channel images
 CV_EXPORTS Scalar sum(const GpuMat& src);
-
-//! computes sum of array elements
-//! supports only single channel images
 CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);
+CV_EXPORTS Scalar sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf);
 
 //! computes sum of array elements absolute values
 //! supports only single channel images
 CV_EXPORTS Scalar absSum(const GpuMat& src);
-
-//! computes sum of array elements absolute values
-//! supports only single channel images
 CV_EXPORTS Scalar absSum(const GpuMat& src, GpuMat& buf);
+CV_EXPORTS Scalar absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf);
 
 //! computes squared sum of array elements
 //! supports only single channel images
 CV_EXPORTS Scalar sqrSum(const GpuMat& src);
-
-//! computes squared sum of array elements
-//! supports only single channel images
 CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf);
+CV_EXPORTS Scalar sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf);
 
 //! finds global minimum and maximum array elements and returns their values
 CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());
-
-//! finds global minimum and maximum array elements and returns their values
 CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf);
 
 //! finds global minimum and maximum array elements and returns their values with locations
 CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0,
                           const GpuMat& mask=GpuMat());
-
-//! finds global minimum and maximum array elements and returns their values with locations
 CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
                           const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf);
 
 //! counts non-zero array elements
 CV_EXPORTS int countNonZero(const GpuMat& src);
-
-//! counts non-zero array elements
 CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);
 
 //! reduces a matrix to a vector
@@ -1982,6 +1985,113 @@ private:
 };
 
 
+// Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method
+//
+// see reference:
+//   [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
+//   [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
+class CV_EXPORTS OpticalFlowDual_TVL1_GPU
+{
+public:
+    OpticalFlowDual_TVL1_GPU();
+
+    void operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy);
+
+    void collectGarbage();
+
+    /**
+     * Time step of the numerical scheme.
+     */
+    double tau;
+
+    /**
+     * Weight parameter for the data term, attachment parameter.
+     * This is the most relevant parameter, which determines the smoothness of the output.
+     * The smaller this parameter is, the smoother the solutions we obtain.
+     * It depends on the range of motions of the images, so its value should be adapted to each image sequence.
+     */
+    double lambda;
+
+    /**
+     * Weight parameter for (u - v)^2, tightness parameter.
+     * It serves as a link between the attachment and the regularization terms.
+     * In theory, it should have a small value in order to maintain both parts in correspondence.
+     * The method is stable for a large range of values of this parameter.
+     */
+    double theta;
+
+    /**
+     * Number of scales used to create the pyramid of images.
+     */
+    int nscales;
+
+    /**
+     * Number of warpings per scale.
+     * Represents the number of times that I1(x+u0) and grad( I1(x+u0) ) are computed per scale.
+     * This is a parameter that assures the stability of the method.
+     * It also affects the running time, so it is a compromise between speed and accuracy.
+     */
+    int warps;
+
+    /**
+     * Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time.
+     * A small value will yield more accurate solutions at the expense of a slower convergence.
+     */
+    double epsilon;
+
+    /**
+     * Stopping criterion iterations number used in the numerical scheme.
+     */
+    int iterations;
+
+    bool useInitialFlow;
+
+private:
+    void procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2);
+
+    std::vector<GpuMat> I0s;
+    std::vector<GpuMat> I1s;
+    std::vector<GpuMat> u1s;
+    std::vector<GpuMat> u2s;
+
+    GpuMat I1x_buf;
+    GpuMat I1y_buf;
+
+    GpuMat I1w_buf;
+    GpuMat I1wx_buf;
+    GpuMat I1wy_buf;
+
+    GpuMat grad_buf;
+    GpuMat rho_c_buf;
+
+    GpuMat p11_buf;
+    GpuMat p12_buf;
+    GpuMat p21_buf;
+    GpuMat p22_buf;
+
+    GpuMat diff_buf;
+    GpuMat norm_buf;
+};
+
+
+//! Calculates optical flow for 2 images using block matching algorithm
+CV_EXPORTS void calcOpticalFlowBM(const GpuMat& prev, const GpuMat& curr,
+                                  Size block_size, Size shift_size, Size max_range, bool use_previous,
+                                  GpuMat& velx, GpuMat& vely, GpuMat& buf,
+                                  Stream& stream = Stream::Null());
+
+class CV_EXPORTS FastOpticalFlowBM
+{
+public:
+    void operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, int search_window = 21, int block_window = 7, Stream& s = Stream::Null());
+
+private:
+    GpuMat buffer;
+    GpuMat extended_I0;
+    GpuMat extended_I1;
+};
+
+
 //! Interpolate frames (images) using provided optical flow (displacement field).
 //! frame0   - frame 0 (32-bit floating point images, single channel)
 //! frame1   - frame 1 (the same type and size)
diff --git a/modules/gpu/perf/perf_core.cpp b/modules/gpu/perf/perf_core.cpp
index cfd572dc16..b97c4999cd 100644
--- a/modules/gpu/perf/perf_core.cpp
+++ b/modules/gpu/perf/perf_core.cpp
@@ -647,6 +647,39 @@ PERF_TEST_P(Sz_Depth_Code, Core_CompareMat, Combine(GPU_TYPICAL_MAT_SIZES, ARITH
     }
 }
 
+//////////////////////////////////////////////////////////////////////
+// CompareScalar
+
+PERF_TEST_P(Sz_Depth_Code, Core_CompareScalar, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_MAT_DEPTH, ALL_CMP_CODES)) // times matrix-vs-scalar comparison: cv::gpu::compare against cv::compare
+{
+    const cv::Size size = GET_PARAM(0); // test parameters arrive in the order size, depth, comparison code
+    const int depth = GET_PARAM(1);
+    const int cmp_code = GET_PARAM(2);
+
+    cv::Mat src(size, depth);
+    fillRandom(src);
+
+    cv::Scalar s = cv::Scalar::all(100); // the same scalar operand is used for the GPU and the CPU run
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat d_src(src); // d_src is constructed once, outside the measured TEST_CYCLE statement
+        cv::gpu::GpuMat d_dst;
+
+        TEST_CYCLE() cv::gpu::compare(d_src, s, d_dst, cmp_code);
+
+        GPU_SANITY_CHECK(d_dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::compare(src, s, dst, cmp_code);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
 //////////////////////////////////////////////////////////////////////
 // BitwiseNot
 
@@ -1598,7 +1631,7 @@ PERF_TEST_P(Sz_Depth_Norm, Core_Norm, Combine(
         cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() dst = cv::gpu::norm(d_src, normType, d_buf);
+        TEST_CYCLE() dst = cv::gpu::norm(d_src, normType, cv::gpu::GpuMat(), d_buf);
     }
     else
     {
@@ -1668,7 +1701,7 @@ PERF_TEST_P(Sz_Depth_Cn, Core_Sum, Combine(
         cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() dst = cv::gpu::sum(d_src, d_buf);
+        TEST_CYCLE() dst = cv::gpu::sum(d_src, cv::gpu::GpuMat(), d_buf);
     }
     else
     {
@@ -1703,7 +1736,7 @@ PERF_TEST_P(Sz_Depth_Cn, Core_SumAbs, Combine(
         cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() dst = cv::gpu::absSum(d_src, d_buf);
+        TEST_CYCLE() dst = cv::gpu::absSum(d_src, cv::gpu::GpuMat(), d_buf);
 
         SANITY_CHECK(dst, 1e-6);
     }
@@ -1737,7 +1770,7 @@ PERF_TEST_P(Sz_Depth_Cn, Core_SumSqr, Combine(
         cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() dst = cv::gpu::sqrSum(d_src, d_buf);
+        TEST_CYCLE() dst = cv::gpu::sqrSum(d_src, cv::gpu::GpuMat(), d_buf);
 
         SANITY_CHECK(dst, 1e-6);
     }
@@ -1893,4 +1926,48 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Core_Reduce, Combine(
     }
 }
 
+//////////////////////////////////////////////////////////////////////
+// Normalize
+
+DEF_PARAM_TEST(Sz_Depth_NormType, cv::Size, MatDepth, NormType); // parameter tuple: matrix size, matrix depth, norm type
+
+PERF_TEST_P(Sz_Depth_NormType, Core_Normalize, Combine( // times cv::gpu::normalize (explicit-buffer overload) against cv::normalize
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32F, CV_64F),
+    Values(NormType(cv::NORM_INF),
+           NormType(cv::NORM_L1),
+           NormType(cv::NORM_L2),
+           NormType(cv::NORM_MINMAX))
+    ))
+{
+    cv::Size size = GET_PARAM(0);
+    int type = GET_PARAM(1); // one of the depth values listed above; also passed as the dtype argument of both normalize calls
+    int norm_type = GET_PARAM(2);
+
+    double alpha = 1;
+    double beta = 0;
+
+    cv::Mat src(size, type);
+    fillRandom(src);
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;
+        cv::gpu::GpuMat d_norm_buf, d_cvt_buf; // scratch buffers declared outside the measured statement and handed to every call
+
+        TEST_CYCLE() cv::gpu::normalize(d_src, d_dst, alpha, beta, norm_type, type, cv::gpu::GpuMat(), d_norm_buf, d_cvt_buf); // an empty GpuMat is passed as the mask
+
+        GPU_SANITY_CHECK(d_dst, 1); // NOTE(review): the second argument is presumably the allowed tolerance -- confirm
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::normalize(src, dst, alpha, beta, norm_type, type);
+
+        CPU_SANITY_CHECK(dst, 1);
+    }
+}
+
 } // namespace
diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp
index 3f399cd5fe..e3d488ec94 100644
--- a/modules/gpu/perf/perf_imgproc.cpp
+++ b/modules/gpu/perf/perf_imgproc.cpp
@@ -1706,6 +1706,16 @@ PERF_TEST_P(Sz_Depth_Cn, ImgProc_ImagePyramidGetLayer, Combine(GPU_TYPICAL_MAT_S
 }
 
 namespace {
+    struct Vec4iComparator // "less than" over cv::Vec4i: compares components 0..3 in order, first difference decides
+    {
+        bool operator()(const cv::Vec4i& a, const cv::Vec4i& b) const // both operands by const reference, so b is not copied per comparison
+        {
+            if (a[0] != b[0]) return a[0] < b[0];
+            else if(a[1] != b[1]) return a[1] < b[1];
+            else if(a[2] != b[2]) return a[2] < b[2];
+            else return a[3] < b[3];
+        }
+    };
     struct Vec3fComparator
     {
         bool operator()(const cv::Vec3f& a, const cv::Vec3f b) const
@@ -1784,6 +1794,62 @@ PERF_TEST_P(Sz, ImgProc_HoughLines, GPU_TYPICAL_MAT_SIZES)
     }
 }
 
+//////////////////////////////////////////////////////////////////////
+// HoughLinesP
+
+DEF_PARAM_TEST_1(Image, std::string);
+
+PERF_TEST_P(Image, ImgProc_HoughLinesP, testing::Values("cv/shared/pic5.png", "stitching/a1.png")) // times cv::gpu::HoughLinesP against cv::HoughLinesP on a Canny edge map
+{
+    declare.time(30.0);
+
+    std::string fileName = getDataPath(GetParam());
+
+    const float rho = 1.0f;
+    const float theta = static_cast<float>(CV_PI / 180.0);
+    const int threshold = 100; // passed to the CPU call only; the GPU call below takes no threshold argument
+    const int minLineLength = 50;
+    const int maxLineGap = 5;
+
+    cv::Mat image = cv::imread(fileName, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(image.empty()); // fail right here if the test image could not be loaded, before it reaches cv::Canny
+    cv::Mat mask;
+    cv::Canny(image, mask, 50, 100);
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat d_mask(mask);
+        cv::gpu::GpuMat d_lines;
+        cv::gpu::HoughLinesBuf d_buf;
+
+        cv::gpu::HoughLinesP(d_mask, d_lines, d_buf, rho, theta, minLineLength, maxLineGap); // one call outside the measured cycle
+
+        TEST_CYCLE()
+        {
+            cv::gpu::HoughLinesP(d_mask, d_lines, d_buf, rho, theta, minLineLength, maxLineGap);
+        }
+
+        cv::Mat h_lines(d_lines); // host-side cv::Mat built from the GPU result
+        cv::Vec4i* begin = h_lines.ptr<cv::Vec4i>();
+        cv::Vec4i* end = h_lines.ptr<cv::Vec4i>() + h_lines.cols;
+        std::sort(begin, end, Vec4iComparator()); // put the segments into a fixed order before they are checked
+        SANITY_CHECK(h_lines);
+    }
+    else
+    {
+        std::vector<cv::Vec4i> lines;
+        cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLength, maxLineGap); // one call outside the measured cycle
+
+        TEST_CYCLE()
+        {
+            cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLength, maxLineGap);
+        }
+
+        std::sort(lines.begin(), lines.end(), Vec4iComparator()); // same ordering as the GPU branch
+        SANITY_CHECK(lines);
+    }
+}
+
 //////////////////////////////////////////////////////////////////////
 // HoughCircles
 
diff --git a/modules/gpu/perf/perf_video.cpp b/modules/gpu/perf/perf_video.cpp
index b18cb17dfb..83213a1613 100644
--- a/modules/gpu/perf/perf_video.cpp
+++ b/modules/gpu/perf/perf_video.cpp
@@ -394,6 +394,173 @@ PERF_TEST_P(ImagePair, Video_FarnebackOpticalFlow,
     }
 }
 
+//////////////////////////////////////////////////////
+// OpticalFlowDual_TVL1: times cv::gpu::OpticalFlowDual_TVL1_GPU against the CPU
+// algorithm returned by cv::createOptFlow_DualTVL1() on one grayscale frame pair.
+PERF_TEST_P(ImagePair, Video_OpticalFlowDual_TVL1,
+    Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
+{
+    declare.time(20);
+
+    cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty());
+
+    cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat d_frame0(frame0);
+        cv::gpu::GpuMat d_frame1(frame1);
+        cv::gpu::GpuMat d_flowx;
+        cv::gpu::GpuMat d_flowy;
+
+        cv::gpu::OpticalFlowDual_TVL1_GPU d_alg;
+
+        d_alg(d_frame0, d_frame1, d_flowx, d_flowy); // one call outside the measured loop
+
+        TEST_CYCLE()
+        {
+            d_alg(d_frame0, d_frame1, d_flowx, d_flowy);
+        }
+
+        GPU_SANITY_CHECK(d_flowx);
+        GPU_SANITY_CHECK(d_flowy);
+    }
+    else
+    {
+        cv::Mat flow; // the CPU algorithm writes one flow matrix instead of separate x/y outputs
+
+        cv::Ptr<cv::DenseOpticalFlow> alg = cv::createOptFlow_DualTVL1();
+
+        alg->calc(frame0, frame1, flow); // one call outside the measured loop
+
+        TEST_CYCLE()
+        {
+            alg->calc(frame0, frame1, flow);
+        }
+
+        CPU_SANITY_CHECK(flow);
+    }
+}
+
+//////////////////////////////////////////////////////
+// OpticalFlowBM
+// cv::Mat front end for cvCalcOpticalFlowBM; (re)creates velx and vely as CV_32FC1.
+void calcOpticalFlowBM(const cv::Mat& prev, const cv::Mat& curr,
+                       cv::Size bSize, cv::Size shiftSize, cv::Size maxRange, int usePrevious,
+                       cv::Mat& velx, cv::Mat& vely)
+{
+    // Output size derived from the current frame, the block size and the shift step.
+    const int flowCols = (curr.cols - bSize.width + shiftSize.width) / shiftSize.width;
+    const int flowRows = (curr.rows - bSize.height + shiftSize.height) / shiftSize.height;
+    const cv::Size flowSize(flowCols, flowRows);
+    velx.create(flowSize, CV_32FC1);
+    vely.create(flowSize, CV_32FC1);
+
+    // cvCalcOpticalFlowBM takes CvMat pointers, so build CvMat values from the cv::Mat arguments.
+    CvMat prevHdr = prev, currHdr = curr;
+    CvMat velxHdr = velx, velyHdr = vely;
+
+    cvCalcOpticalFlowBM(&prevHdr, &currHdr, bSize, shiftSize, maxRange, usePrevious, &velxHdr, &velyHdr);
+}
+// Times cv::gpu::calcOpticalFlowBM against the calcOpticalFlowBM helper above on one grayscale frame pair.
+PERF_TEST_P(ImagePair, Video_OpticalFlowBM,
+    Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
+{
+    declare.time(400);
+
+    cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty());
+
+    cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+
+    cv::Size block_size(16, 16);
+    cv::Size shift_size(1, 1);
+    cv::Size max_range(16, 16);
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat d_frame0(frame0);
+        cv::gpu::GpuMat d_frame1(frame1);
+        cv::gpu::GpuMat d_velx, d_vely, buf; // buf is handed to every call, so the same object is reused across iterations
+
+        cv::gpu::calcOpticalFlowBM(d_frame0, d_frame1, block_size, shift_size, max_range, false, d_velx, d_vely, buf);
+
+        TEST_CYCLE()
+        {
+            cv::gpu::calcOpticalFlowBM(d_frame0, d_frame1, block_size, shift_size, max_range, false, d_velx, d_vely, buf);
+        }
+
+        GPU_SANITY_CHECK(d_velx);
+        GPU_SANITY_CHECK(d_vely);
+    }
+    else
+    {
+        cv::Mat velx, vely;
+
+        calcOpticalFlowBM(frame0, frame1, block_size, shift_size, max_range, false, velx, vely); // false is the helper's usePrevious argument
+
+        TEST_CYCLE()
+        {
+            calcOpticalFlowBM(frame0, frame1, block_size, shift_size, max_range, false, velx, vely);
+        }
+
+        CPU_SANITY_CHECK(velx);
+        CPU_SANITY_CHECK(vely);
+    }
+}
+// Times cv::gpu::FastOpticalFlowBM; the CPU branch times the calcOpticalFlowBM helper, the same baseline as Video_OpticalFlowBM.
+PERF_TEST_P(ImagePair, Video_FastOpticalFlowBM,
+    Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
+{
+    declare.time(400);
+
+    cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty());
+
+    cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+
+    cv::Size block_size(16, 16);
+    cv::Size shift_size(1, 1); // used by the CPU branch only
+    cv::Size max_range(16, 16);
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat d_frame0(frame0);
+        cv::gpu::GpuMat d_frame1(frame1);
+        cv::gpu::GpuMat d_velx, d_vely;
+
+        cv::gpu::FastOpticalFlowBM fastBM;
+
+        fastBM(d_frame0, d_frame1, d_velx, d_vely, max_range.width, block_size.width); // only the widths of the two sizes are passed
+
+        TEST_CYCLE()
+        {
+            fastBM(d_frame0, d_frame1, d_velx, d_vely, max_range.width, block_size.width);
+        }
+
+        GPU_SANITY_CHECK(d_velx);
+        GPU_SANITY_CHECK(d_vely);
+    }
+    else
+    {
+        cv::Mat velx, vely;
+
+        calcOpticalFlowBM(frame0, frame1, block_size, shift_size, max_range, false, velx, vely);
+
+        TEST_CYCLE()
+        {
+            calcOpticalFlowBM(frame0, frame1, block_size, shift_size, max_range, false, velx, vely);
+        }
+
+        CPU_SANITY_CHECK(velx);
+        CPU_SANITY_CHECK(vely);
+    }
+}
+
 //////////////////////////////////////////////////////
 // FGDStatModel
 
diff --git a/modules/gpu/perf/utility.hpp b/modules/gpu/perf/utility.hpp
index 09b84f53aa..6782b93768 100644
--- a/modules/gpu/perf/utility.hpp
+++ b/modules/gpu/perf/utility.hpp
@@ -17,7 +17,7 @@ CV_ENUM(BorderMode, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONS
 
 CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::INTER_AREA)
 #define ALL_INTERPOLATIONS testing::ValuesIn(Interpolation::all())
-CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_HAMMING)
+CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_HAMMING, cv::NORM_MINMAX)
 
 const int Gray = 1, TwoChannel = 2, BGR = 3, BGRA = 4;
 CV_ENUM(MatCn, Gray, TwoChannel, BGR, BGRA)
diff --git a/modules/gpu/perf4au/CMakeLists.txt b/modules/gpu/perf4au/CMakeLists.txt
new file mode 100644
index 0000000000..7452203826
--- /dev/null
+++ b/modules/gpu/perf4au/CMakeLists.txt
@@ -0,0 +1,28 @@
+set(PERF4AU_REQUIRED_DEPS opencv_core opencv_imgproc opencv_highgui opencv_video opencv_legacy opencv_gpu opencv_ts)
+# NOTE(review): the outcome of this dependency check is not consulted anywhere below - confirm that is intended.
+ocv_check_dependencies(${PERF4AU_REQUIRED_DEPS})
+
+set(the_target gpu_perf4au)
+project(${the_target})
+
+ocv_include_modules(${PERF4AU_REQUIRED_DEPS})
+# The target is built from *.cpp/*.h/*.hpp only, so the warning suppression has to go into the C++ flags.
+if(CMAKE_COMPILER_IS_GNUCXX AND NOT ENABLE_NOISY_WARNINGS)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-function")
+endif()
+
+file(GLOB srcs RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp *.h *.hpp)
+add_executable(${the_target} ${srcs})
+
+target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${PERF4AU_REQUIRED_DEPS})
+
+if(ENABLE_SOLUTION_FOLDERS)
+  set_target_properties(${the_target} PROPERTIES FOLDER "tests performance")
+endif()
+
+if(WIN32)
+    if(MSVC AND NOT BUILD_SHARED_LIBS)
+        set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:atlthunk.lib /NODEFAULTLIB:atlsd.lib /DEBUG")
+    endif()
+endif()
+
diff --git a/modules/gpu/perf4au/im1_1280x800.jpg b/modules/gpu/perf4au/im1_1280x800.jpg
new file mode 100644
index 0000000000..bdbbd4aee9
Binary files /dev/null and b/modules/gpu/perf4au/im1_1280x800.jpg differ
diff --git a/modules/gpu/perf4au/im2_1280x800.jpg b/modules/gpu/perf4au/im2_1280x800.jpg
new file mode 100644
index 0000000000..ae49640a95
Binary files /dev/null and b/modules/gpu/perf4au/im2_1280x800.jpg differ
diff --git a/modules/gpu/perf4au/main.cpp b/modules/gpu/perf4au/main.cpp
new file mode 100644
index 0000000000..80d97ea806
--- /dev/null
+++ b/modules/gpu/perf4au/main.cpp
@@ -0,0 +1,490 @@
+#include <cstdio>
+#ifdef HAVE_CVCONFIG_H
+#include "cvconfig.h"
+#endif
+#include "opencv2/core/core.hpp"
+#include "opencv2/gpu/gpu.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/video/video.hpp"
+#include "opencv2/legacy/legacy.hpp"
+#include "opencv2/ts/ts.hpp"
+#include "opencv2/ts/ts_perf.hpp"
+
+static void printOsInfo()
+{   // Prints a banner naming the OS family and x64/x32 flavour chosen by compile-time macros; prints nothing on other platforms.
+#if defined _WIN32
+#   if defined _WIN64
+        printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x64.\n[----------]\n"); fflush(stdout);
+#   else
+        printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x32.\n[----------]\n"); fflush(stdout);
+#   endif
+#elif defined linux || defined __linux__ // `linux` alone is a GNU extension that strict -std= modes do not define
+#   if defined _LP64
+        printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x64.\n[----------]\n"); fflush(stdout);
+#   else
+        printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x32.\n[----------]\n"); fflush(stdout);
+#   endif
+#elif defined __APPLE__
+#   if defined _LP64
+        printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x64.\n[----------]\n"); fflush(stdout);
+#   else
+        printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x32.\n[----------]\n"); fflush(stdout);
+#   endif
+#endif
+}
+
+static void printCudaInfo()
+{
+    // Device count first, then one block of properties per CUDA device.
+    const int numDevices = cv::gpu::getCudaEnabledDeviceCount();
+    printf("[----------]\n"); fflush(stdout);
+    printf("[ GPU INFO ] \tCUDA device count:: %d.\n", numDevices); fflush(stdout);
+    printf("[----------]\n"); fflush(stdout);
+
+    for (int devId = 0; devId < numDevices; ++devId)
+    {
+        cv::gpu::DeviceInfo dev(devId);
+        // Memory sizes are printed in whole Mb: bytes / 1024 truncated to int, then / 1024 truncated again.
+        printf("[----------]\n"); fflush(stdout);
+        printf("[ DEVICE   ] \t# %d %s.\n", devId, dev.name().c_str()); fflush(stdout);
+        printf("[          ] \tCompute capability: %d.%d\n", dev.majorVersion(), dev.minorVersion()); fflush(stdout);
+        printf("[          ] \tMulti Processor Count:  %d\n", dev.multiProcessorCount()); fflush(stdout);
+        printf("[          ] \tTotal memory: %d Mb\n", static_cast<int>(static_cast<int>(dev.totalMemory() / 1024.0) / 1024.0)); fflush(stdout);
+        printf("[          ] \tFree  memory: %d Mb\n", static_cast<int>(static_cast<int>(dev.freeMemory()  / 1024.0) / 1024.0)); fflush(stdout);
+        if (!dev.isCompatible())
+            printf("[ GPU INFO ] \tThis device is NOT compatible with current GPU module build\n");
+        printf("[----------]\n"); fflush(stdout);
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    printOsInfo();
+    printCudaInfo();
+    // "nv_perf_test" is presumably the regression-data name; TestBase and gtest then receive the command line.
+    perf::Regression::Init("nv_perf_test");
+    perf::TestBase::Init(argc, argv);
+    testing::InitGoogleTest(&argc, argv);
+
+    return RUN_ALL_TESTS();
+}
+// Fixture typedef helpers: DEF_PARAM_TEST packs its parameter types into a std::tr1::tuple, DEF_PARAM_TEST_1 uses one type as-is.
+#define DEF_PARAM_TEST(name, ...) typedef ::perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > name
+#define DEF_PARAM_TEST_1(name, param_type) typedef ::perf::TestBaseWithParam< param_type > name
+
+//////////////////////////////////////////////////////////
+// HoughLinesP
+
+DEF_PARAM_TEST_1(Image, std::string);
+
+PERF_TEST_P(Image, HoughLinesP, testing::Values(std::string("im1_1280x800.jpg")))
+{
+    // Times cv::gpu::HoughLinesP against cv::HoughLinesP; the file name parameter is passed to cv::imread as-is.
+    declare.time(30.0);
+
+    const float rho = 1.f;
+    const float theta = 1.f;
+    const int threshold = 40; // only the CPU call below takes a threshold
+    const int minLineLenght = 20;
+    const int maxLineGap = 5;
+
+    cv::Mat image = cv::imread(GetParam(), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(image.empty()) << "Unable to load source image [" << GetParam() << "]";
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat d_image(image); // NOTE(review): uploads the raw image, while the CPU branch runs on a Canny mask - confirm
+        cv::gpu::GpuMat d_lines;
+        cv::gpu::HoughLinesBuf d_buf;
+
+        cv::gpu::HoughLinesP(d_image, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap);
+
+        TEST_CYCLE()
+        {
+            cv::gpu::HoughLinesP(d_image, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap);
+        }
+    }
+    else
+    {
+        cv::Mat mask;
+        cv::Canny(image, mask, 50, 100);
+
+        std::vector<cv::Vec4i> lines;
+        cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLenght, maxLineGap);
+
+        TEST_CYCLE()
+        {
+            cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLenght, maxLineGap);
+        }
+    }
+
+    SANITY_CHECK(0); // passes the literal 0, not a result of either branch
+}
+
+//////////////////////////////////////////////////////////
+// GoodFeaturesToTrack: times cv::gpu::GoodFeaturesToTrackDetector_GPU against cv::goodFeaturesToTrack
+// for CV_8U and CV_16U input; the CPU branch fails for any depth other than CV_8U.
+DEF_PARAM_TEST(Image_Depth, std::string, perf::MatDepth);
+
+PERF_TEST_P(Image_Depth, GoodFeaturesToTrack,
+                testing::Combine(
+                testing::Values(std::string("im1_1280x800.jpg")),
+                testing::Values(CV_8U, CV_16U)
+                ))
+{
+    declare.time(60);
+
+    const std::string fileName = std::tr1::get<0>(GetParam());
+    const int depth = std::tr1::get<1>(GetParam());
+
+    const int maxCorners = 5000;
+    const double qualityLevel = 0.05;
+    const int minDistance = 5;
+    const int blockSize = 3;
+    const bool useHarrisDetector = true;
+    const double k = 0.05;
+
+    cv::Mat src = cv::imread(fileName, cv::IMREAD_GRAYSCALE);
+    if (src.empty())
+        FAIL() << "Unable to load source image [" << fileName << "]";
+
+    if (depth != CV_8U)
+        src.convertTo(src, depth); // no scale factor is given, only the depth changes
+
+    cv::Mat mask(src.size(), CV_8UC1, cv::Scalar::all(1));
+    mask(cv::Rect(0, 0, 100, 100)).setTo(cv::Scalar::all(0)); // zero the 100x100 region at the origin
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GoodFeaturesToTrackDetector_GPU d_detector(maxCorners, qualityLevel, minDistance, blockSize, useHarrisDetector, k);
+
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_mask(mask);
+        cv::gpu::GpuMat d_pts;
+
+        d_detector(d_src, d_pts, d_mask); // one call outside the measured loop
+
+        TEST_CYCLE()
+        {
+            d_detector(d_src, d_pts, d_mask);
+        }
+    }
+    else
+    {
+        if (depth != CV_8U)
+            FAIL() << "Unsupported depth";
+
+        cv::Mat pts;
+
+        cv::goodFeaturesToTrack(src, pts, maxCorners, qualityLevel, minDistance, mask, blockSize, useHarrisDetector, k);
+
+        TEST_CYCLE()
+        {
+            cv::goodFeaturesToTrack(src, pts, maxCorners, qualityLevel, minDistance, mask, blockSize, useHarrisDetector, k);
+        }
+    }
+
+    SANITY_CHECK(0); // passes the literal 0, not a result of either branch
+}
+
+//////////////////////////////////////////////////////////
+// OpticalFlowPyrLKSparse: times cv::gpu::PyrLKOpticalFlow::sparse against cv::calcOpticalFlowPyrLK on corners
+// found by cv::goodFeaturesToTrack in the first image. Parameters: image pair, depth, gray/colour source.
+typedef std::pair<std::string, std::string> string_pair;
+
+DEF_PARAM_TEST(ImagePair_Depth_GraySource, string_pair, perf::MatDepth, bool);
+
+PERF_TEST_P(ImagePair_Depth_GraySource, OpticalFlowPyrLKSparse,
+                testing::Combine(
+                    testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")),
+                    testing::Values(CV_8U, CV_16U),
+                    testing::Bool()
+                    ))
+{
+    declare.time(60);
+
+    const string_pair fileNames = std::tr1::get<0>(GetParam());
+    const int depth = std::tr1::get<1>(GetParam());
+    const bool graySource = std::tr1::get<2>(GetParam());
+
+    // PyrLK params
+    const cv::Size winSize(15, 15);
+    const int maxLevel = 5;
+    const cv::TermCriteria criteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, 30, 0.01);
+
+    // GoodFeaturesToTrack params
+    const int maxCorners = 5000;
+    const double qualityLevel = 0.05;
+    const int minDistance = 5;
+    const int blockSize = 3;
+    const bool useHarrisDetector = true;
+    const double k = 0.05;
+
+    cv::Mat src1 = cv::imread(fileNames.first, graySource ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
+    if (src1.empty())
+        FAIL() << "Unable to load source image [" << fileNames.first << "]";
+
+    cv::Mat src2 = cv::imread(fileNames.second, graySource ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
+    if (src2.empty())
+        FAIL() << "Unable to load source image [" << fileNames.second << "]";
+
+    cv::Mat gray_src; // corner detection always runs on a gray version of the first image
+    if (graySource)
+        gray_src = src1;
+    else
+        cv::cvtColor(src1, gray_src, cv::COLOR_BGR2GRAY);
+
+    cv::Mat pts;
+    cv::goodFeaturesToTrack(gray_src, pts, maxCorners, qualityLevel, minDistance, cv::noArray(), blockSize, useHarrisDetector, k);
+
+    if (depth != CV_8U) // the depth conversion happens only after the corners were detected
+    {
+        src1.convertTo(src1, depth);
+        src2.convertTo(src2, depth);
+    }
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat d_src1(src1);
+        cv::gpu::GpuMat d_src2(src2);
+        cv::gpu::GpuMat d_pts(pts.reshape(2, 1)); // uploaded as a single row with 2 channels
+        cv::gpu::GpuMat d_nextPts;
+        cv::gpu::GpuMat d_status;
+
+        cv::gpu::PyrLKOpticalFlow d_pyrLK;
+        d_pyrLK.winSize = winSize;
+        d_pyrLK.maxLevel = maxLevel;
+        d_pyrLK.iters = criteria.maxCount; // only the iteration count of `criteria` is forwarded here
+        d_pyrLK.useInitialFlow = false;
+
+        d_pyrLK.sparse(d_src1, d_src2, d_pts, d_nextPts, d_status);
+
+        TEST_CYCLE()
+        {
+            d_pyrLK.sparse(d_src1, d_src2, d_pts, d_nextPts, d_status);
+        }
+    }
+    else
+    {
+        if (depth != CV_8U)
+            FAIL() << "Unsupported depth";
+
+        cv::Mat nextPts;
+        cv::Mat status;
+
+        cv::calcOpticalFlowPyrLK(src1, src2, pts, nextPts, status, cv::noArray(), winSize, maxLevel, criteria);
+
+        TEST_CYCLE()
+        {
+            cv::calcOpticalFlowPyrLK(src1, src2, pts, nextPts, status, cv::noArray(), winSize, maxLevel, criteria);
+        }
+    }
+
+    SANITY_CHECK(0); // passes the literal 0, not a result of either branch
+}
+
+//////////////////////////////////////////////////////////
+// OpticalFlowFarneback: times cv::gpu::FarnebackOpticalFlow against cv::calcOpticalFlowFarneback with
+// OPTFLOW_USE_INITIAL_FLOW and zero-filled flow buffers; both branches are measured with TEST_CYCLE_N(10).
+DEF_PARAM_TEST(ImagePair_Depth, string_pair, perf::MatDepth);
+
+PERF_TEST_P(ImagePair_Depth, OpticalFlowFarneback,
+                testing::Combine(
+                    testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")),
+                    testing::Values(CV_8U, CV_16U)
+                    ))
+{
+    declare.time(500);
+
+    const string_pair fileNames = std::tr1::get<0>(GetParam());
+    const int depth = std::tr1::get<1>(GetParam());
+
+    const double pyrScale = 0.5;
+    const int numLevels = 6;
+    const int winSize = 7;
+    const int numIters = 15;
+    const int polyN = 7;
+    const double polySigma = 1.5;
+    const int flags = cv::OPTFLOW_USE_INITIAL_FLOW;
+
+    cv::Mat src1 = cv::imread(fileNames.first, cv::IMREAD_GRAYSCALE);
+    if (src1.empty())
+        FAIL() << "Unable to load source image [" << fileNames.first << "]";
+
+    cv::Mat src2 = cv::imread(fileNames.second, cv::IMREAD_GRAYSCALE);
+    if (src2.empty())
+        FAIL() << "Unable to load source image [" << fileNames.second << "]";
+
+    if (depth != CV_8U)
+    {
+        src1.convertTo(src1, depth);
+        src2.convertTo(src2, depth);
+    }
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat d_src1(src1);
+        cv::gpu::GpuMat d_src2(src2);
+        cv::gpu::GpuMat d_u(src1.size(), CV_32FC1, cv::Scalar::all(0)); // zero-filled before the first call
+        cv::gpu::GpuMat d_v(src1.size(), CV_32FC1, cv::Scalar::all(0));
+
+        cv::gpu::FarnebackOpticalFlow d_farneback;
+        d_farneback.pyrScale = pyrScale;
+        d_farneback.numLevels = numLevels;
+        d_farneback.winSize = winSize;
+        d_farneback.numIters = numIters;
+        d_farneback.polyN = polyN;
+        d_farneback.polySigma = polySigma;
+        d_farneback.flags = flags;
+
+        d_farneback(d_src1, d_src2, d_u, d_v); // d_u/d_v are passed again to every later call
+
+        TEST_CYCLE_N(10)
+        {
+            d_farneback(d_src1, d_src2, d_u, d_v);
+        }
+    }
+    else
+    {
+        if (depth != CV_8U)
+            FAIL() << "Unsupported depth";
+
+        cv::Mat flow(src1.size(), CV_32FC2, cv::Scalar::all(0)); // single 2-channel flow matrix, zero-filled
+
+        cv::calcOpticalFlowFarneback(src1, src2, flow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags);
+
+        TEST_CYCLE_N(10)
+        {
+            cv::calcOpticalFlowFarneback(src1, src2, flow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags);
+        }
+    }
+
+    SANITY_CHECK(0); // passes the literal 0, not a result of either branch
+}
+
+//////////////////////////////////////////////////////////
+// OpticalFlowBM
+// cv::Mat front end for cvCalcOpticalFlowBM; (re)creates velx and vely as CV_32FC1.
+void calcOpticalFlowBM(const cv::Mat& prev, const cv::Mat& curr,
+                       cv::Size bSize, cv::Size shiftSize, cv::Size maxRange, int usePrevious,
+                       cv::Mat& velx, cv::Mat& vely)
+{
+    // Output size derived from the current frame, the block size and the shift step.
+    const int flowCols = (curr.cols - bSize.width + shiftSize.width) / shiftSize.width;
+    const int flowRows = (curr.rows - bSize.height + shiftSize.height) / shiftSize.height;
+    const cv::Size flowSize(flowCols, flowRows);
+    velx.create(flowSize, CV_32FC1);
+    vely.create(flowSize, CV_32FC1);
+
+    // cvCalcOpticalFlowBM takes CvMat pointers, so build CvMat values from the cv::Mat arguments.
+    CvMat prevHdr = prev, currHdr = curr;
+    CvMat velxHdr = velx, velyHdr = vely;
+
+    cvCalcOpticalFlowBM(&prevHdr, &currHdr, bSize, shiftSize, maxRange, usePrevious, &velxHdr, &velyHdr);
+}
+
+DEF_PARAM_TEST(ImagePair_BlockSize_ShiftSize_MaxRange, string_pair, cv::Size, cv::Size, cv::Size);
+// OpticalFlowBM: times cv::gpu::calcOpticalFlowBM against the calcOpticalFlowBM helper above; all sizes come from the test parameters.
+PERF_TEST_P(ImagePair_BlockSize_ShiftSize_MaxRange, OpticalFlowBM,
+                testing::Combine(
+                    testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")),
+                    testing::Values(cv::Size(16, 16)),
+                    testing::Values(cv::Size(2, 2)),
+                    testing::Values(cv::Size(16, 16))
+                    ))
+{
+    declare.time(3000);
+
+    const string_pair fileNames = std::tr1::get<0>(GetParam());
+    const cv::Size block_size = std::tr1::get<1>(GetParam());
+    const cv::Size shift_size = std::tr1::get<2>(GetParam());
+    const cv::Size max_range = std::tr1::get<3>(GetParam());
+
+    cv::Mat src1 = cv::imread(fileNames.first, cv::IMREAD_GRAYSCALE);
+    if (src1.empty())
+        FAIL() << "Unable to load source image [" << fileNames.first << "]";
+
+    cv::Mat src2 = cv::imread(fileNames.second, cv::IMREAD_GRAYSCALE);
+    if (src2.empty())
+        FAIL() << "Unable to load source image [" << fileNames.second << "]";
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat d_src1(src1);
+        cv::gpu::GpuMat d_src2(src2);
+        cv::gpu::GpuMat d_velx, d_vely, buf; // buf is handed to every call, so the same object is reused across iterations
+
+        cv::gpu::calcOpticalFlowBM(d_src1, d_src2, block_size, shift_size, max_range, false, d_velx, d_vely, buf);
+
+        TEST_CYCLE_N(10)
+        {
+            cv::gpu::calcOpticalFlowBM(d_src1, d_src2, block_size, shift_size, max_range, false, d_velx, d_vely, buf);
+        }
+    }
+    else
+    {
+        cv::Mat velx, vely;
+
+        calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely); // false is the helper's usePrevious argument
+
+        TEST_CYCLE_N(10)
+        {
+            calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely);
+        }
+    }
+
+    SANITY_CHECK(0); // passes the literal 0, not a result of either branch
+}
+// FastOpticalFlowBM: times cv::gpu::FastOpticalFlowBM; the CPU branch again times the calcOpticalFlowBM helper.
+PERF_TEST_P(ImagePair_BlockSize_ShiftSize_MaxRange, FastOpticalFlowBM,
+                testing::Combine(
+                    testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")),
+                    testing::Values(cv::Size(16, 16)),
+                    testing::Values(cv::Size(1, 1)),
+                    testing::Values(cv::Size(16, 16))
+                    ))
+{
+    declare.time(3000);
+
+    const string_pair fileNames = std::tr1::get<0>(GetParam());
+    const cv::Size block_size = std::tr1::get<1>(GetParam());
+    const cv::Size shift_size = std::tr1::get<2>(GetParam()); // used by the CPU branch only
+    const cv::Size max_range = std::tr1::get<3>(GetParam());
+
+    cv::Mat src1 = cv::imread(fileNames.first, cv::IMREAD_GRAYSCALE);
+    if (src1.empty())
+        FAIL() << "Unable to load source image [" << fileNames.first << "]";
+
+    cv::Mat src2 = cv::imread(fileNames.second, cv::IMREAD_GRAYSCALE);
+    if (src2.empty())
+        FAIL() << "Unable to load source image [" << fileNames.second << "]";
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat d_src1(src1);
+        cv::gpu::GpuMat d_src2(src2);
+        cv::gpu::GpuMat d_velx, d_vely;
+
+        cv::gpu::FastOpticalFlowBM fastBM;
+
+        fastBM(d_src1, d_src2, d_velx, d_vely, max_range.width, block_size.width); // only the widths of the two sizes are passed
+
+        TEST_CYCLE_N(10)
+        {
+            fastBM(d_src1, d_src2, d_velx, d_vely, max_range.width, block_size.width);
+        }
+    }
+    else
+    {
+        cv::Mat velx, vely;
+
+        calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely);
+
+        TEST_CYCLE_N(10)
+        {
+            calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely);
+        }
+    }
+
+    SANITY_CHECK(0); // passes the literal 0, not a result of either branch
+}
diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index 242febded9..7e0aaab680 100644
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -59,6 +59,8 @@ void cv::gpu::magnitudeSqr(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { thr
 void cv::gpu::phase(const GpuMat&, const GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); }
 void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); }
 void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); }
+void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&) { throw_nogpu(); }
+void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
 
 #else /* !defined (HAVE_CUDA) */
 
@@ -529,4 +531,47 @@ void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat&
     polarToCart_caller(magnitude, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream));
 }
 
+////////////////////////////////////////////////////////////////////////
+// normalize
+// Convenience overload: creates local scratch matrices and forwards to the overload that takes caller-supplied buffers.
+void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask)
+{
+    GpuMat normScratch;
+    GpuMat convertScratch;
+    normalize(src, dst, a, b, norm_type, dtype, mask, normScratch, convertScratch);
+}
+// Rescales src into dst: NORM_MINMAX maps the source range onto [min(a,b), max(a,b)]; NORM_L1/L2/INF scale by a / norm(src).
+void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf)
+{
+    double alpha = 1, beta = 0; // handed to convertTo() below as scale and shift
+    if (norm_type == NORM_MINMAX)
+    {
+        // A source range that does not exceed epsilon yields alpha 0, which leaves beta equal to the lower bound.
+        double srcLo = 0, srcHi = 0;
+        const double dstLo = std::min(a, b), dstHi = std::max(a, b);
+        minMax(src, &srcLo, &srcHi, mask, norm_buf);
+        const double srcSpan = srcHi - srcLo;
+        alpha = (dstHi - dstLo) * (srcSpan > numeric_limits<double>::epsilon() ? 1.0 / srcSpan : 0.0);
+        beta = dstLo - srcLo * alpha;
+    }
+    else if (norm_type == NORM_L2 || norm_type == NORM_L1 || norm_type == NORM_INF)
+    {
+        const double srcNorm = norm(src, norm_type, mask, norm_buf);
+        alpha = srcNorm > numeric_limits<double>::epsilon() ? a / srcNorm : 0.0; // beta stays 0
+    }
+    else
+    {
+        CV_Error(CV_StsBadArg, "Unknown/unsupported norm type");
+    }
+
+    if (mask.empty())
+        src.convertTo(dst, dtype, alpha, beta);
+    else
+    {
+        // Convert everything into cvt_buf first, then copy to dst through the mask.
+        src.convertTo(cvt_buf, dtype, alpha, beta);
+        cvt_buf.copyTo(dst, mask);
+    }
+}
+
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu
index 4b52cc7dd3..27fb61ff70 100644
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -1954,6 +1954,226 @@ namespace arithm
     template void cmpMatLe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 }
 
+//////////////////////////////////////////////////////////////////////////////////////
+// cmpScalar
+// Unary functors that compare each channel of a source value with a stored constant and return one uchar per channel.
+namespace arithm
+{
+#define TYPE_VEC(type, cn) typename TypeVec<type, cn>::vec_type
+    // TYPE_VEC(type, cn) names the cn-channel vector type of `type`; the macro is undefined again at the end of this block.
+    template <class Op, typename T, int cn> struct CmpScalar;
+    template <class Op, typename T>
+    struct CmpScalar<Op, T, 1> : unary_function<T, uchar>
+    {
+        const T val;
+
+        __host__ explicit CmpScalar(T val_) : val(val_) {}
+
+        __device__ __forceinline__ uchar operator()(T src) const
+        {
+            Cmp<Op, T> op;
+            return op(src, val);
+        }
+    };
+    template <class Op, typename T>
+    struct CmpScalar<Op, T, 2> : unary_function<TYPE_VEC(T, 2), TYPE_VEC(uchar, 2)>
+    {
+        const TYPE_VEC(T, 2) val;
+
+        __host__ explicit CmpScalar(TYPE_VEC(T, 2) val_) : val(val_) {}
+
+        __device__ __forceinline__ TYPE_VEC(uchar, 2) operator()(const TYPE_VEC(T, 2) & src) const
+        {
+            Cmp<Op, T> op; // the same comparison is applied to x and y independently
+            return VecTraits<TYPE_VEC(uchar, 2)>::make(op(src.x, val.x), op(src.y, val.y));
+        }
+    };
+    template <class Op, typename T>
+    struct CmpScalar<Op, T, 3> : unary_function<TYPE_VEC(T, 3), TYPE_VEC(uchar, 3)>
+    {
+        const TYPE_VEC(T, 3) val;
+
+        __host__ explicit CmpScalar(TYPE_VEC(T, 3) val_) : val(val_) {}
+
+        __device__ __forceinline__ TYPE_VEC(uchar, 3) operator()(const TYPE_VEC(T, 3) & src) const
+        {
+            Cmp<Op, T> op; // the same comparison is applied to x, y and z independently
+            return VecTraits<TYPE_VEC(uchar, 3)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z));
+        }
+    };
+    template <class Op, typename T>
+    struct CmpScalar<Op, T, 4> : unary_function<TYPE_VEC(T, 4), TYPE_VEC(uchar, 4)>
+    {
+        const TYPE_VEC(T, 4) val;
+
+        __host__ explicit CmpScalar(TYPE_VEC(T, 4) val_) : val(val_) {}
+
+        __device__ __forceinline__ TYPE_VEC(uchar, 4) operator()(const TYPE_VEC(T, 4) & src) const
+        {
+            Cmp<Op, T> op; // the same comparison is applied to x, y, z and w independently
+            return VecTraits<TYPE_VEC(uchar, 4)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z), op(src.w, val.w));
+        }
+    };
+
+#undef TYPE_VEC
+}
+
+namespace cv { namespace gpu { namespace device
+{
+    template <class Op, typename T> struct TransformFunctorTraits< arithm::CmpScalar<Op, T, 1> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
+    {   // Only the single-channel CmpScalar is specialised here; its traits derive from ArithmFuncTraits keyed on sizeof(T) and sizeof(uchar).
+    };
+}}}
+
+namespace arithm
+{
+    template <template <typename> class Op, typename T, int cn>
+    void cmpScalar(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream)
+    {
+        typedef typename TypeVec<T, cn>::vec_type src_t;
+        typedef typename TypeVec<uchar, cn>::vec_type dst_t;
+        // All four scalar components are cast to T; VecTraits<src_t>::make then builds the cn-channel constant from the array.
+        T sval[] = {static_cast<T>(val[0]), static_cast<T>(val[1]), static_cast<T>(val[2]), static_cast<T>(val[3])};
+        src_t val1 = VecTraits<src_t>::make(sval);
+        // src and dst arrive byte-typed and are cast to cn-channel element types before transform() runs the functor with WithOutMask().
+        CmpScalar<Op<T>, T, cn> op(val1);
+        transform((PtrStepSz<src_t>) src, (PtrStepSz<dst_t>) dst, op, WithOutMask(), stream);
+    }
+
+    template <typename T> void cmpScalarEq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
+    {
+        typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
+        static const func_t funcs[] =
+        {
+            0, // no implementation for cn == 0; the slot keeps the table directly indexable by cn
+            cmpScalar<equal_to, T, 1>,
+            cmpScalar<equal_to, T, 2>,
+            cmpScalar<equal_to, T, 3>,
+            cmpScalar<equal_to, T, 4>
+        };
+        // Pick the equal_to instantiation for this channel count. NOTE(review): cn is not range-checked here - presumably validated by the caller.
+        funcs[cn](src, val, dst, stream);
+    }
+    template <typename T> void cmpScalarNe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
+    {
+        typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
+        static const func_t funcs[] =
+        {
+            0, // no implementation for cn == 0; the slot keeps the table directly indexable by cn
+            cmpScalar<not_equal_to, T, 1>,
+            cmpScalar<not_equal_to, T, 2>,
+            cmpScalar<not_equal_to, T, 3>,
+            cmpScalar<not_equal_to, T, 4>
+        };
+        // Pick the not_equal_to instantiation for this channel count. NOTE(review): cn is not range-checked here - presumably validated by the caller.
+        funcs[cn](src, val, dst, stream);
+    }
+    template <typename T> void cmpScalarLt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
+    {
+        typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
+        static const func_t funcs[] =
+        {
+            0, // no implementation for cn == 0; the slot keeps the table directly indexable by cn
+            cmpScalar<less, T, 1>,
+            cmpScalar<less, T, 2>,
+            cmpScalar<less, T, 3>,
+            cmpScalar<less, T, 4>
+        };
+        // Pick the less instantiation for this channel count. NOTE(review): cn is not range-checked here - presumably validated by the caller.
+        funcs[cn](src, val, dst, stream);
+    }
+    template <typename T> void cmpScalarLe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
+    {
+        typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
+        static const func_t funcs[] =
+        {
+            0, // no implementation for cn == 0; the slot keeps the table directly indexable by cn
+            cmpScalar<less_equal, T, 1>,
+            cmpScalar<less_equal, T, 2>,
+            cmpScalar<less_equal, T, 3>,
+            cmpScalar<less_equal, T, 4>
+        };
+        // Pick the less_equal instantiation for this channel count. NOTE(review): cn is not range-checked here - presumably validated by the caller.
+        funcs[cn](src, val, dst, stream);
+    }
+    template <typename T> void cmpScalarGt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
+    {
+        typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
+        static const func_t funcs[] =
+        {
+            0, // no implementation for cn == 0; the slot keeps the table directly indexable by cn
+            cmpScalar<greater, T, 1>,
+            cmpScalar<greater, T, 2>,
+            cmpScalar<greater, T, 3>,
+            cmpScalar<greater, T, 4>
+        };
+        // Pick the greater instantiation for this channel count. NOTE(review): cn is not range-checked here - presumably validated by the caller.
+        funcs[cn](src, val, dst, stream);
+    }
+    template <typename T> void cmpScalarGe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
+    {
+        typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
+        static const func_t funcs[] =
+        {
+            0, // no implementation for cn == 0; the slot keeps the table directly indexable by cn
+            cmpScalar<greater_equal, T, 1>,
+            cmpScalar<greater_equal, T, 2>,
+            cmpScalar<greater_equal, T, 3>,
+            cmpScalar<greater_equal, T, 4>
+        };
+        // Pick the greater_equal instantiation for this channel count. NOTE(review): cn is not range-checked here - presumably validated by the caller.
+        funcs[cn](src, val, dst, stream);
+    }
+    // Explicit instantiations of cmpScalarEq for uchar, schar, ushort, short, int, float and double.
+    template void cmpScalarEq<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarEq<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarEq<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarEq<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarEq<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarEq<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarEq<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    // Explicit instantiations of cmpScalarNe for the same seven element types.
+    template void cmpScalarNe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarNe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarNe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarNe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarNe<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarNe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarNe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    // Explicit instantiations of cmpScalarLt for the same seven element types.
+    template void cmpScalarLt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLt<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    // Explicit instantiations of cmpScalarLe for the same seven element types.
+    template void cmpScalarLe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLe<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    // Explicit instantiations of cmpScalarGt for the same seven element types.
+    template void cmpScalarGt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGt<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    // Explicit instantiations of cmpScalarGe for the same seven element types.
+    template void cmpScalarGe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGe<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+}
+
 //////////////////////////////////////////////////////////////////////////////////////
 // bitMat
 
diff --git a/modules/gpu/src/cuda/hough.cu b/modules/gpu/src/cuda/hough.cu
index 8e35aa858e..695a47def4 100644
--- a/modules/gpu/src/cuda/hough.cu
+++ b/modules/gpu/src/cuda/hough.cu
@@ -293,6 +293,201 @@ namespace cv { namespace gpu { namespace device
             return totalCount;
         }
 
+        ////////////////////////////////////////////////////////////////////////
+        // houghLinesProbabilistic
+
+        texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_mask(false, cudaFilterModePoint, cudaAddressModeClamp);
+        // Each thread inspects one interior accumulator cell; for a qualifying peak it walks the line through tex_mask and appends the segments found to out.
+        __global__ void houghLinesProbabilistic(const PtrStepSzi accum,                   // votes, read at (n + 1, r + 1) and its eight neighbours
+                                                int4* out, const int maxSize,             // output segments (x0, y0, x1, y1) and capacity of out
+                                                const float rho, const float theta,       // distance step and angle step of the accumulator
+                                                const int lineGap, const int lineLength,  // max run of mask misses inside a segment; min votes and min x/y extent
+                                                const int rows, const int cols)           // image size used for border clipping and bounds checks
+        {
+            const int r = blockIdx.x * blockDim.x + threadIdx.x;
+            const int n = blockIdx.y * blockDim.y + threadIdx.y;
+            // (r, n) address the accumulator interior; the outer one-cell border is only read as a neighbour.
+            if (r >= accum.cols - 2 || n >= accum.rows - 2)
+                return;
+
+            const int curVotes = accum(n + 1, r + 1);
+            // Keep only cells with at least lineLength votes that strictly exceed all eight neighbours.
+            if (curVotes >= lineLength &&
+                curVotes > accum(n, r) &&
+                curVotes > accum(n, r + 1) &&
+                curVotes > accum(n, r + 2) &&
+                curVotes > accum(n + 1, r) &&
+                curVotes > accum(n + 1, r + 2) &&
+                curVotes > accum(n + 2, r) &&
+                curVotes > accum(n + 2, r + 1) &&
+                curVotes > accum(n + 2, r + 2))
+            {
+                const float radius = (r - (accum.cols - 2 - 1) * 0.5f) * rho;
+                const float angle = n * theta;
+
+                float cosa;
+                float sina;
+                sincosf(angle, &sina, &cosa);
+                // p0 is the point of the line closest to (0, 0); dir is the unit direction along the line.
+                float2 p0 = make_float2(cosa * radius, sina * radius);
+                float2 dir = make_float2(-sina, cosa);
+                // pb[0..3]: intersections with x = 0, x = cols - 1, y = 0 and y = rows - 1; (-1, -1) means "not computed".
+                float2 pb[4] = {make_float2(-1, -1), make_float2(-1, -1), make_float2(-1, -1), make_float2(-1, -1)};
+                float a;
+
+                if (dir.x != 0)
+                {
+                    a = -p0.x / dir.x;
+                    pb[0].x = 0;
+                    pb[0].y = p0.y + a * dir.y;
+
+                    a = (cols - 1 - p0.x) / dir.x;
+                    pb[1].x = cols - 1;
+                    pb[1].y = p0.y + a * dir.y;
+                }
+                if (dir.y != 0)
+                {
+                    a = -p0.y / dir.y;
+                    pb[2].x = p0.x + a * dir.x;
+                    pb[2].y = 0;
+
+                    a = (rows - 1 - p0.y) / dir.y;
+                    pb[3].x = p0.x + a * dir.x;
+                    pb[3].y = rows - 1;
+                }
+                // Start from the first border intersection that lies inside the image, with dir flipped to point inwards.
+                if (pb[0].x == 0 && (pb[0].y >= 0 && pb[0].y < rows))
+                {
+                    p0 = pb[0];
+                    if (dir.x < 0)
+                        dir = -dir;
+                }
+                else if (pb[1].x == cols - 1 && (pb[1].y >= 0 && pb[1].y < rows))
+                {
+                    p0 = pb[1];
+                    if (dir.x > 0)
+                        dir = -dir;
+                }
+                else if (pb[2].y == 0 && (pb[2].x >= 0 && pb[2].x < cols))
+                {
+                    p0 = pb[2];
+                    if (dir.y < 0)
+                        dir = -dir;
+                }
+                else if (pb[3].y == rows - 1 && (pb[3].x >= 0 && pb[3].x < cols))
+                {
+                    p0 = pb[3];
+                    if (dir.y > 0)
+                        dir = -dir;
+                }
+                // Scale the step so that its larger component is exactly one pixel.
+                float2 d;
+                if (::fabsf(dir.x) > ::fabsf(dir.y))
+                {
+                    d.x = dir.x > 0 ? 1 : -1;
+                    d.y = dir.y / ::fabsf(dir.x);
+                }
+                else
+                {
+                    d.x = dir.x / ::fabsf(dir.y);
+                    d.y = dir.y > 0 ? 1 : -1;
+                }
+
+                float2 line_end[2];
+                int gap; // reset to 0 on every mask hit, so it is always initialised before ++gap runs
+                bool inLine = false;
+                // Nothing to trace when the starting point is outside the image.
+                float2 p1 = p0;
+                if (p1.x < 0 || p1.x >= cols || p1.y < 0 || p1.y >= rows)
+                    return;
+                // Walk along the line; mask hits separated by at most lineGap misses belong to one segment.
+                for (;;)
+                {
+                    if (tex2D(tex_mask, p1.x, p1.y))
+                    {
+                        gap = 0;
+
+                        if (!inLine)
+                        {
+                            line_end[0] = p1;
+                            line_end[1] = p1;
+                            inLine = true;
+                        }
+                        else
+                        {
+                            line_end[1] = p1;
+                        }
+                    }
+                    else if (inLine)
+                    {
+                        if (++gap > lineGap)
+                        {
+                            bool good_line = ::abs(line_end[1].x - line_end[0].x) >= lineLength ||
+                                             ::abs(line_end[1].y - line_end[0].y) >= lineLength;
+
+                            if (good_line)
+                            {
+                                const int ind = ::atomicAdd(&g_counter, 1); // counts every accepted segment, even beyond maxSize
+                                if (ind < maxSize)
+                                    out[ind] = make_int4(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);
+                            }
+
+                            gap = 0;
+                            inLine = false;
+                        }
+                    }
+                    // Advance one step; on leaving the image, flush the open segment (if any) and stop.
+                    p1 = p1 + d;
+                    if (p1.x < 0 || p1.x >= cols || p1.y < 0 || p1.y >= rows)
+                    {
+                        if (inLine)
+                        {
+                            bool good_line = ::abs(line_end[1].x - line_end[0].x) >= lineLength ||
+                                             ::abs(line_end[1].y - line_end[0].y) >= lineLength;
+
+                            if (good_line)
+                            {
+                                const int ind = ::atomicAdd(&g_counter, 1); // counts every accepted segment, even beyond maxSize
+                                if (ind < maxSize)
+                                    out[ind] = make_int4(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);
+                            }
+
+                        }
+                        break;
+                    }
+                }
+            }
+        }
+
+        int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength)
+        {
+            // Zero the device-side segment counter (g_counter) before the kernel runs.
+            void* d_counter;
+            cudaSafeCall( cudaGetSymbolAddress(&d_counter, g_counter) );
+            cudaSafeCall( cudaMemset(d_counter, 0, sizeof(int)) );
+
+            // One thread per interior accumulator cell (the one-cell border is excluded).
+            const dim3 threads(32, 8);
+            const dim3 blocks(divUp(accum.cols - 2, threads.x), divUp(accum.rows - 2, threads.y));
+
+            // The kernel samples the mask through tex_mask.
+            bindTexture(&tex_mask, mask);
+
+            houghLinesProbabilistic<<<blocks, threads>>>(accum, out, maxSize,
+                                                         rho, theta, lineGap, lineLength,
+                                                         mask.rows, mask.cols);
+            cudaSafeCall( cudaGetLastError() );
+
+            // Wait for the kernel to finish before reading the counter back.
+            cudaSafeCall( cudaDeviceSynchronize() );
+
+            int found;
+            cudaSafeCall( cudaMemcpy(&found, d_counter, sizeof(int), cudaMemcpyDeviceToHost) );
+
+            // The counter can exceed maxSize; the kernel stores at most maxSize segments.
+            return ::min(found, maxSize);
+        }
+
         ////////////////////////////////////////////////////////////////////////
         // circlesAccumCenters
 
diff --git a/modules/gpu/src/cuda/matrix_reductions.cu b/modules/gpu/src/cuda/matrix_reductions.cu
index b48c47e6a2..745daca1db 100644
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
@@ -352,8 +352,8 @@ namespace sum
         }
     };
 
-    template <int BLOCK_SIZE, typename src_type, typename result_type, class Op>
-    __global__ void kernel(const PtrStepSz<src_type> src, result_type* result, const Op op, const int twidth, const int theight)
+    template <int BLOCK_SIZE, typename src_type, typename result_type, class Mask, class Op>
+    __global__ void kernel(const PtrStepSz<src_type> src, result_type* result, const Mask mask, const Op op, const int twidth, const int theight)
     {
         typedef typename VecTraits<src_type>::elem_type T;
         typedef typename VecTraits<result_type>::elem_type R;
@@ -375,9 +375,11 @@ namespace sum
 
             for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
             {
-                const src_type srcVal = ptr[x];
-
-                sum = sum + op(saturate_cast<result_type>(srcVal));
+                if (mask(y, x))
+                {
+                    const src_type srcVal = ptr[x];
+                    sum = sum + op(saturate_cast<result_type>(srcVal));
+                }
             }
         }
 
@@ -410,7 +412,7 @@ namespace sum
     }
 
     template <typename T, typename R, int cn, template <typename> class Op>
-    void caller(PtrStepSzb src_, void* buf_, double* out)
+    void caller(PtrStepSzb src_, void* buf_, double* out, PtrStepSzb mask)
     {
         typedef typename TypeVec<T, cn>::vec_type src_type;
         typedef typename TypeVec<R, cn>::vec_type result_type;
@@ -426,7 +428,10 @@ namespace sum
 
         Op<result_type> op;
 
-        kernel<threads_x * threads_y><<<grid, block>>>(src, buf, op, twidth, theight);
+        if (mask.data)
+            kernel<threads_x * threads_y><<<grid, block>>>(src, buf, SingleMask(mask), op, twidth, theight);
+        else
+            kernel<threads_x * threads_y><<<grid, block>>>(src, buf, WithOutMask(), op, twidth, theight);
         cudaSafeCall( cudaGetLastError() );
 
         cudaSafeCall( cudaDeviceSynchronize() );
@@ -450,88 +455,88 @@ namespace sum
     template <> struct SumType<double> { typedef double R; };
 
     template <typename T, int cn>
-    void run(PtrStepSzb src, void* buf, double* out)
+    void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
     {
         typedef typename SumType<T>::R R;
-        caller<T, R, cn, identity>(src, buf, out);
+        caller<T, R, cn, identity>(src, buf, out, mask);
     }
 
-    template void run<uchar, 1>(PtrStepSzb src, void* buf, double* out);
-    template void run<uchar, 2>(PtrStepSzb src, void* buf, double* out);
-    template void run<uchar, 3>(PtrStepSzb src, void* buf, double* out);
-    template void run<uchar, 4>(PtrStepSzb src, void* buf, double* out);
+    template void run<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void run<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void run<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void run<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 
-    template void run<schar, 1>(PtrStepSzb src, void* buf, double* out);
-    template void run<schar, 2>(PtrStepSzb src, void* buf, double* out);
-    template void run<schar, 3>(PtrStepSzb src, void* buf, double* out);
-    template void run<schar, 4>(PtrStepSzb src, void* buf, double* out);
+    template void run<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void run<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void run<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void run<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 
-    template void run<ushort, 1>(PtrStepSzb src, void* buf, double* out);
-    template void run<ushort, 2>(PtrStepSzb src, void* buf, double* out);
-    template void run<ushort, 3>(PtrStepSzb src, void* buf, double* out);
-    template void run<ushort, 4>(PtrStepSzb src, void* buf, double* out);
+    template void run<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void run<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void run<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void run<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 
-    template void run<short, 1>(PtrStepSzb src, void* buf, double* out);
-    template void run<short, 2>(PtrStepSzb src, void* buf, double* out);
-    template void run<short, 3>(PtrStepSzb src, void* buf, double* out);
-    template void run<short, 4>(PtrStepSzb src, void* buf, double* out);
+    template void run<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void run<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void run<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void run<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 
-    template void run<int, 1>(PtrStepSzb src, void* buf, double* out);
-    template void run<int, 2>(PtrStepSzb src, void* buf, double* out);
-    template void run<int, 3>(PtrStepSzb src, void* buf, double* out);
-    template void run<int, 4>(PtrStepSzb src, void* buf, double* out);
+    template void run<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void run<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void run<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void run<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 
-    template void run<float, 1>(PtrStepSzb src, void* buf, double* out);
-    template void run<float, 2>(PtrStepSzb src, void* buf, double* out);
-    template void run<float, 3>(PtrStepSzb src, void* buf, double* out);
-    template void run<float, 4>(PtrStepSzb src, void* buf, double* out);
+    template void run<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void run<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void run<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void run<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 
-    template void run<double, 1>(PtrStepSzb src, void* buf, double* out);
-    template void run<double, 2>(PtrStepSzb src, void* buf, double* out);
-    template void run<double, 3>(PtrStepSzb src, void* buf, double* out);
-    template void run<double, 4>(PtrStepSzb src, void* buf, double* out);
+    template void run<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void run<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void run<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void run<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 
     template <typename T, int cn>
-    void runAbs(PtrStepSzb src, void* buf, double* out)
+    void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
     {
         typedef typename SumType<T>::R R;
-        caller<T, R, cn, abs_func>(src, buf, out);
+        caller<T, R, cn, abs_func>(src, buf, out, mask);
     }
 
-    template void runAbs<uchar, 1>(PtrStepSzb src, void* buf, double* out);
-    template void runAbs<uchar, 2>(PtrStepSzb src, void* buf, double* out);
-    template void runAbs<uchar, 3>(PtrStepSzb src, void* buf, double* out);
-    template void runAbs<uchar, 4>(PtrStepSzb src, void* buf, double* out);
+    template void runAbs<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runAbs<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runAbs<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runAbs<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 
-    template void runAbs<schar, 1>(PtrStepSzb src, void* buf, double* out);
-    template void runAbs<schar, 2>(PtrStepSzb src, void* buf, double* out);
-    template void runAbs<schar, 3>(PtrStepSzb src, void* buf, double* out);
-    template void runAbs<schar, 4>(PtrStepSzb src, void* buf, double* out);
+    template void runAbs<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runAbs<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runAbs<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runAbs<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 
-    template void runAbs<ushort, 1>(PtrStepSzb src, void* buf, double* out);
-    template void runAbs<ushort, 2>(PtrStepSzb src, void* buf, double* out);
-    template void runAbs<ushort, 3>(PtrStepSzb src, void* buf, double* out);
-    template void runAbs<ushort, 4>(PtrStepSzb src, void* buf, double* out);
+    template void runAbs<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runAbs<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runAbs<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runAbs<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 
-    template void runAbs<short, 1>(PtrStepSzb src, void* buf, double* out);
-    template void runAbs<short, 2>(PtrStepSzb src, void* buf, double* out);
-    template void runAbs<short, 3>(PtrStepSzb src, void* buf, double* out);
-    template void runAbs<short, 4>(PtrStepSzb src, void* buf, double* out);
+    template void runAbs<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runAbs<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runAbs<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runAbs<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 
-    template void runAbs<int, 1>(PtrStepSzb src, void* buf, double* out);
-    template void runAbs<int, 2>(PtrStepSzb src, void* buf, double* out);
-    template void runAbs<int, 3>(PtrStepSzb src, void* buf, double* out);
-    template void runAbs<int, 4>(PtrStepSzb src, void* buf, double* out);
+    template void runAbs<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runAbs<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runAbs<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runAbs<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 
-    template void runAbs<float, 1>(PtrStepSzb src, void* buf, double* out);
-    template void runAbs<float, 2>(PtrStepSzb src, void* buf, double* out);
-    template void runAbs<float, 3>(PtrStepSzb src, void* buf, double* out);
-    template void runAbs<float, 4>(PtrStepSzb src, void* buf, double* out);
+    template void runAbs<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runAbs<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runAbs<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runAbs<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 
-    template void runAbs<double, 1>(PtrStepSzb src, void* buf, double* out);
-    template void runAbs<double, 2>(PtrStepSzb src, void* buf, double* out);
-    template void runAbs<double, 3>(PtrStepSzb src, void* buf, double* out);
-    template void runAbs<double, 4>(PtrStepSzb src, void* buf, double* out);
+    template void runAbs<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runAbs<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runAbs<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runAbs<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 
     template <typename T> struct Sqr : unary_function<T, T>
     {
@@ -542,45 +547,45 @@ namespace sum
     };
 
     template <typename T, int cn>
-    void runSqr(PtrStepSzb src, void* buf, double* out)
+    void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
     {
-        caller<T, double, cn, Sqr>(src, buf, out);
+        caller<T, double, cn, Sqr>(src, buf, out, mask);
     }
 
-    template void runSqr<uchar, 1>(PtrStepSzb src, void* buf, double* out);
-    template void runSqr<uchar, 2>(PtrStepSzb src, void* buf, double* out);
-    template void runSqr<uchar, 3>(PtrStepSzb src, void* buf, double* out);
-    template void runSqr<uchar, 4>(PtrStepSzb src, void* buf, double* out);
+    template void runSqr<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runSqr<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runSqr<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runSqr<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 
-    template void runSqr<schar, 1>(PtrStepSzb src, void* buf, double* out);
-    template void runSqr<schar, 2>(PtrStepSzb src, void* buf, double* out);
-    template void runSqr<schar, 3>(PtrStepSzb src, void* buf, double* out);
-    template void runSqr<schar, 4>(PtrStepSzb src, void* buf, double* out);
+    template void runSqr<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runSqr<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runSqr<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runSqr<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 
-    template void runSqr<ushort, 1>(PtrStepSzb src, void* buf, double* out);
-    template void runSqr<ushort, 2>(PtrStepSzb src, void* buf, double* out);
-    template void runSqr<ushort, 3>(PtrStepSzb src, void* buf, double* out);
-    template void runSqr<ushort, 4>(PtrStepSzb src, void* buf, double* out);
+    template void runSqr<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runSqr<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runSqr<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runSqr<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 
-    template void runSqr<short, 1>(PtrStepSzb src, void* buf, double* out);
-    template void runSqr<short, 2>(PtrStepSzb src, void* buf, double* out);
-    template void runSqr<short, 3>(PtrStepSzb src, void* buf, double* out);
-    template void runSqr<short, 4>(PtrStepSzb src, void* buf, double* out);
+    template void runSqr<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runSqr<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runSqr<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runSqr<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 
-    template void runSqr<int, 1>(PtrStepSzb src, void* buf, double* out);
-    template void runSqr<int, 2>(PtrStepSzb src, void* buf, double* out);
-    template void runSqr<int, 3>(PtrStepSzb src, void* buf, double* out);
-    template void runSqr<int, 4>(PtrStepSzb src, void* buf, double* out);
+    template void runSqr<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runSqr<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runSqr<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runSqr<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 
-    template void runSqr<float, 1>(PtrStepSzb src, void* buf, double* out);
-    template void runSqr<float, 2>(PtrStepSzb src, void* buf, double* out);
-    template void runSqr<float, 3>(PtrStepSzb src, void* buf, double* out);
-    template void runSqr<float, 4>(PtrStepSzb src, void* buf, double* out);
+    template void runSqr<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runSqr<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runSqr<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runSqr<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 
-    template void runSqr<double, 1>(PtrStepSzb src, void* buf, double* out);
-    template void runSqr<double, 2>(PtrStepSzb src, void* buf, double* out);
-    template void runSqr<double, 3>(PtrStepSzb src, void* buf, double* out);
-    template void runSqr<double, 4>(PtrStepSzb src, void* buf, double* out);
+    template void runSqr<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runSqr<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runSqr<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+    template void runSqr<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 }
 
 /////////////////////////////////////////////////////////////
diff --git a/modules/gpu/src/cuda/optflowbm.cu b/modules/gpu/src/cuda/optflowbm.cu
new file mode 100644
index 0000000000..baf8dfb362
--- /dev/null
+++ b/modules/gpu/src/cuda/optflowbm.cu
@@ -0,0 +1,414 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/limits.hpp"
+#include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
+
+using namespace cv::gpu;
+using namespace cv::gpu::device;
+
+namespace optflowbm
+{
+    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_prev(false, cudaFilterModePoint, cudaAddressModeClamp);
+    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_curr(false, cudaFilterModePoint, cudaAddressModeClamp);
+    // Sum of absolute differences between the blockSize block at (X1, Y1) in tex_prev and the one at (X2, Y2) in tex_curr.
+    __device__ int cmpBlocks(int X1, int Y1, int X2, int Y2, int2 blockSize)
+    {
+        int s = 0;
+        // (x, y) walks the block; both textures are declared with cudaAddressModeClamp.
+        for (int y = 0; y < blockSize.y; ++y)
+        {
+            for (int x = 0; x < blockSize.x; ++x)
+                s += ::abs(tex2D(tex_prev, X1 + x, Y1 + y) - tex2D(tex_curr, X2 + x, Y2 + y));
+        }
+
+        return s;
+    }
+    // Block matching: one thread per (i, j) element of velx/vely looks for the displacement whose block in tex_curr best matches the block in tex_prev.
+    __global__ void calcOptFlowBM(PtrStepSzf velx, PtrStepf vely, const int2 blockSize, const int2 shiftSize, const bool usePrevious,
+                                  const int maxX, const int maxY, const int acceptLevel, const int escapeLevel,
+                                  const short2* ss, const int ssCount)
+    {
+        const int j = blockIdx.x * blockDim.x + threadIdx.x;
+        const int i = blockIdx.y * blockDim.y + threadIdx.y;
+        // Threads that fall outside the velocity field have nothing to do.
+        if (i >= velx.rows || j >= velx.cols)
+            return;
+        // Origin of this element's block, as passed to cmpBlocks for tex_prev.
+        const int X1 = j * shiftSize.x;
+        const int Y1 = i * shiftSize.y;
+        // Starting displacement: the rounded incoming velocity when usePrevious is set, otherwise zero.
+        const int offX = usePrevious ? __float2int_rn(velx(i, j)) : 0;
+        const int offY = usePrevious ? __float2int_rn(vely(i, j)) : 0;
+
+        int X2 = X1 + offX;
+        int Y2 = Y1 + offY;
+
+        int dist = numeric_limits<int>::max();
+        // dist keeps the int maximum when the starting candidate lies outside [0, maxX] x [0, maxY].
+        if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
+            dist = cmpBlocks(X1, Y1, X2, Y2, blockSize);
+        // sumx/sumy accumulate every displacement tied for the best distance; countMin counts them.
+        int countMin = 1;
+        int sumx = offX;
+        int sumy = offY;
+
+        if (dist > acceptLevel)
+        {
+            // do brute-force search over the ssCount candidate offsets in ss, each applied on top of (offX, offY)
+            for (int k = 0; k < ssCount; ++k)
+            {
+                const short2 ssVal = ss[k];
+
+                const int dx = offX + ssVal.x;
+                const int dy = offY + ssVal.y;
+
+                X2 = X1 + dx;
+                Y2 = Y1 + dy;
+                // Candidates whose origin falls outside the valid range are skipped.
+                if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
+                {
+                    const int tmpDist = cmpBlocks(X1, Y1, X2, Y2, blockSize);
+                    if (tmpDist < acceptLevel)
+                    {
+                        sumx = dx;
+                        sumy = dy;
+                        countMin = 1;
+                        break;
+                    }
+                    // Otherwise keep the running minimum and accumulate ties for averaging.
+                    if (tmpDist < dist)
+                    {
+                        dist = tmpDist;
+                        sumx = dx;
+                        sumy = dy;
+                        countMin = 1;
+                    }
+                    else if (tmpDist == dist)
+                    {
+                        sumx += dx;
+                        sumy += dy;
+                        countMin++;
+                    }
+                }
+            }
+            // NOTE(review): the early-accept break above does not update dist, so this test sees the pre-accept value - confirm intended.
+            if (dist > escapeLevel)
+            {
+                sumx = offX;
+                sumy = offY;
+                countMin = 1;
+            }
+        }
+        // Write back the mean of the accumulated displacements.
+        velx(i, j) = static_cast<float>(sumx) / countMin;
+        vely(i, j) = static_cast<float>(sumy) / countMin;
+    }
+    // Host launcher: binds prev/curr to tex_prev/tex_curr and runs calcOptFlowBM on the given stream.
+    void calc(PtrStepSzb prev, PtrStepSzb curr, PtrStepSzf velx, PtrStepSzf vely, int2 blockSize, int2 shiftSize, bool usePrevious,
+              int maxX, int maxY, int acceptLevel, int escapeLevel, const short2* ss, int ssCount, cudaStream_t stream)
+    {
+        bindTexture(&tex_prev, prev);
+        bindTexture(&tex_curr, curr);
+        // NOTE(review): grid width comes from velx, height from vely - presumably both have the same size; confirm at the caller.
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(velx.cols, block.x), divUp(vely.rows, block.y));
+
+        calcOptFlowBM<<<grid, block, 0, stream>>>(velx, vely, blockSize, shiftSize, usePrevious,
+                                                  maxX, maxY, acceptLevel,  escapeLevel, ss, ssCount);
+        cudaSafeCall( cudaGetLastError() );
+        // Only the default stream (0) is synchronised here; for any other stream the call returns after the launch.
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+/////////////////////////////////////////////////////////
+// Fast approximate version
+
+namespace optflowbm_fast
+{
+    enum
+    {
+        CTA_SIZE = 128, // threads per block: calc() launches with dim3 block(CTA_SIZE, 1)
+        // Size of the pixel tile handled by one thread block (see FastOptFlowBM::operator()).
+        TILE_COLS = 128,
+        TILE_ROWS = 32,
+        // Step of the per-thread loops over the search window; equal to the block size.
+        STRIDE = CTA_SIZE
+    };
+    // Absolute difference of two samples, returned as int.
+    template <typename T> __device__ __forceinline__ int calcDist(T a, T b)
+    {
+        return ::abs(a - b);
+    }
+
+    template <class T> struct FastOptFlowBM
+    {
+        // Window parameters; the constructor sets each radius to the matching window size divided by two (integer division).
+        int search_radius;
+        int block_radius;
+
+        int search_window;
+        int block_window;
+        // Input images: I0 keeps its size (PtrStepSz), I1 is stored without one (PtrStep).
+        PtrStepSz<T> I0;
+        PtrStep<T> I1;
+        // Scratch storage that operator() splits into the col_sums / up_col_sums views.
+        mutable PtrStepi buffer;
+        // Stores the arguments and derives both radii from the window sizes.
+        FastOptFlowBM(int search_window_, int block_window_,
+                      PtrStepSz<T> I0_, PtrStepSz<T> I1_,
+                      PtrStepi buffer_) :
+            search_radius(search_window_ / 2), block_radius(block_window_ / 2),
+            search_window(search_window_), block_window(block_window_),
+            I0(I0_), I1(I1_),
+            buffer(buffer_)
+        {
+        }
+        // Computes from scratch, for pixel (i, j), dist_sums and the per-column col_sums of every search-window offset.
+        __device__ __forceinline__ void initSums_BruteForce(int i, int j, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
+        {
+            for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
+            {
+                dist_sums[index] = 0;
+
+                for (int tx = 0; tx < block_window; ++tx)
+                    col_sums(tx, index) = 0;
+                // (y, x): position of this candidate inside the search window.
+                int y = index / search_window;
+                int x = index - y * search_window;
+
+                int ay = i;
+                int ax = j;
+                // Centre of the candidate block in I1: (i, j) shifted by (y, x) - search_radius.
+                int by = i + y - search_radius;
+                int bx = j + x - search_radius;
+                // NOTE(review): fills 2 * block_radius + 1 columns, i.e. block_window only when it is odd; no bounds checks on I0/I1 - confirm the caller pads.
+                for (int tx = -block_radius; tx <= block_radius; ++tx)
+                {
+                    int col_sum = 0;
+                    for (int ty = -block_radius; ty <= block_radius; ++ty)
+                    {
+                        int dist = calcDist(I0(ay + ty, ax + tx), I1(by + ty, bx + tx));
+
+                        dist_sums[index] += dist;
+                        col_sum += dist;
+                    }
+
+                    col_sums(tx + block_radius, index) = col_sum;
+                }
+                // Save the column stored at slot block_window - 1 under image column j; shiftRight_UpSums reads it on the next row.
+                up_col_sums(j, index) = col_sums(block_window - 1, index);
+            }
+        }
+        // First tile row: moves the block one pixel right by recomputing the entering column and replacing slot 'first' of col_sums.
+        __device__ __forceinline__ void shiftRight_FirstRow(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
+        {
+            for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
+            {
+                int y = index / search_window;
+                int x = index - y * search_window;
+                // Column entering the block: block_radius to the right of the new centre j.
+                int ay = i;
+                int ax = j + block_radius;
+
+                int by = i + y - search_radius;
+                int bx = j + x - search_radius + block_radius;
+
+                int col_sum = 0;
+
+                for (int ty = -block_radius; ty <= block_radius; ++ty)
+                    col_sum += calcDist(I0(ay + ty, ax), I1(by + ty, bx));
+                // Replace the contribution held in slot 'first' with the new column sum.
+                dist_sums[index] += col_sum - col_sums(first, index);
+
+                col_sums(first, index) = col_sum;
+                up_col_sums(j, index) = col_sum;
+            }
+        }
+        // Later tile rows: derives the entering column sum from the value saved for image column j on the previous row.
+        __device__ __forceinline__ void shiftRight_UpSums(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
+        {
+            int ay = i;
+            int ax = j + block_radius;
+            // I0 samples just above the block (leaving the sum) and on its bottom row (entering the sum).
+            T a_up   = I0(ay - block_radius - 1, ax);
+            T a_down = I0(ay + block_radius, ax);
+
+            for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
+            {
+                int y = index / search_window;
+                int x = index - y * search_window;
+
+                int by = i + y - search_radius;
+                int bx = j + x - search_radius + block_radius;
+
+                T b_up   = I1(by - block_radius - 1, bx);
+                T b_down = I1(by + block_radius, bx);
+                // Saved column sum, plus the entering sample pair, minus the leaving one.
+                int col_sum = up_col_sums(j, index) + calcDist(a_down, b_down) - calcDist(a_up, b_up);
+
+                dist_sums[index] += col_sum  - col_sums(first, index);
+                col_sums(first, index) = col_sum;
+                up_col_sums(j, index) = col_sum;
+            }
+        }
+        // Finds the search-window offset with the smallest dist_sums entry; thread 0 writes it to velx/vely as a displacement.
+        __device__ __forceinline__ void convolve_window(int i, int j, const int* dist_sums, float& velx, float& vely) const
+        {
+            int bestDist = numeric_limits<int>::max();
+            int bestInd = -1;
+            // Each thread first scans its own share of the window (every STRIDE-th index).
+            for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
+            {
+                int curDist = dist_sums[index];
+                if (curDist < bestDist)
+                {
+                    bestDist = curDist;
+                    bestInd = index;
+                }
+            }
+            // Shared scratch handed to the block-wide reduction below.
+            __shared__ int cta_dist_buffer[CTA_SIZE];
+            __shared__ int cta_ind_buffer[CTA_SIZE];
+            // Combines the per-thread (distance, index) pairs with less<int>; presumably leaves the winner in thread 0 - see reduce.hpp.
+            reduceKeyVal<CTA_SIZE>(cta_dist_buffer, bestDist, cta_ind_buffer, bestInd, threadIdx.x, less<int>());
+
+            if (threadIdx.x == 0)
+            {
+                int y = bestInd / search_window;
+                int x = bestInd - y * search_window;
+                // Convert the window position into an offset relative to the window centre.
+                velx = x - search_radius;
+                vely = y - search_radius;
+            }
+        }
+        // Processes this thread block's tile row by row, left to right, writing one flow vector per pixel.
+        __device__ __forceinline__ void operator()(PtrStepf velx, PtrStepf vely) const
+        {
+            int tbx = blockIdx.x * TILE_COLS;
+            int tby = blockIdx.y * TILE_ROWS;
+            // Exclusive tile end, clipped to the size of I0.
+            int tex = ::min(tbx + TILE_COLS, I0.cols);
+            int tey = ::min(tby + TILE_ROWS, I0.rows);
+            // col_sums starts at buffer row I0.cols + blockIdx.x * block_window, up_col_sums at row 0; both are offset by blockIdx.y * search_window^2 columns.
+            PtrStepi col_sums;
+            col_sums.data = buffer.ptr(I0.cols + blockIdx.x * block_window) + blockIdx.y * search_window * search_window;
+            col_sums.step = buffer.step;
+
+            PtrStepi up_col_sums;
+            up_col_sums.data = buffer.data + blockIdx.y * search_window * search_window;
+            up_col_sums.step = buffer.step;
+
+            extern __shared__ int dist_sums[]; //search_window * search_window
+            // Slot of col_sums to overwrite next; advanced modulo block_window after every shift.
+            int first = 0;
+
+            for (int i = tby; i < tey; ++i)
+            {
+                for (int j = tbx; j < tex; ++j)
+                {
+                    __syncthreads();
+
+                    if (j == tbx)
+                    {
+                        initSums_BruteForce(i, j, dist_sums, col_sums, up_col_sums);
+                        first = 0;
+                    }
+                    else
+                    {
+                        if (i == tby)
+                          shiftRight_FirstRow(i, j, first, dist_sums, col_sums, up_col_sums);
+                        else
+                          shiftRight_UpSums(i, j, first, dist_sums, col_sums, up_col_sums);
+
+                        first = (first + 1) % block_window;
+                    }
+                    // All threads must finish updating dist_sums before the minimum search reads it.
+                    __syncthreads();
+
+                    convolve_window(i, j, dist_sums, velx(i, j), vely(i, j));
+                }
+            }
+        }
+
+    };
+    // Kernel entry point: forwards to FastOptFlowBM<T>::operator().
+    template<typename T> __global__ void optflowbm_fast_kernel(const FastOptFlowBM<T> fbm, PtrStepf velx, PtrStepf vely)
+    {
+        fbm(velx, vely);
+    }
+    // Computes the scratch buffer dimensions (returned through buffer_cols / buffer_rows) from the image size and the tile grid.
+    void get_buffer_size(int src_cols, int src_rows, int search_window, int block_window, int& buffer_cols, int& buffer_rows)
+    {
+        dim3 grid(divUp(src_cols, TILE_COLS), divUp(src_rows, TILE_ROWS));
+        // One search_window^2 strip per tile row; src_cols rows plus block_window rows per tile column.
+        buffer_cols = search_window * search_window * grid.y;
+        buffer_rows = src_cols + block_window * grid.x;
+    }
+    // Host launcher: builds the FastOptFlowBM functor and runs optflowbm_fast_kernel with one block per tile.
+    template <typename T>
+    void calc(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream)
+    {
+        FastOptFlowBM<T> fbm(search_window, block_window, I0, I1, buffer);
+
+        dim3 block(CTA_SIZE, 1);
+        dim3 grid(divUp(I0.cols, TILE_COLS), divUp(I0.rows, TILE_ROWS));
+        // Dynamic shared memory backs the kernel's dist_sums array: one int per search-window offset.
+        size_t smem = search_window * search_window * sizeof(int);
+
+        optflowbm_fast_kernel<<<grid, block, smem, stream>>>(fbm, velx, vely);
+        cudaSafeCall ( cudaGetLastError () );
+        // Only the default stream (0) is synchronised here.
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+    // uchar is the only element type instantiated in this file.
+    template void calc<uchar>(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream);
+}
+
+#endif // !defined CUDA_DISABLER
diff --git a/modules/gpu/src/cuda/tvl1flow.cu b/modules/gpu/src/cuda/tvl1flow.cu
new file mode 100644
index 0000000000..27694ad269
--- /dev/null
+++ b/modules/gpu/src/cuda/tvl1flow.cu
@@ -0,0 +1,332 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/border_interpolate.hpp"
+#include "opencv2/gpu/device/limits.hpp"
+
+using namespace cv::gpu;
+using namespace cv::gpu::device;
+
+////////////////////////////////////////////////////////////
+// centeredGradient
+
+namespace tvl1flow
+{
+    __global__ void centeredGradientKernel(const PtrStepSzf src, PtrStepf dx, PtrStepf dy)
+    {   // Writes the horizontal and vertical centred differences of src into dx and dy.
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        // One thread per pixel of src; threads outside the image exit.
+        if (x >= src.cols || y >= src.rows)
+            return;
+        // Half the difference of the next and previous samples; neighbour indices are clamped to the image.
+        dx(y, x) = 0.5f * (src(y, ::min(x + 1, src.cols - 1)) - src(y, ::max(x - 1, 0)));
+        dy(y, x) = 0.5f * (src(::min(y + 1, src.rows - 1), x) - src(::max(y - 1, 0), x));
+    }
+    // Host launcher: runs centeredGradientKernel over src with no explicit stream, then waits for the device.
+    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+
+        centeredGradientKernel<<<grid, block>>>(src, dx, dy);
+        cudaSafeCall( cudaGetLastError() );
+        // Synchronous by design here: the function takes no stream parameter.
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+////////////////////////////////////////////////////////////
+// warpBackward
+
+namespace tvl1flow
+{
+    static __device__ __forceinline__ float bicubicCoeff(float x_)
+    {   // Piecewise-cubic interpolation weight for a sample at distance |x_|; zero once |x_| >= 2.
+        float x = fabsf(x_);
+        if (x <= 1.0f)
+        {
+            return x * x * (1.5f * x - 2.5f) + 1.0f; // 1.5x^3 - 2.5x^2 + 1
+        }
+        else if (x < 2.0f)
+        {
+            return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f; // -0.5x^3 + 2.5x^2 - 4x + 2
+        }
+        else
+        {
+            return 0.0f;
+        }
+    }
+    // 2D float textures bound by warpBackward() to I1, I1x and I1y: point filtering, clamped addressing.
+    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1 (false, cudaFilterModePoint, cudaAddressModeClamp);
+    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1x(false, cudaFilterModePoint, cudaAddressModeClamp);
+    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1y(false, cudaFilterModePoint, cudaAddressModeClamp);
+    // Per pixel: samples tex_I1/tex_I1x/tex_I1y at the flow-displaced position with bicubic weights, then stores the warped values, grad and rho.
+    __global__ void warpBackwardKernel(const PtrStepSzf I0, const PtrStepf u1, const PtrStepf u2, PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x >= I0.cols || y >= I0.rows)
+            return;
+
+        const float u1Val = u1(y, x);
+        const float u2Val = u2(y, x);
+        // Sampling position: the pixel displaced by the current flow (u1, u2).
+        const float wx = x + u1Val;
+        const float wy = y + u2Val;
+        // Integer taps within distance 2 of the sampling position; bicubicCoeff is zero beyond that.
+        const int xmin = ::ceilf(wx - 2.0f);
+        const int xmax = ::floorf(wx + 2.0f);
+
+        const int ymin = ::ceilf(wy - 2.0f);
+        const int ymax = ::floorf(wy + 2.0f);
+
+        float sum  = 0.0f;
+        float sumx = 0.0f;
+        float sumy = 0.0f;
+        float wsum = 0.0f;
+
+        for (int cy = ymin; cy <= ymax; ++cy)
+        {
+            for (int cx = xmin; cx <= xmax; ++cx)
+            {
+                const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
+
+                sum  += w * tex2D(tex_I1 , cx, cy);
+                sumx += w * tex2D(tex_I1x, cx, cy);
+                sumy += w * tex2D(tex_I1y, cx, cy);
+
+                wsum += w;
+            }
+        }
+        // Normalise the weighted sums by the accumulated weight.
+        const float coeff = 1.0f / wsum;
+
+        const float I1wVal  = sum  * coeff;
+        const float I1wxVal = sumx * coeff;
+        const float I1wyVal = sumy * coeff;
+
+        I1w(y, x)  = I1wVal;
+        I1wx(y, x) = I1wxVal;
+        I1wy(y, x) = I1wyVal;
+
+        const float Ix2 = I1wxVal * I1wxVal;
+        const float Iy2 = I1wyVal * I1wyVal;
+
+        // store the |Grad(I1)|^2
+        grad(y, x) = Ix2 + Iy2;
+
+        // compute the constant part of the rho function
+        const float I0Val = I0(y, x);
+        rho(y, x) = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
+    }
+    // Host launcher: binds I1, I1x and I1y to their textures, runs warpBackwardKernel over I0, then waits for the device.
+    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
+        // The kernel reads these three images through tex2D, so they are bound before the launch.
+        bindTexture(&tex_I1 , I1);
+        bindTexture(&tex_I1x, I1x);
+        bindTexture(&tex_I1y, I1y);
+
+        warpBackwardKernel<<<grid, block>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
+        cudaSafeCall( cudaGetLastError() );
+        // Synchronous: the function takes no stream parameter.
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+////////////////////////////////////////////////////////////
+// estimateU
+
+namespace tvl1flow
+{
+    __device__ float divergence(const PtrStepf& v1, const PtrStepf& v2, int y, int x)
+    {   // Backward-difference divergence of (v1, v2) at (y, x); terms that would read index -1 are left out.
+        if (x > 0 && y > 0)
+        {
+            const float v1x = v1(y, x) - v1(y, x - 1);
+            const float v2y = v2(y, x) - v2(y - 1, x);
+            return v1x + v2y;
+        }
+        else
+        {
+            if (y > 0) // first column: no left neighbour for v1
+                return v1(y, 0) + v2(y, 0) - v2(y - 1, 0);
+            else
+            {
+                if (x > 0) // first row: no upper neighbour for v2
+                    return v1(0, x) - v1(0, x - 1) + v2(0, x);
+                else // top-left corner: neither neighbour exists
+                    return v1(0, 0) + v2(0, 0);
+            }
+        }
+    }
+    // Per pixel: thresholds rho to get (v1, v2), adds theta times the divergence of the dual variables, updates u1/u2 in place and stores the squared change in error.
+    __global__ void estimateUKernel(const PtrStepSzf I1wx, const PtrStepf I1wy,
+                              const PtrStepf grad, const PtrStepf rho_c,
+                              const PtrStepf p11, const PtrStepf p12, const PtrStepf p21, const PtrStepf p22,
+                              PtrStepf u1, PtrStepf u2, PtrStepf error,
+                              const float l_t, const float theta)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x >= I1wx.cols || y >= I1wx.rows)
+            return;
+
+        const float I1wxVal = I1wx(y, x);
+        const float I1wyVal = I1wy(y, x);
+        const float gradVal = grad(y, x);
+        const float u1OldVal = u1(y, x);
+        const float u2OldVal = u2(y, x);
+        // Residual for the current flow: the stored constant part plus the gradient terms in u.
+        const float rho = rho_c(y, x) + (I1wxVal * u1OldVal + I1wyVal * u2OldVal);
+
+        // estimate the values of the variable (v1, v2) (thresholding operator TH)
+
+        float d1 = 0.0f;
+        float d2 = 0.0f;
+        // Three cases by where rho lies relative to +/- l_t * gradVal; (d1, d2) stays zero when gradVal is not above epsilon.
+        if (rho < -l_t * gradVal)
+        {
+            d1 = l_t * I1wxVal;
+            d2 = l_t * I1wyVal;
+        }
+        else if (rho > l_t * gradVal)
+        {
+            d1 = -l_t * I1wxVal;
+            d2 = -l_t * I1wyVal;
+        }
+        else if (gradVal > numeric_limits<float>::epsilon())
+        {
+            const float fi = -rho / gradVal;
+            d1 = fi * I1wxVal;
+            d2 = fi * I1wyVal;
+        }
+
+        const float v1 = u1OldVal + d1;
+        const float v2 = u2OldVal + d2;
+
+        // compute the divergence of the dual variable (p1, p2)
+
+        const float div_p1 = divergence(p11, p12, y, x);
+        const float div_p2 = divergence(p21, p22, y, x);
+
+        // estimate the values of the optical flow (u1, u2)
+
+        const float u1NewVal = v1 + theta * div_p1;
+        const float u2NewVal = v2 + theta * div_p2;
+
+        u1(y, x) = u1NewVal;
+        u2(y, x) = u2NewVal;
+        // Squared magnitude of this pixel's update.
+        const float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal);
+        const float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal);
+        error(y, x) = n1 + n2;
+    }
+    // Host launcher: runs estimateUKernel over the extent of I1wx, then waits for the device.
+    void estimateU(PtrStepSzf I1wx, PtrStepSzf I1wy,
+                   PtrStepSzf grad, PtrStepSzf rho_c,
+                   PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22,
+                   PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf error,
+                   float l_t, float theta)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(I1wx.cols, block.x), divUp(I1wx.rows, block.y));
+
+        estimateUKernel<<<grid, block>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, error, l_t, theta);
+        cudaSafeCall( cudaGetLastError() );
+        // Synchronous: the function takes no stream parameter.
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+////////////////////////////////////////////////////////////
+// estimateDualVariables
+
+namespace tvl1flow
+{
+    __global__ void estimateDualVariablesKernel(const PtrStepSzf u1, const PtrStepf u2, PtrStepf p11, PtrStepf p12, PtrStepf p21, PtrStepf p22, const float taut)
+    {   // Updates p11, p12, p21 and p22 in place from the forward differences of u1 and u2.
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x >= u1.cols || y >= u1.rows)
+            return;
+        // Forward differences; the neighbour index is clamped, so they are zero on the last column / row.
+        const float u1x = u1(y, ::min(x + 1, u1.cols - 1)) - u1(y, x);
+        const float u1y = u1(::min(y + 1, u1.rows - 1), x) - u1(y, x);
+        // u2 is a PtrStepf and carries no size, so u1's dimensions are used - assumes both have the same size.
+        const float u2x = u2(y, ::min(x + 1, u1.cols - 1)) - u2(y, x);
+        const float u2y = u2(::min(y + 1, u1.rows - 1), x) - u2(y, x);
+        // Magnitudes of the two difference vectors.
+        const float g1 = ::hypotf(u1x, u1y);
+        const float g2 = ::hypotf(u2x, u2y);
+        // Denominators 1 + taut * magnitude, one per flow component.
+        const float ng1 = 1.0f + taut * g1;
+        const float ng2 = 1.0f + taut * g2;
+        // Each dual variable moves by taut times its difference and is then divided by its denominator.
+        p11(y, x) = (p11(y, x) + taut * u1x) / ng1;
+        p12(y, x) = (p12(y, x) + taut * u1y) / ng1;
+        p21(y, x) = (p21(y, x) + taut * u2x) / ng2;
+        p22(y, x) = (p22(y, x) + taut * u2y) / ng2;
+    }
+    // Host launcher: runs estimateDualVariablesKernel over the extent of u1, then waits for the device.
+    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, float taut)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(u1.cols, block.x), divUp(u1.rows, block.y));
+
+        estimateDualVariablesKernel<<<grid, block>>>(u1, u2, p11, p12, p21, p22, taut);
+        cudaSafeCall( cudaGetLastError() );
+        // Synchronous: the function takes no stream parameter.
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+#endif // !defined CUDA_DISABLER
diff --git a/modules/gpu/src/cudastream.cpp b/modules/gpu/src/cudastream.cpp
index 5e6e4c3ea0..f9fbe820eb 100644
--- a/modules/gpu/src/cudastream.cpp
+++ b/modules/gpu/src/cudastream.cpp
@@ -42,51 +42,37 @@
 
 #include "precomp.hpp"
 
+using namespace std;
 using namespace cv;
 using namespace cv::gpu;
 
-#if defined HAVE_CUDA
-
-struct Stream::Impl
-{
-    static cudaStream_t getStream(const Impl* impl) { return impl ? impl->stream : 0; }
-    cudaStream_t stream;
-    int ref_counter;
-};
-
-#include "opencv2/gpu/stream_accessor.hpp"
-
-CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream)
-{
-    return Stream::Impl::getStream(stream.impl);
-};
-
-#endif /* !defined (HAVE_CUDA) */
-
-
 #if !defined (HAVE_CUDA)
 
-void cv::gpu::Stream::create() { throw_nogpu(); }
-void cv::gpu::Stream::release() { throw_nogpu(); }
-cv::gpu::Stream::Stream() : impl(0) { throw_nogpu(); }
-cv::gpu::Stream::~Stream() { throw_nogpu(); }
-cv::gpu::Stream::Stream(const Stream& /*stream*/) { throw_nogpu(); }
-Stream& cv::gpu::Stream::operator=(const Stream& /*stream*/) { throw_nogpu(); return *this; }
-bool cv::gpu::Stream::queryIfComplete() { throw_nogpu(); return true; }
+cv::gpu::Stream::Stream() { throw_nogpu(); }
+cv::gpu::Stream::~Stream() {} // intentionally empty: the only stub in this section that does not call throw_nogpu()
+cv::gpu::Stream::Stream(const Stream&) { throw_nogpu(); }
+Stream& cv::gpu::Stream::operator=(const Stream&) { throw_nogpu(); return *this; }
+bool cv::gpu::Stream::queryIfComplete() { throw_nogpu(); return false; } // return value only satisfies the signature
 void cv::gpu::Stream::waitForCompletion() { throw_nogpu(); }
-void cv::gpu::Stream::enqueueDownload(const GpuMat& /*src*/, Mat& /*dst*/) { throw_nogpu(); }
-void cv::gpu::Stream::enqueueDownload(const GpuMat& /*src*/, CudaMem& /*dst*/) { throw_nogpu(); }
-void cv::gpu::Stream::enqueueUpload(const CudaMem& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
-void cv::gpu::Stream::enqueueUpload(const Mat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
-void cv::gpu::Stream::enqueueCopy(const GpuMat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
-void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/) { throw_nogpu(); }
-void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/, const GpuMat& /*mask*/) { throw_nogpu(); }
-void cv::gpu::Stream::enqueueConvert(const GpuMat& /*src*/, GpuMat& /*dst*/, int /*type*/, double /*a*/, double /*b*/) { throw_nogpu(); }
+void cv::gpu::Stream::enqueueDownload(const GpuMat&, Mat&) { throw_nogpu(); } // parameters are unnamed in these stubs because they are never used
+void cv::gpu::Stream::enqueueDownload(const GpuMat&, CudaMem&) { throw_nogpu(); }
+void cv::gpu::Stream::enqueueUpload(const CudaMem&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::Stream::enqueueUpload(const Mat&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::Stream::enqueueCopy(const GpuMat&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::Stream::enqueueMemSet(GpuMat&, Scalar) { throw_nogpu(); }
+void cv::gpu::Stream::enqueueMemSet(GpuMat&, Scalar, const GpuMat&) { throw_nogpu(); }
+void cv::gpu::Stream::enqueueConvert(const GpuMat&, GpuMat&, int, double, double) { throw_nogpu(); }
+void cv::gpu::Stream::enqueueHostCallback(StreamCallback, void*) { throw_nogpu(); }
 Stream& cv::gpu::Stream::Null() { throw_nogpu(); static Stream s; return s; }
 cv::gpu::Stream::operator bool() const { throw_nogpu(); return false; }
+cv::gpu::Stream::Stream(Impl*) { throw_nogpu(); } // remaining no-CUDA stubs: Impl* constructor, create(), release()
+void cv::gpu::Stream::create() { throw_nogpu(); }
+void cv::gpu::Stream::release() { throw_nogpu(); }
 
 #else /* !defined (HAVE_CUDA) */
 
+#include "opencv2/gpu/stream_accessor.hpp"
+
 namespace cv { namespace gpu
 {
     void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
@@ -95,14 +81,247 @@ namespace cv { namespace gpu
     void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
 }}
 
+struct Stream::Impl // shared state behind a Stream handle: the CUDA stream plus a reference count
+{
+    static cudaStream_t getStream(const Impl* impl) // null-safe accessor used by every Stream method below
+    {
+        return impl ? impl->stream : 0; // 0 when there is no Impl (as for Stream::Null())
+    }
+
+    cudaStream_t stream; // handle obtained from cudaStreamCreate in Stream::create()
+    int ref_counter; // number of Stream objects sharing this Impl; updated with CV_XADD
+};
+
+cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream) // exposes the raw cudaStream_t wrapped by a Stream
+{
+    return Stream::Impl::getStream(stream.impl); // yields 0 when the Stream has no Impl
+}
+
+cv::gpu::Stream::Stream() : impl(0) // default constructor: starts with no Impl, then allocates one
+{
+    create(); // creates the CUDA stream and an Impl with ref_counter == 1
+}
+
+cv::gpu::Stream::~Stream() // drops this object's reference to the shared Impl
+{
+    release(); // destroys the CUDA stream only when the last reference goes away
+}
+
+cv::gpu::Stream::Stream(const Stream& stream) : impl(stream.impl) // copy shares the same Impl rather than creating a new stream
+{
+    if (impl) // a Stream built from a null Impl (e.g. Stream::Null()) has nothing to count
+        CV_XADD(&impl->ref_counter, 1);
+}
+
+Stream& cv::gpu::Stream::operator =(const Stream& stream) // rebinds this handle to the Impl shared by 'stream'
+{
+    if (this != &stream) // self-assignment leaves the object untouched
+    {
+        release(); // give up the reference to the Impl held so far
+        impl = stream.impl;
+        if (impl) // nothing to count when the source has no Impl
+            CV_XADD(&impl->ref_counter, 1);
+    }
+
+    return *this;
+}
+
+bool cv::gpu::Stream::queryIfComplete() // non-blocking check based on cudaStreamQuery
+{
+    cudaStream_t stream = Impl::getStream(impl);
+    cudaError_t err = cudaStreamQuery(stream);
+
+    if (err == cudaErrorNotReady || err == cudaSuccess) // the two expected outcomes map to false/true
+        return err == cudaSuccess;
+
+    cudaSafeCall(err); // any other status is handed to the common CUDA error check
+    return false;
+}
+
+void cv::gpu::Stream::waitForCompletion() // synchronizes with this stream via cudaStreamSynchronize
+{
+    cudaStream_t stream = Impl::getStream(impl);
+    cudaSafeCall( cudaStreamSynchronize(stream) );
+}
+
+void cv::gpu::Stream::enqueueDownload(const GpuMat& src, Mat& dst) // device-to-host copy into a caller-allocated Mat
+{
+    // if not -> allocation will be done, but after that dst will not point to page locked memory
+    CV_Assert( src.size() == dst.size() && src.type() == dst.type() ); // dst is never (re)allocated here
+
+    cudaStream_t stream = Impl::getStream(impl);
+    size_t bwidth = src.cols * src.elemSize(); // bytes per row actually copied
+    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyDeviceToHost, stream) );
+}
+
+void cv::gpu::Stream::enqueueDownload(const GpuMat& src, CudaMem& dst) // device-to-host copy into a CudaMem buffer
+{
+    dst.create(src.size(), src.type(), CudaMem::ALLOC_PAGE_LOCKED); // dst is sized to src and requested as page-locked
+
+    cudaStream_t stream = Impl::getStream(impl);
+    size_t bwidth = src.cols * src.elemSize(); // bytes per row actually copied
+    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyDeviceToHost, stream) );
+}
+
+void cv::gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst) // host-to-device copy from a CudaMem buffer
+{
+    dst.create(src.size(), src.type()); // dst takes src's size and type
+
+    cudaStream_t stream = Impl::getStream(impl);
+    size_t bwidth = src.cols * src.elemSize(); // bytes per row actually copied
+    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyHostToDevice, stream) );
+}
+
+void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst) // host-to-device copy from a Mat
+{
+    dst.create(src.size(), src.type()); // dst takes src's size and type
+
+    cudaStream_t stream = Impl::getStream(impl);
+    size_t bwidth = src.cols * src.elemSize(); // bytes per row actually copied
+    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyHostToDevice, stream) );
+}
+
+void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) // device-to-device copy between two GpuMat objects
+{
+    dst.create(src.size(), src.type()); // dst takes src's size and type
+
+    cudaStream_t stream = Impl::getStream(impl);
+    size_t bwidth = src.cols * src.elemSize(); // bytes per row actually copied
+    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyDeviceToDevice, stream) );
+}
+
+void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val) // fills every element of src with val on this stream
+{
+    const int sdepth = src.depth();
+
+    if (sdepth == CV_64F) // double data is rejected when the device lacks NATIVE_DOUBLE
+    {
+        if (!deviceSupports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
+    cudaStream_t stream = Impl::getStream(impl);
+
+    if (val[0] == 0.0 && val[1] == 0.0 && val[2] == 0.0 && val[3] == 0.0) // all-zero value: a plain byte-wise memset is used for any depth
+    {
+        cudaSafeCall( cudaMemset2DAsync(src.data, src.step, 0, src.cols * src.elemSize(), src.rows, stream) );
+        return;
+    }
+
+    if (sdepth == CV_8U) // 8-bit data with the same value in every used channel can also be byte-filled
+    {
+        int cn = src.channels();
+
+        if (cn == 1 || (cn == 2 && val[0] == val[1]) || (cn == 3 && val[0] == val[1] && val[0] == val[2]) || (cn == 4 && val[0] == val[1] && val[0] == val[2] && val[0] == val[3]))
+        {
+            int ival = saturate_cast<uchar>(val[0]); // the byte written to every position
+            cudaSafeCall( cudaMemset2DAsync(src.data, src.step, ival, src.cols * src.elemSize(), src.rows, stream) );
+            return;
+        }
+    }
+
+    setTo(src, val, stream); // general case: delegate to the setTo helper declared above
+}
+
+void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask) // masked variant: delegates to the masked setTo helper
+{
+    const int sdepth = src.depth();
+
+    if (sdepth == CV_64F) // double data is rejected when the device lacks NATIVE_DOUBLE
+    {
+        if (!deviceSupports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
+    CV_Assert(mask.type() == CV_8UC1); // only single-channel 8-bit masks are accepted
+
+    cudaStream_t stream = Impl::getStream(impl);
+
+    setTo(src, val, mask, stream);
+}
+
+void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, double alpha, double beta) // depth conversion with scale (alpha) and shift (beta)
+{
+    if (dtype < 0) // negative dtype means "keep the source type"
+        dtype = src.type();
+    else
+        dtype = CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()); // only the depth of dtype is used; channels always come from src
+
+    const int sdepth = src.depth();
+    const int ddepth = CV_MAT_DEPTH(dtype);
+
+    if (sdepth == CV_64F || ddepth == CV_64F) // double on either side requires NATIVE_DOUBLE support
+    {
+        if (!deviceSupports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
+    bool noScale = fabs(alpha - 1) < numeric_limits<double>::epsilon() && fabs(beta) < numeric_limits<double>::epsilon(); // alpha ~ 1 and beta ~ 0
+
+    if (sdepth == ddepth && noScale) // same depth and no scaling: this is just a copy
+    {
+        enqueueCopy(src, dst);
+        return;
+    }
+
+    dst.create(src.size(), dtype);
+
+    cudaStream_t stream = Impl::getStream(impl);
+    convertTo(src, dst, alpha, beta, stream);
+}
+
+#if CUDA_VERSION >= 5000
+
 namespace
 {
-    template<class S, class D> void devcopy(const S& src, D& dst, cudaStream_t s, cudaMemcpyKind k)
+    struct CallbackData
     {
-        dst.create(src.size(), src.type());
-        size_t bwidth = src.cols * src.elemSize();
-        cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, k, s) );
+        cv::gpu::Stream::StreamCallback callback;
+        void* userData;
+        Stream stream;
     };
+
+    void CUDART_CB cudaStreamCallback(cudaStream_t, cudaError_t status, void* userData) // trampoline registered by Stream::enqueueHostCallback
+    {
+        CallbackData* data = reinterpret_cast<CallbackData*>(userData); // payload allocated with new in enqueueHostCallback
+        data->callback(data->stream, static_cast<int>(status), data->userData); // forward to the user callback, passing the status as int
+        delete data; // NOTE(review): this also destroys the Stream copy held in the payload - confirm that is safe from inside a CUDA callback
+    }
+}
+
+#endif
+
+void cv::gpu::Stream::enqueueHostCallback(StreamCallback callback, void* userData) // queues a host-side callback on this stream (needs CUDA >= 5.0)
+{
+#if CUDA_VERSION >= 5000
+    CallbackData* data = new CallbackData; // heap payload; normally freed by cudaStreamCallback after the user callback ran
+    data->callback = callback;
+    data->userData = userData;
+    data->stream = *this; // copy of this Stream that is handed back to the user callback
+    cudaStream_t stream = Impl::getStream(impl);
+    const cudaError_t err = cudaStreamAddCallback(stream, cudaStreamCallback, data, 0);
+    if (err != cudaSuccess) delete data; // registration failed: the trampoline will never run, so free the payload here instead of leaking it
+    cudaSafeCall(err); // report the failure (if any) exactly as before
+#else
+    (void) callback;
+    (void) userData;
+    CV_Error(CV_StsNotImplemented, "This function requires CUDA 5.0");
+#endif
+}
+
+cv::gpu::Stream& cv::gpu::Stream::Null() // returns a shared Stream object that has no Impl
+{
+    static Stream s((Impl*) 0); // built from a null Impl, so create() is not invoked for it
+    return s;
+}
+
+cv::gpu::Stream::operator bool() const // true only when an Impl exists and holds a non-null stream handle
+{
+    return impl && impl->stream;
+}
+
+cv::gpu::Stream::Stream(Impl* impl_) : impl(impl_) // stores the given Impl pointer as-is; the reference count is not touched
+{
 }
 
 void cv::gpu::Stream::create()
@@ -113,7 +332,7 @@ void cv::gpu::Stream::create()
     cudaStream_t stream;
     cudaSafeCall( cudaStreamCreate( &stream ) );
 
-    impl = (Stream::Impl*)fastMalloc(sizeof(Stream::Impl));
+    impl = (Stream::Impl*) fastMalloc(sizeof(Stream::Impl));
 
     impl->stream = stream;
     impl->ref_counter = 1;
@@ -121,133 +340,11 @@ void cv::gpu::Stream::create()
 
 void cv::gpu::Stream::release()
 {
-    if( impl && CV_XADD(&impl->ref_counter, -1) == 1 )
+    if (impl && CV_XADD(&impl->ref_counter, -1) == 1)
     {
-        cudaSafeCall( cudaStreamDestroy( impl->stream ) );
-        cv::fastFree( impl );
+        cudaSafeCall( cudaStreamDestroy(impl->stream) );
+        cv::fastFree(impl);
     }
 }
 
-cv::gpu::Stream::Stream() : impl(0) { create(); }
-cv::gpu::Stream::~Stream() { release(); }
-
-cv::gpu::Stream::Stream(const Stream& stream) : impl(stream.impl)
-{
-    if( impl )
-        CV_XADD(&impl->ref_counter, 1);
-}
-Stream& cv::gpu::Stream::operator=(const Stream& stream)
-{
-    if( this != &stream )
-    {
-        if( stream.impl )
-            CV_XADD(&stream.impl->ref_counter, 1);
-
-        release();
-        impl = stream.impl;
-    }
-    return *this;
-}
-
-bool cv::gpu::Stream::queryIfComplete()
-{
-    cudaError_t err = cudaStreamQuery( Impl::getStream(impl) );
-
-    if (err == cudaErrorNotReady || err == cudaSuccess)
-        return err == cudaSuccess;
-
-    cudaSafeCall(err);
-    return false;
-}
-
-void cv::gpu::Stream::waitForCompletion() { cudaSafeCall( cudaStreamSynchronize( Impl::getStream(impl) ) ); }
-
-void cv::gpu::Stream::enqueueDownload(const GpuMat& src, Mat& dst)
-{
-    // if not -> allocation will be done, but after that dst will not point to page locked memory
-    CV_Assert(src.cols == dst.cols && src.rows == dst.rows && src.type() == dst.type() );
-    devcopy(src, dst, Impl::getStream(impl), cudaMemcpyDeviceToHost);
-}
-void cv::gpu::Stream::enqueueDownload(const GpuMat& src, CudaMem& dst) { devcopy(src, dst, Impl::getStream(impl), cudaMemcpyDeviceToHost); }
-
-void cv::gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst){ devcopy(src, dst, Impl::getStream(impl),   cudaMemcpyHostToDevice); }
-void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst)  { devcopy(src, dst, Impl::getStream(impl),   cudaMemcpyHostToDevice); }
-void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src, dst, Impl::getStream(impl), cudaMemcpyDeviceToDevice); }
-
-void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar s)
-{
-    CV_Assert((src.depth() != CV_64F) ||
-        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
-
-    if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
-    {
-        cudaSafeCall( cudaMemset2DAsync(src.data, src.step, 0, src.cols * src.elemSize(), src.rows, Impl::getStream(impl)) );
-        return;
-    }
-    if (src.depth() == CV_8U)
-    {
-        int cn = src.channels();
-
-        if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
-        {
-            int val = saturate_cast<uchar>(s[0]);
-            cudaSafeCall( cudaMemset2DAsync(src.data, src.step, val, src.cols * src.elemSize(), src.rows, Impl::getStream(impl)) );
-            return;
-        }
-    }
-
-    setTo(src, s, Impl::getStream(impl));
-}
-
-void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
-{
-    CV_Assert((src.depth() != CV_64F) ||
-        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
-
-    CV_Assert(mask.type() == CV_8UC1);
-
-    setTo(src, val, mask, Impl::getStream(impl));
-}
-
-void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype, double alpha, double beta)
-{
-    CV_Assert((src.depth() != CV_64F && CV_MAT_DEPTH(rtype) != CV_64F) ||
-        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
-
-    bool noScale = fabs(alpha-1) < std::numeric_limits<double>::epsilon() && fabs(beta) < std::numeric_limits<double>::epsilon();
-
-    if( rtype < 0 )
-        rtype = src.type();
-    else
-        rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), src.channels());
-
-    int sdepth = src.depth(), ddepth = CV_MAT_DEPTH(rtype);
-    if( sdepth == ddepth && noScale )
-    {
-        src.copyTo(dst);
-        return;
-    }
-
-    GpuMat temp;
-    const GpuMat* psrc = &src;
-    if( sdepth != ddepth && psrc == &dst )
-        psrc = &(temp = src);
-
-    dst.create( src.size(), rtype );
-    convertTo(src, dst, alpha, beta, Impl::getStream(impl));
-}
-
-cv::gpu::Stream::operator bool() const
-{
-    return impl && impl->stream;
-}
-
-cv::gpu::Stream::Stream(Impl* impl_) : impl(impl_) {}
-
-cv::gpu::Stream& cv::gpu::Stream::Null()
-{
-    static Stream s((Impl*)0);
-    return s;
-}
-
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp
index b34b5ca426..3d6cde377c 100644
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -64,6 +64,7 @@ void cv::gpu::sqrt(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::exp(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::log(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::compare(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
+void cv::gpu::compare(const GpuMat&, Scalar, GpuMat&, int, Stream&) { throw_nogpu(); }
 void cv::gpu::bitwise_not(const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::bitwise_or(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::bitwise_or(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); }
@@ -2001,6 +2002,69 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
     func(src1_, src2_, dst_, stream);
 }
 
+namespace arithm // forward declarations of the scalar-comparison routines; their definitions are not in this file section
+{
+    template <typename T> void cmpScalarEq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); // one routine per comparison operator, templated on the element type T
+    template <typename T> void cmpScalarNe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void cmpScalarLt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void cmpScalarLe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void cmpScalarGt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void cmpScalarGe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+}
+
+namespace
+{
+    template <typename T> void castScalar(Scalar& sc) // saturates each of the four components to T's range, in place
+    {
+        sc.val[0] = saturate_cast<T>(sc.val[0]); // the saturated value is written back into the same Scalar slot
+        sc.val[1] = saturate_cast<T>(sc.val[1]);
+        sc.val[2] = saturate_cast<T>(sc.val[2]);
+        sc.val[3] = saturate_cast<T>(sc.val[3]);
+    }
+}
+
+void cv::gpu::compare(const GpuMat& src, Scalar sc, GpuMat& dst, int cmpop, Stream& stream) // compares src against a scalar; dst is 8-bit with src's channel count
+{
+    using namespace arithm;
+
+    typedef void (*func_t)(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    static const func_t funcs[7][6] = // rows are indexed by src depth, columns by cmpop (Eq, Gt, Ge, Lt, Le, Ne)
+    {
+        {cmpScalarEq<unsigned char> , cmpScalarGt<unsigned char> , cmpScalarGe<unsigned char> , cmpScalarLt<unsigned char> , cmpScalarLe<unsigned char> , cmpScalarNe<unsigned char> },
+        {cmpScalarEq<signed char>   , cmpScalarGt<signed char>   , cmpScalarGe<signed char>   , cmpScalarLt<signed char>   , cmpScalarLe<signed char>   , cmpScalarNe<signed char>   },
+        {cmpScalarEq<unsigned short>, cmpScalarGt<unsigned short>, cmpScalarGe<unsigned short>, cmpScalarLt<unsigned short>, cmpScalarLe<unsigned short>, cmpScalarNe<unsigned short>},
+        {cmpScalarEq<short>         , cmpScalarGt<short>         , cmpScalarGe<short>         , cmpScalarLt<short>         , cmpScalarLe<short>         , cmpScalarNe<short>         },
+        {cmpScalarEq<int>           , cmpScalarGt<int>           , cmpScalarGe<int>           , cmpScalarLt<int>           , cmpScalarLe<int>           , cmpScalarNe<int>           },
+        {cmpScalarEq<float>         , cmpScalarGt<float>         , cmpScalarGe<float>         , cmpScalarLt<float>         , cmpScalarLe<float>         , cmpScalarNe<float>         },
+        {cmpScalarEq<double>        , cmpScalarGt<double>        , cmpScalarGe<double>        , cmpScalarLt<double>        , cmpScalarLe<double>        , cmpScalarNe<double>        }
+    };
+
+    typedef void (*cast_func_t)(Scalar& sc);
+    static const cast_func_t cast_func[] = // indexed by src depth, same type order as the rows of funcs
+    {
+        castScalar<unsigned char>, castScalar<signed char>, castScalar<unsigned short>, castScalar<short>, castScalar<int>, castScalar<float>, castScalar<double>
+    };
+
+    const int depth = src.depth();
+    const int cn = src.channels();
+
+    CV_Assert( depth <= CV_64F ); // these three checks keep the table lookups below in range
+    CV_Assert( cn <= 4 );
+    CV_Assert( cmpop >= CMP_EQ && cmpop <= CMP_NE );
+
+    if (depth == CV_64F) // double input requires NATIVE_DOUBLE support on the device
+    {
+        if (!deviceSupports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
+    dst.create(src.size(), CV_MAKE_TYPE(CV_8U, cn));
+
+    cast_func[depth](sc); // saturate the scalar to the source element type before comparing
+
+    funcs[depth][cmpop](src, cn, sc.val, dst, StreamAccessor::getStream(stream));
+}
+
 //////////////////////////////////////////////////////////////////////////////
 // Unary bitwise logical operations
 
diff --git a/modules/gpu/src/hough.cpp b/modules/gpu/src/hough.cpp
index 888c325c97..fecb717cd9 100644
--- a/modules/gpu/src/hough.cpp
+++ b/modules/gpu/src/hough.cpp
@@ -52,6 +52,8 @@ void cv::gpu::HoughLines(const GpuMat&, GpuMat&, float, float, int, bool, int) {
 void cv::gpu::HoughLines(const GpuMat&, GpuMat&, HoughLinesBuf&, float, float, int, bool, int) { throw_nogpu(); }
 void cv::gpu::HoughLinesDownload(const GpuMat&, OutputArray, OutputArray) { throw_nogpu(); }
 
+void cv::gpu::HoughLinesP(const GpuMat&, GpuMat&, HoughLinesBuf&, float, float, int, int, int) { throw_nogpu(); }
+
 void cv::gpu::HoughCircles(const GpuMat&, GpuMat&, int, float, float, int, int, int, int, int) { throw_nogpu(); }
 void cv::gpu::HoughCircles(const GpuMat&, GpuMat&, HoughCirclesBuf&, int, float, float, int, int, int, int, int) { throw_nogpu(); }
 void cv::gpu::HoughCirclesDownload(const GpuMat&, OutputArray) { throw_nogpu(); }
@@ -157,6 +159,57 @@ void cv::gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines_, Ou
     }
 }
 
+//////////////////////////////////////////////////////////
+// HoughLinesP
+
+namespace cv { namespace gpu { namespace device
+{
+    namespace hough
+    {
+        int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength); // declaration only; the return value is used by HoughLinesP as the number of lines found
+    }
+}}}
+
+void cv::gpu::HoughLinesP(const GpuMat& src, GpuMat& lines, HoughLinesBuf& buf, float rho, float theta, int minLineLength, int maxLineGap, int maxLines) // probabilistic Hough lines; result is a 1-row CV_32SC4 matrix or an empty one
+{
+    using namespace cv::gpu::device::hough;
+
+    CV_Assert( src.type() == CV_8UC1 );
+    CV_Assert( src.cols < std::numeric_limits<unsigned short>::max() ); // both dimensions must stay below the unsigned short maximum
+    CV_Assert( src.rows < std::numeric_limits<unsigned short>::max() );
+
+    ensureSizeIsEnough(1, src.size().area(), CV_32SC1, buf.list); // room for one entry per source pixel
+    unsigned int* srcPoints = buf.list.ptr<unsigned int>();
+
+    const int pointsCount = buildPointList_gpu(src, srcPoints);
+    if (pointsCount == 0) // no points collected: return an empty result
+    {
+        lines.release();
+        return;
+    }
+
+    const int numangle = cvRound(CV_PI / theta);
+    const int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho);
+    CV_Assert( numangle > 0 && numrho > 0 );
+
+    ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, buf.accum); // accumulator is allocated with 2 extra rows and columns
+    buf.accum.setTo(Scalar::all(0));
+
+    DeviceInfo devInfo;
+    cudaDeviceProp prop;
+    cudaSafeCall(cudaGetDeviceProperties(&prop, devInfo.deviceID()));
+    linesAccum_gpu(srcPoints, pointsCount, buf.accum, rho, theta, prop.sharedMemPerBlock, devInfo.supports(FEATURE_SET_COMPUTE_20));
+
+    ensureSizeIsEnough(1, maxLines, CV_32SC4, lines); // output capacity: at most maxLines entries of four ints each
+
+    int linesCount = houghLinesProbabilistic_gpu(src, buf.accum, lines.ptr<int4>(), maxLines, rho, theta, maxLineGap, minLineLength); // note the argument order: gap first, then length
+
+    if (linesCount > 0)
+        lines.cols = linesCount; // shrink the visible width to the number of lines actually found
+    else
+        lines.release();
+}
+
 //////////////////////////////////////////////////////////
 // HoughCircles
 
diff --git a/modules/gpu/src/matrix_reductions.cpp b/modules/gpu/src/matrix_reductions.cpp
index 4295644c7a..67e65fc585 100644
--- a/modules/gpu/src/matrix_reductions.cpp
+++ b/modules/gpu/src/matrix_reductions.cpp
@@ -51,13 +51,17 @@ void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_nogpu(); }
 void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&, GpuMat&) { throw_nogpu(); }
 double cv::gpu::norm(const GpuMat&, int) { throw_nogpu(); return 0.0; }
 double cv::gpu::norm(const GpuMat&, int, GpuMat&) { throw_nogpu(); return 0.0; }
+double cv::gpu::norm(const GpuMat&, int, const GpuMat&, GpuMat&) { throw_nogpu(); return 0.0; }
 double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return 0.0; }
 Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); }
 Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
+Scalar cv::gpu::sum(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
 Scalar cv::gpu::absSum(const GpuMat&) { throw_nogpu(); return Scalar(); }
 Scalar cv::gpu::absSum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
+Scalar cv::gpu::absSum(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
 Scalar cv::gpu::sqrSum(const GpuMat&) { throw_nogpu(); return Scalar(); }
 Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
+Scalar cv::gpu::sqrSum(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
 void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_nogpu(); }
 void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_nogpu(); }
@@ -150,24 +154,30 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev, GpuMat
 double cv::gpu::norm(const GpuMat& src, int normType)
 {
     GpuMat buf;
-    return norm(src, normType, buf);
+    return norm(src, normType, GpuMat(), buf);
 }
 
 double cv::gpu::norm(const GpuMat& src, int normType, GpuMat& buf)
+{
+    return norm(src, normType, GpuMat(), buf);
+}
+
+double cv::gpu::norm(const GpuMat& src, int normType, const GpuMat& mask, GpuMat& buf)
 {
     CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
+    CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size() && src.channels() == 1));
 
     GpuMat src_single_channel = src.reshape(1);
 
     if (normType == NORM_L1)
-        return absSum(src_single_channel, buf)[0];
+        return absSum(src_single_channel, mask, buf)[0];
 
     if (normType == NORM_L2)
-        return std::sqrt(sqrSum(src_single_channel, buf)[0]);
+        return std::sqrt(sqrSum(src_single_channel, mask, buf)[0]);
 
     // NORM_INF
     double min_val, max_val;
-    minMax(src_single_channel, &min_val, &max_val, GpuMat(), buf);
+    minMax(src_single_channel, &min_val, &max_val, mask, buf);
     return std::max(std::abs(min_val), std::abs(max_val));
 }
 
@@ -209,24 +219,29 @@ namespace sum
     void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows);
 
     template <typename T, int cn>
-    void run(PtrStepSzb src, void* buf, double* sum);
+    void run(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
 
     template <typename T, int cn>
-    void runAbs(PtrStepSzb src, void* buf, double* sum);
+    void runAbs(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
 
     template <typename T, int cn>
-    void runSqr(PtrStepSzb src, void* buf, double* sum);
+    void runSqr(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
 }
 
 Scalar cv::gpu::sum(const GpuMat& src)
 {
     GpuMat buf;
-    return sum(src, buf);
+    return sum(src, GpuMat(), buf);
 }
 
 Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
 {
-    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum);
+    return sum(src, GpuMat(), buf);
+}
+
+Scalar cv::gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
+{
+    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
     static const func_t funcs[7][5] =
     {
         {0, ::sum::run<uchar , 1>, ::sum::run<uchar , 2>, ::sum::run<uchar , 3>, ::sum::run<uchar , 4>},
@@ -238,6 +253,8 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
         {0, ::sum::run<double, 1>, ::sum::run<double, 2>, ::sum::run<double, 3>, ::sum::run<double, 4>}
     };
 
+    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
+
     if (src.depth() == CV_64F)
     {
         if (!deviceSupports(NATIVE_DOUBLE))
@@ -252,7 +269,7 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
     const func_t func = funcs[src.depth()][src.channels()];
 
     double result[4];
-    func(src, buf.data, result);
+    func(src, buf.data, result, mask);
 
     return Scalar(result[0], result[1], result[2], result[3]);
 }
@@ -260,12 +277,17 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
 Scalar cv::gpu::absSum(const GpuMat& src)
 {
     GpuMat buf;
-    return absSum(src, buf);
+    return absSum(src, GpuMat(), buf);
 }
 
 Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
 {
-    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum);
+    return absSum(src, GpuMat(), buf);
+}
+
+Scalar cv::gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
+{
+    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
     static const func_t funcs[7][5] =
     {
         {0, ::sum::runAbs<uchar , 1>, ::sum::runAbs<uchar , 2>, ::sum::runAbs<uchar , 3>, ::sum::runAbs<uchar , 4>},
@@ -277,6 +299,8 @@ Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
         {0, ::sum::runAbs<double, 1>, ::sum::runAbs<double, 2>, ::sum::runAbs<double, 3>, ::sum::runAbs<double, 4>}
     };
 
+    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
+
     if (src.depth() == CV_64F)
     {
         if (!deviceSupports(NATIVE_DOUBLE))
@@ -291,7 +315,7 @@ Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
     const func_t func = funcs[src.depth()][src.channels()];
 
     double result[4];
-    func(src, buf.data, result);
+    func(src, buf.data, result, mask);
 
     return Scalar(result[0], result[1], result[2], result[3]);
 }
@@ -299,12 +323,17 @@ Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
 Scalar cv::gpu::sqrSum(const GpuMat& src)
 {
     GpuMat buf;
-    return sqrSum(src, buf);
+    return sqrSum(src, GpuMat(), buf);
 }
 
 Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
 {
-    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum);
+    return sqrSum(src, GpuMat(), buf);
+}
+
+Scalar cv::gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
+{
+    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
     static const func_t funcs[7][5] =
     {
         {0, ::sum::runSqr<uchar , 1>, ::sum::runSqr<uchar , 2>, ::sum::runSqr<uchar , 3>, ::sum::runSqr<uchar , 4>},
@@ -316,6 +345,8 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
         {0, ::sum::runSqr<double, 1>, ::sum::runSqr<double, 2>, ::sum::runSqr<double, 3>, ::sum::runSqr<double, 4>}
     };
 
+    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
+
     if (src.depth() == CV_64F)
     {
         if (!deviceSupports(NATIVE_DOUBLE))
@@ -330,7 +361,7 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
     const func_t func = funcs[src.depth()][src.channels()];
 
     double result[4];
-    func(src, buf.data, result);
+    func(src, buf.data, result, mask);
 
     return Scalar(result[0], result[1], result[2], result[3]);
 }
diff --git a/modules/gpu/src/optflowbm.cpp b/modules/gpu/src/optflowbm.cpp
new file mode 100644
index 0000000000..a4321c89cc
--- /dev/null
+++ b/modules/gpu/src/optflowbm.cpp
@@ -0,0 +1,243 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+
+void cv::gpu::calcOpticalFlowBM(const GpuMat&, const GpuMat&, Size, Size, Size, bool, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } // stub for builds where HAVE_CUDA is undefined or CUDA_DISABLER is defined; it only calls throw_nogpu()
+
+void cv::gpu::FastOpticalFlowBM::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); } // stub for builds where HAVE_CUDA is undefined or CUDA_DISABLER is defined; it only calls throw_nogpu()
+
+#else // HAVE_CUDA
+
+namespace optflowbm // declaration only: calc() has no definition in this file (presumably provided by the CUDA sources — confirm)
+{
+    void calc(PtrStepSzb prev, PtrStepSzb curr, PtrStepSzf velx, PtrStepSzf vely, int2 blockSize, int2 shiftSize, bool usePrevious,
+              int maxX, int maxY, int acceptLevel, int escapeLevel, const short2* ss, int ssCount, cudaStream_t stream); // ss / ssCount: table of search offsets and its length
+}
+
+void cv::gpu::calcOpticalFlowBM(const GpuMat& prev, const GpuMat& curr, Size blockSize, Size shiftSize, Size maxRange, bool usePrevious, GpuMat& velx, GpuMat& vely, GpuMat& buf, Stream& st)
+{   // Builds a host-side table of (x, y) search offsets, uploads it into buf, then calls optflowbm::calc to fill velx / vely.
+    CV_Assert( prev.type() == CV_8UC1 ); // only single-channel 8-bit frames are accepted
+    CV_Assert( curr.size() == prev.size() && curr.type() == prev.type() );
+
+    const Size velSize((prev.cols - blockSize.width + shiftSize.width) / shiftSize.width, // NOTE(review): shiftSize is used as a divisor and is not validated above — confirm callers never pass 0
+                       (prev.rows - blockSize.height + shiftSize.height) / shiftSize.height);
+
+    velx.create(velSize, CV_32FC1); // outputs are (re)allocated here as single-channel float of size velSize
+    vely.create(velSize, CV_32FC1);
+
+    // scanning scheme coordinates
+    vector<short2> ss((2 * maxRange.width + 1) * (2 * maxRange.height + 1)); // room for every offset in the search rectangle; the loops below write all of them except the (0, 0) centre
+    int ssCount = 0; // number of offsets written into ss so far
+
+    // Calculate scanning scheme
+    const int minCount = std::min(maxRange.width, maxRange.height); // number of complete square rings that fit in the search rectangle
+
+    // use spiral search pattern
+    //
+    //     9 10 11 12
+    //     8  1  2 13
+    //     7  *  3 14
+    //     6  5  4 15
+    //... 20 19 18 17
+    //
+
+    for (int i = 0; i < minCount; ++i) // ring i holds the offsets whose larger |coordinate| equals i + 1
+    {
+        // four cycles along sides
+        int x = -i - 1, y = x; // start at corner (-i-1, -i-1); each side below emits 2 * i + 2 offsets
+
+        // upper side
+        for (int j = -i; j <= i + 1; ++j, ++ssCount)
+        {
+            ss[ssCount].x = ++x;
+            ss[ssCount].y = y;
+        }
+
+        // right side
+        for (int j = -i; j <= i + 1; ++j, ++ssCount)
+        {
+            ss[ssCount].x = x;
+            ss[ssCount].y = ++y;
+        }
+
+        // bottom side
+        for (int j = -i; j <= i + 1; ++j, ++ssCount)
+        {
+            ss[ssCount].x = --x;
+            ss[ssCount].y = y;
+        }
+
+        // left side
+        for (int j = -i; j <= i + 1; ++j, ++ssCount)
+        {
+            ss[ssCount].x = x;
+            ss[ssCount].y = --y;
+        }
+    }
+
+    // the rest part
+    if (maxRange.width < maxRange.height) // range is taller than wide: append the rows outside the square part, in pairs (-y then +y)
+    {
+        const int xleft = -minCount; // minCount == maxRange.width in this branch
+
+        // cycle by neighbor rings
+        for (int i = minCount; i < maxRange.height; ++i)
+        {
+            // two cycles by x
+            int y = -(i + 1);
+            int x = xleft;
+
+            // upper side
+            for (int j = -maxRange.width; j <= maxRange.width; ++j, ++ssCount, ++x)
+            {
+                ss[ssCount].x = x;
+                ss[ssCount].y = y;
+            }
+
+            x = xleft;
+            y = -y; // mirror to the row at +(i + 1)
+
+            // bottom side
+            for (int j = -maxRange.width; j <= maxRange.width; ++j, ++ssCount, ++x)
+            {
+                ss[ssCount].x = x;
+                ss[ssCount].y = y;
+            }
+        }
+    }
+    else if (maxRange.width > maxRange.height) // range is wider than tall: append the columns outside the square part, in pairs (-x then +x)
+    {
+        const int yupper = -minCount; // minCount == maxRange.height in this branch
+
+        // cycle by neighbor rings
+        for (int i = minCount; i < maxRange.width; ++i)
+        {
+            // two cycles by y
+            int x = -(i + 1);
+            int y = yupper;
+
+            // left side
+            for (int j = -maxRange.height; j <= maxRange.height; ++j, ++ssCount, ++y)
+            {
+                ss[ssCount].x = x;
+                ss[ssCount].y = y;
+            }
+
+            y = yupper;
+            x = -x; // mirror to the column at +(i + 1)
+
+            // right side
+            for (int j = -maxRange.height; j <= maxRange.height; ++j, ++ssCount, ++y)
+            {
+                ss[ssCount].x = x;
+                ss[ssCount].y = y;
+            }
+        }
+    }
+
+    const cudaStream_t stream = StreamAccessor::getStream(st);
+
+    ensureSizeIsEnough(1, ssCount, CV_16SC2, buf); // buf is requested as 1 x ssCount elements of 2-channel 16-bit type to receive the offsets
+    if (stream == 0) // zero stream handle: blocking copy; otherwise the copy is issued on the caller's stream
+        cudaSafeCall( cudaMemcpy(buf.data, &ss[0], ssCount * sizeof(short2), cudaMemcpyHostToDevice) );
+    else
+        cudaSafeCall( cudaMemcpyAsync(buf.data, &ss[0], ssCount * sizeof(short2), cudaMemcpyHostToDevice, stream) ); // NOTE(review): ss is a local vector destroyed on return — confirm the async copy has finished reading pageable host memory by then
+
+    const int maxX = prev.cols - blockSize.width; // largest x at which a blockSize.width-wide block still fits inside prev
+    const int maxY = prev.rows - blockSize.height; // same for rows
+
+    const int SMALL_DIFF = 2;
+    const int BIG_DIFF = 128;
+
+    const int blSize = blockSize.area();
+    const int acceptLevel = blSize * SMALL_DIFF; // both thresholds scale with the block area; how they are applied is inside optflowbm::calc (not visible here)
+    const int escapeLevel = blSize * BIG_DIFF;
+
+    optflowbm::calc(prev, curr, velx, vely,
+                    make_int2(blockSize.width, blockSize.height), make_int2(shiftSize.width, shiftSize.height), usePrevious,
+                    maxX, maxY, acceptLevel, escapeLevel, buf.ptr<short2>(), ssCount, stream);
+}
+
+namespace optflowbm_fast // declarations only: neither function is defined in this file (presumably provided by the CUDA sources — confirm)
+{
+    void get_buffer_size(int src_cols, int src_rows, int search_window, int block_window, int& buffer_cols, int& buffer_rows); // reports the scratch-buffer dimensions through the two reference out-parameters
+
+    template <typename T>
+    void calc(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream); // called below with T = uchar
+}
+
+void cv::gpu::FastOpticalFlowBM::operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, int search_window, int block_window, Stream& stream)
+{   // Pads both frames with a border, then runs optflowbm_fast::calc on interior views of the padded copies to fill flowx / flowy.
+    CV_Assert( I0.type() == CV_8UC1 ); // only single-channel 8-bit frames are accepted
+    CV_Assert( I1.size() == I0.size() && I1.type() == I0.type() );
+
+    int border_size = search_window / 2 + block_window / 2; // half the search window plus half the block window
+    Size esize = I0.size() + Size(border_size, border_size) * 2; // padded size: border_size added on every side
+
+    ensureSizeIsEnough(esize, I0.type(), extended_I0); // extended_I0 / extended_I1 / buffer are not declared in this function — presumably class members reused across calls
+    ensureSizeIsEnough(esize, I0.type(), extended_I1);
+
+    copyMakeBorder(I0, extended_I0, border_size, border_size, border_size, border_size, cv::BORDER_DEFAULT, Scalar(), stream);
+    copyMakeBorder(I1, extended_I1, border_size, border_size, border_size, border_size, cv::BORDER_DEFAULT, Scalar(), stream);
+
+    GpuMat I0_hdr = extended_I0(Rect(Point2i(border_size, border_size), I0.size())); // sub-rectangle of the padded image at offset (border_size, border_size) with the original size
+    GpuMat I1_hdr = extended_I1(Rect(Point2i(border_size, border_size), I0.size()));
+
+    int bcols, brows; // filled in by get_buffer_size below
+    optflowbm_fast::get_buffer_size(I0.cols, I0.rows, search_window, block_window, bcols, brows);
+
+    ensureSizeIsEnough(brows, bcols, CV_32SC1, buffer);
+
+    flowx.create(I0.size(), CV_32FC1); // outputs have the input size, single-channel float
+    flowy.create(I0.size(), CV_32FC1);
+
+    optflowbm_fast::calc<uchar>(I0_hdr, I1_hdr, flowx, flowy, buffer, search_window, block_window, StreamAccessor::getStream(stream));
+}
+
+#endif // HAVE_CUDA
diff --git a/modules/gpu/src/tvl1flow.cpp b/modules/gpu/src/tvl1flow.cpp
new file mode 100644
index 0000000000..a598a9ecf0
--- /dev/null
+++ b/modules/gpu/src/tvl1flow.cpp
@@ -0,0 +1,256 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+
+cv::gpu::OpticalFlowDual_TVL1_GPU::OpticalFlowDual_TVL1_GPU() { throw_nogpu(); } // stubs for builds where HAVE_CUDA is undefined or CUDA_DISABLER is defined
+void cv::gpu::OpticalFlowDual_TVL1_GPU::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::OpticalFlowDual_TVL1_GPU::collectGarbage() {} // the only stub that does not call throw_nogpu(); its body is empty
+void cv::gpu::OpticalFlowDual_TVL1_GPU::procOneScale(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
+
+#else
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+cv::gpu::OpticalFlowDual_TVL1_GPU::OpticalFlowDual_TVL1_GPU()
+{   // Assigns the default algorithm parameters.
+    tau            = 0.25; // divided by theta to form taut in procOneScale
+    lambda         = 0.15; // multiplied by theta to form l_t in procOneScale
+    theta          = 0.3;
+    nscales        = 5; // requested number of pyramid levels in operator()
+    warps          = 5; // outer (warping) loop count in procOneScale
+    epsilon        = 0.01; // squared and multiplied by the pixel count to form the stopping threshold in procOneScale
+    iterations     = 300; // upper bound on inner iterations per warp in procOneScale
+    useInitialFlow = false; // when false, operator() allocates flowx / flowy itself
+}
+
+void cv::gpu::OpticalFlowDual_TVL1_GPU::operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy)
+{   // Builds image pyramids for I0 / I1, then runs procOneScale from the coarsest level to the finest, upsampling the flow between levels.
+    CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 );
+    CV_Assert( I0.size() == I1.size() );
+    CV_Assert( I0.type() == I1.type() );
+    CV_Assert( !useInitialFlow || (flowx.size() == I0.size() && flowx.type() == CV_32FC1 && flowy.size() == flowx.size() && flowy.type() == flowx.type()) ); // a caller-supplied initial flow must already match the frame size and be CV_32FC1
+    CV_Assert( nscales > 0 );
+
+    // allocate memory for the pyramid structure
+    I0s.resize(nscales);
+    I1s.resize(nscales);
+    u1s.resize(nscales); // NOTE(review): resize keeps entries left over from an earlier call; procOneScale only zero-fills an empty u1 — confirm stale coarse flow cannot be reused when useInitialFlow is false
+    u2s.resize(nscales);
+
+    I0.convertTo(I0s[0], CV_32F, I0.depth() == CV_8U ? 1.0 : 255.0); // 8-bit input keeps its values; float input is multiplied by 255 (presumably it is expected in [0, 1] — confirm)
+    I1.convertTo(I1s[0], CV_32F, I1.depth() == CV_8U ? 1.0 : 255.0);
+
+    if (!useInitialFlow)
+    {
+        flowx.create(I0.size(), CV_32FC1);
+        flowy.create(I0.size(), CV_32FC1);
+    }
+
+    u1s[0] = flowx; // level 0 of the flow pyramid is the caller's output matrices
+    u2s[0] = flowy;
+
+    I1x_buf.create(I0.size(), CV_32FC1); // scratch buffers are sized for the finest level; procOneScale takes sub-rectangles of them
+    I1y_buf.create(I0.size(), CV_32FC1);
+
+    I1w_buf.create(I0.size(), CV_32FC1);
+    I1wx_buf.create(I0.size(), CV_32FC1);
+    I1wy_buf.create(I0.size(), CV_32FC1);
+
+    grad_buf.create(I0.size(), CV_32FC1);
+    rho_c_buf.create(I0.size(), CV_32FC1);
+
+    p11_buf.create(I0.size(), CV_32FC1);
+    p12_buf.create(I0.size(), CV_32FC1);
+    p21_buf.create(I0.size(), CV_32FC1);
+    p22_buf.create(I0.size(), CV_32FC1);
+
+    diff_buf.create(I0.size(), CV_32FC1);
+
+    // create the scales
+    for (int s = 1; s < nscales; ++s)
+    {
+        gpu::pyrDown(I0s[s - 1], I0s[s]);
+        gpu::pyrDown(I1s[s - 1], I1s[s]);
+
+        if (I0s[s].cols < 16 || I0s[s].rows < 16) // stop once a level is narrower or shorter than 16 pixels; that level is not used
+        {
+            nscales = s; // NOTE(review): nscales is not a local (the constructor assigns it too), so this reduced value persists into later calls
+            break;
+        }
+
+        if (useInitialFlow)
+        {
+            gpu::pyrDown(u1s[s - 1], u1s[s]);
+            gpu::pyrDown(u2s[s - 1], u2s[s]);
+
+            gpu::multiply(u1s[s], Scalar::all(0.5), u1s[s]); // the initial flow is multiplied by 0.5 at each coarser level
+            gpu::multiply(u2s[s], Scalar::all(0.5), u2s[s]);
+        }
+    }
+
+    // pyramidal structure for computing the optical flow
+    for (int s = nscales - 1; s >= 0; --s)
+    {
+        // compute the optical flow at the current scale
+        procOneScale(I0s[s], I1s[s], u1s[s], u2s[s]);
+
+        // if this was the last scale, finish now
+        if (s == 0)
+            break;
+
+        // otherwise, upsample the optical flow
+
+        // zoom the optical flow for the next finer scale
+        gpu::resize(u1s[s], u1s[s - 1], I0s[s - 1].size());
+        gpu::resize(u2s[s], u2s[s - 1], I0s[s - 1].size());
+
+        // scale the optical flow with the appropriate zoom factor
+        gpu::multiply(u1s[s - 1], Scalar::all(2), u1s[s - 1]);
+        gpu::multiply(u2s[s - 1], Scalar::all(2), u2s[s - 1]);
+    }
+}
+
+namespace tvl1flow // declarations only: none of these functions is defined in this file (presumably provided by the CUDA sources — confirm)
+{
+    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy);
+    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho);
+    void estimateU(PtrStepSzf I1wx, PtrStepSzf I1wy,
+                   PtrStepSzf grad, PtrStepSzf rho_c,
+                   PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22,
+                   PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf error,
+                   float l_t, float theta); // procOneScale passes its diff buffer as error and sums it afterwards
+    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, float taut);
+}
+
+void cv::gpu::OpticalFlowDual_TVL1_GPU::procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2)
+{   // Refines the flow (u1, u2) for one pyramid level: `warps` warping passes, each iterating estimateU / estimateDualVariables until the error or iteration limit is hit.
+    using namespace tvl1flow;
+
+    const double scaledEpsilon = epsilon * epsilon * I0.size().area(); // stopping threshold: epsilon squared times the pixel count of this level
+
+    CV_DbgAssert( I1.size() == I0.size() );
+    CV_DbgAssert( I1.type() == I0.type() );
+    CV_DbgAssert( u1.empty() || u1.size() == I0.size() );
+    CV_DbgAssert( u2.size() == u1.size() );
+
+    if (u1.empty()) // no incoming flow for this level: start from zero
+    {
+        u1.create(I0.size(), CV_32FC1);
+        u1.setTo(Scalar::all(0));
+
+        u2.create(I0.size(), CV_32FC1);
+        u2.setTo(Scalar::all(0));
+    }
+
+    GpuMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows)); // every working matrix below is a top-left sub-rectangle of a *_buf buffer, sized to this level
+    GpuMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows));
+    centeredGradient(I1, I1x, I1y);
+
+    GpuMat I1w = I1w_buf(Rect(0, 0, I0.cols, I0.rows));
+    GpuMat I1wx = I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
+    GpuMat I1wy = I1wy_buf(Rect(0, 0, I0.cols, I0.rows));
+
+    GpuMat grad = grad_buf(Rect(0, 0, I0.cols, I0.rows));
+    GpuMat rho_c = rho_c_buf(Rect(0, 0, I0.cols, I0.rows));
+
+    GpuMat p11 = p11_buf(Rect(0, 0, I0.cols, I0.rows));
+    GpuMat p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows));
+    GpuMat p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows));
+    GpuMat p22 = p22_buf(Rect(0, 0, I0.cols, I0.rows));
+    p11.setTo(Scalar::all(0)); // the four p matrices are reset to zero at the start of every level
+    p12.setTo(Scalar::all(0));
+    p21.setTo(Scalar::all(0));
+    p22.setTo(Scalar::all(0));
+
+    GpuMat diff = diff_buf(Rect(0, 0, I0.cols, I0.rows));
+
+    const float l_t = static_cast<float>(lambda * theta);
+    const float taut = static_cast<float>(tau / theta);
+
+    for (int warpings = 0; warpings < warps; ++warpings)
+    {
+        warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c);
+
+        double error = numeric_limits<double>::max(); // forces at least one inner iteration whenever iterations > 0
+        for (int n = 0; error > scaledEpsilon && n < iterations; ++n)
+        {
+            estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, diff, l_t, static_cast<float>(theta));
+
+            error = gpu::sum(diff, norm_buf)[0]; // gpu::sum returns a Scalar read on the host every iteration (presumably a device-to-host sync point — confirm)
+
+            estimateDualVariables(u1, u2, p11, p12, p21, p22, taut);
+        }
+    }
+}
+
+void cv::gpu::OpticalFlowDual_TVL1_GPU::collectGarbage()
+{   // Empties the four pyramid containers and calls release() on every scratch buffer used by operator() and procOneScale.
+    I0s.clear();
+    I1s.clear();
+    u1s.clear(); // NOTE(review): u1s[0] / u2s[0] were assigned from the caller's flowx / flowy — presumably shared headers, so the caller's data should survive; confirm
+    u2s.clear();
+
+    I1x_buf.release();
+    I1y_buf.release();
+
+    I1w_buf.release();
+    I1wx_buf.release();
+    I1wy_buf.release();
+
+    grad_buf.release();
+    rho_c_buf.release();
+
+    p11_buf.release();
+    p12_buf.release();
+    p21_buf.release();
+    p22_buf.release();
+
+    diff_buf.release();
+    norm_buf.release(); // scratch buffer passed to gpu::sum in procOneScale
+}
+
+#endif // !defined HAVE_CUDA || defined(CUDA_DISABLER)
diff --git a/modules/gpu/test/test_core.cpp b/modules/gpu/test/test_core.cpp
index f3917d59b5..736256cd51 100644
--- a/modules/gpu/test/test_core.cpp
+++ b/modules/gpu/test/test_core.cpp
@@ -1669,6 +1669,117 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Compare_Array, testing::Combine(
     ALL_CMP_CODES,
     WHOLE_SUBMAT));
 
+////////////////////////////////////////////////////////////////////////////////
+// Compare_Scalar
+
+namespace // file-local CPU reference implementation used by the Compare_Scalar test
+{
+    template <template <typename> class Op, typename T>
+    void compareScalarImpl(const cv::Mat& src, cv::Scalar sc, cv::Mat& dst) // applies the binary predicate Op<T> element-wise between src and the per-channel scalar
+    {
+        Op<T> op;
+
+        const int cn = src.channels();
+
+        dst.create(src.size(), CV_MAKE_TYPE(CV_8U, cn)); // result is 8-bit with the same channel count as src
+
+        for (int y = 0; y < src.rows; ++y)
+        {
+            for (int x = 0; x < src.cols; ++x)
+            {
+                for (int c = 0; c < cn; ++c)
+                {
+                    T src_val = src.at<T>(y, x * cn + c); // the matrix is addressed as single-channel T, so the column index is x * cn + c
+                    T sc_val = cv::saturate_cast<T>(sc.val[c]); // the scalar is converted to the element type before comparing
+                    dst.at<uchar>(y, x * cn + c) = static_cast<uchar>(static_cast<int>(op(src_val, sc_val)) * 255); // true -> 255, false -> 0
+                }
+            }
+        }
+    }
+
+    void compareScalarGold(const cv::Mat& src, cv::Scalar sc, cv::Mat& dst, int cmpop) // dispatches to the compareScalarImpl instantiation for src's depth and the requested comparison
+    {
+        typedef void (*func_t)(const cv::Mat& src, cv::Scalar sc, cv::Mat& dst);
+        static const func_t funcs[7][6] = // indexed as funcs[src.depth()][cmpop]; column order: ==, >, >=, <, <=, !=
+        {
+            {compareScalarImpl<std::equal_to, unsigned char> , compareScalarImpl<std::greater, unsigned char> , compareScalarImpl<std::greater_equal, unsigned char> , compareScalarImpl<std::less, unsigned char> , compareScalarImpl<std::less_equal, unsigned char> , compareScalarImpl<std::not_equal_to, unsigned char> },
+            {compareScalarImpl<std::equal_to, signed char>   , compareScalarImpl<std::greater, signed char>   , compareScalarImpl<std::greater_equal, signed char>   , compareScalarImpl<std::less, signed char>   , compareScalarImpl<std::less_equal, signed char>   , compareScalarImpl<std::not_equal_to, signed char>   },
+            {compareScalarImpl<std::equal_to, unsigned short>, compareScalarImpl<std::greater, unsigned short>, compareScalarImpl<std::greater_equal, unsigned short>, compareScalarImpl<std::less, unsigned short>, compareScalarImpl<std::less_equal, unsigned short>, compareScalarImpl<std::not_equal_to, unsigned short>},
+            {compareScalarImpl<std::equal_to, short>         , compareScalarImpl<std::greater, short>         , compareScalarImpl<std::greater_equal, short>         , compareScalarImpl<std::less, short>         , compareScalarImpl<std::less_equal, short>         , compareScalarImpl<std::not_equal_to, short>         },
+            {compareScalarImpl<std::equal_to, int>           , compareScalarImpl<std::greater, int>           , compareScalarImpl<std::greater_equal, int>           , compareScalarImpl<std::less, int>           , compareScalarImpl<std::less_equal, int>           , compareScalarImpl<std::not_equal_to, int>           },
+            {compareScalarImpl<std::equal_to, float>         , compareScalarImpl<std::greater, float>         , compareScalarImpl<std::greater_equal, float>         , compareScalarImpl<std::less, float>         , compareScalarImpl<std::less_equal, float>         , compareScalarImpl<std::not_equal_to, float>         },
+            {compareScalarImpl<std::equal_to, double>        , compareScalarImpl<std::greater, double>        , compareScalarImpl<std::greater_equal, double>        , compareScalarImpl<std::less, double>        , compareScalarImpl<std::less_equal, double>        , compareScalarImpl<std::not_equal_to, double>        }
+        };
+
+        funcs[src.depth()][cmpop](src, sc, dst); // no bounds check: assumes depth < 7 and cmpop < 6
+    }
+}
+
+PARAM_TEST_CASE(Compare_Scalar, cv::gpu::DeviceInfo, cv::Size, MatType, CmpCode, UseRoi) // fixture parameterised by device, matrix size, matrix type, comparison code and ROI flag
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    int cmp_code;
+    bool useRoi;
+
+    virtual void SetUp() // unpacks the parameter tuple and selects the device under test
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        cmp_code = GET_PARAM(3);
+        useRoi = GET_PARAM(4);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(Compare_Scalar, Accuracy) // checks cv::gpu::compare(matrix, scalar) against the CPU reference compareScalarGold
+{
+    cv::Mat src = randomMat(size, type);
+    cv::Scalar sc = randomScalar(0.0, 255.0);
+
+    if (src.depth() < CV_32F) // for the depths below CV_32F the scalar is rounded to whole numbers first
+    {
+        sc.val[0] = cvRound(sc.val[0]);
+        sc.val[1] = cvRound(sc.val[1]);
+        sc.val[2] = cvRound(sc.val[2]);
+        sc.val[3] = cvRound(sc.val[3]);
+    }
+
+    if (src.depth() == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) // device without native double support: only the error code is checked
+    {
+        try
+        {
+            cv::gpu::GpuMat dst;
+            cv::gpu::compare(loadMat(src), sc, dst, cmp_code); // NOTE(review): nothing fails the test if this call does not throw
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
+        }
+    }
+    else
+    {
+        cv::gpu::GpuMat dst = createMat(size, CV_MAKE_TYPE(CV_8U, src.channels()), useRoi);
+
+        cv::gpu::compare(loadMat(src, useRoi), sc, dst, cmp_code);
+
+        cv::Mat dst_gold;
+        compareScalarGold(src, sc, dst_gold, cmp_code);
+
+        EXPECT_MAT_NEAR(dst_gold, dst, 0.0); // tolerance 0: results must match exactly
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Core, Compare_Scalar, testing::Combine( // one generator per fixture parameter, in fixture order
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    TYPES(CV_8U, CV_64F, 1, 4), // presumably depths CV_8U..CV_64F with 1..4 channels — confirm against the TYPES macro
+    ALL_CMP_CODES,
+    WHOLE_SUBMAT));
+
 //////////////////////////////////////////////////////////////////////////////
 // Bitwise_Array
 
@@ -2807,10 +2918,12 @@ PARAM_TEST_CASE(Norm, cv::gpu::DeviceInfo, cv::Size, MatDepth, NormCode, UseRoi)
 GPU_TEST_P(Norm, Accuracy)
 {
     cv::Mat src = randomMat(size, depth);
+    cv::Mat mask = randomMat(size, CV_8UC1, 0, 2); // random 8-bit mask; presumably values are 0 or 1 — confirm randomMat's upper bound is exclusive
 
-    double val = cv::gpu::norm(loadMat(src, useRoi), normCode);
+    cv::gpu::GpuMat d_buf; // scratch buffer handed to the masked norm overload
+    double val = cv::gpu::norm(loadMat(src, useRoi), normCode, loadMat(mask, useRoi), d_buf);
 
-    double val_gold = cv::norm(src, normCode);
+    double val_gold = cv::norm(src, normCode, mask); // CPU reference computed with the same mask
 
     EXPECT_NEAR(val_gold, val, depth < CV_32F ? 0.0 : 1.0);
 }
@@ -3427,4 +3540,70 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Reduce, testing::Combine(
     ALL_REDUCE_CODES,
     WHOLE_SUBMAT));
 
+//////////////////////////////////////////////////////////////////////////////
+// Normalize
+
+PARAM_TEST_CASE(Normalize, cv::gpu::DeviceInfo, cv::Size, MatDepth, NormCode, UseRoi) // fixture parameterised by device, size, depth, norm type and ROI flag
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type; // filled from the MatDepth parameter
+    int norm_type;
+    bool useRoi;
+
+    double alpha;
+    double beta;
+
+    virtual void SetUp() // unpacks the parameter tuple, selects the device and fixes alpha / beta
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        norm_type = GET_PARAM(3);
+        useRoi = GET_PARAM(4);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+
+        alpha = 1; // the same alpha / beta are used for every norm type
+        beta = 0;
+    }
+
+};
+
+GPU_TEST_P(Normalize, WithOutMask) // compares cv::gpu::normalize with cv::normalize when no mask is given
+{
+    cv::Mat src = randomMat(size, type);
+
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    cv::gpu::normalize(loadMat(src, useRoi), dst, alpha, beta, norm_type, type);
+
+    cv::Mat dst_gold;
+    cv::normalize(src, dst_gold, alpha, beta, norm_type, type);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, 1e-6);
+}
+
+GPU_TEST_P(Normalize, WithMask) // compares cv::gpu::normalize with cv::normalize when a random mask is given
+{
+    cv::Mat src = randomMat(size, type);
+    cv::Mat mask = randomMat(size, CV_8UC1, 0, 2); // presumably values are 0 or 1 — confirm randomMat's upper bound is exclusive
+
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    dst.setTo(cv::Scalar::all(0)); // both outputs are zeroed first (presumably because masked-out elements are left untouched — confirm)
+    cv::gpu::normalize(loadMat(src, useRoi), dst, alpha, beta, norm_type, type, loadMat(mask, useRoi));
+
+    cv::Mat dst_gold(size, type);
+    dst_gold.setTo(cv::Scalar::all(0));
+    cv::normalize(src, dst_gold, alpha, beta, norm_type, type, mask);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, 1e-6);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Core, Normalize, testing::Combine( // one generator per fixture parameter, in fixture order
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    ALL_DEPTH,
+    testing::Values(NormCode(cv::NORM_L1), NormCode(cv::NORM_L2), NormCode(cv::NORM_INF), NormCode(cv::NORM_MINMAX)), // the four norm types exercised
+    WHOLE_SUBMAT));
+
 #endif // HAVE_CUDA
diff --git a/modules/gpu/test/test_optflow.cpp b/modules/gpu/test/test_optflow.cpp
index 6bc471ecef..a97516f6d1 100644
--- a/modules/gpu/test/test_optflow.cpp
+++ b/modules/gpu/test/test_optflow.cpp
@@ -401,4 +401,223 @@ INSTANTIATE_TEST_CASE_P(GPU_Video, FarnebackOpticalFlow, testing::Combine(
     testing::Values(FarnebackOptFlowFlags(0), FarnebackOptFlowFlags(cv::OPTFLOW_FARNEBACK_GAUSSIAN)),
     testing::Values(UseInitFlow(false), UseInitFlow(true))));
 
+//////////////////////////////////////////////////////
+// OpticalFlowDual_TVL1
+
+PARAM_TEST_CASE(OpticalFlowDual_TVL1, cv::gpu::DeviceInfo, UseRoi) // fixture parameterised by device and ROI flag
+{
+    cv::gpu::DeviceInfo devInfo;
+    bool useRoi;
+
+    virtual void SetUp() // unpacks the parameter tuple and selects the device under test
+    {
+        devInfo = GET_PARAM(0);
+        useRoi = GET_PARAM(1);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(OpticalFlowDual_TVL1, Accuracy) // compares the GPU TV-L1 flow with the CPU DenseOpticalFlow result on the rubberwhale pair
+{
+    cv::Mat frame0 = readImage("opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty());
+
+    cv::Mat frame1 = readImage("opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+
+    cv::gpu::OpticalFlowDual_TVL1_GPU d_alg; // default-constructed, so the constructor's default parameters apply
+    cv::gpu::GpuMat d_flowx = createMat(frame0.size(), CV_32FC1, useRoi);
+    cv::gpu::GpuMat d_flowy = createMat(frame0.size(), CV_32FC1, useRoi);
+    d_alg(loadMat(frame0, useRoi), loadMat(frame1, useRoi), d_flowx, d_flowy);
+
+    cv::Ptr<cv::DenseOpticalFlow> alg = cv::createOptFlow_DualTVL1();
+    cv::Mat flow;
+    alg->calc(frame0, frame1, flow);
+    cv::Mat gold[2]; // CPU flow split into two planes, compared with flowx and flowy respectively
+    cv::split(flow, gold);
+
+    EXPECT_MAT_SIMILAR(gold[0], d_flowx, 3e-3);
+    EXPECT_MAT_SIMILAR(gold[1], d_flowy, 3e-3);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Video, OpticalFlowDual_TVL1, testing::Combine( // one generator per fixture parameter: device, then ROI flag
+    ALL_DEVICES,
+    WHOLE_SUBMAT));
+
+//////////////////////////////////////////////////////
+// OpticalFlowBM
+
+namespace // file-local CPU reference wrapper for the OpticalFlowBM test
+{
+    void calcOpticalFlowBM(const cv::Mat& prev, const cv::Mat& curr,
+                           cv::Size bSize, cv::Size shiftSize, cv::Size maxRange, int usePrevious,
+                           cv::Mat& velx, cv::Mat& vely) // allocates velx / vely and fills them through the C function cvCalcOpticalFlowBM
+    {
+        cv::Size sz((curr.cols - bSize.width + shiftSize.width)/shiftSize.width, (curr.rows - bSize.height + shiftSize.height)/shiftSize.height); // same output-size formula as cv::gpu::calcOpticalFlowBM
+
+        velx.create(sz, CV_32FC1);
+        vely.create(sz, CV_32FC1);
+
+        CvMat cvprev = prev; // CvMat objects built from the cv::Mat arguments for the C API call
+        CvMat cvcurr = curr;
+
+        CvMat cvvelx = velx;
+        CvMat cvvely = vely;
+
+        cvCalcOpticalFlowBM(&cvprev, &cvcurr, bSize, shiftSize, maxRange, usePrevious, &cvvelx, &cvvely);
+    }
+}
+
+struct OpticalFlowBM : testing::TestWithParam<cv::gpu::DeviceInfo> // empty fixture; the test body reads the device via GetParam()
+{
+};
+
+GPU_TEST_P(OpticalFlowBM, Accuracy) // requires the GPU block-matching flow to equal the CPU result exactly
+{
+    cv::gpu::DeviceInfo devInfo = GetParam();
+    cv::gpu::setDevice(devInfo.deviceID());
+
+    cv::Mat frame0 = readImage("opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty());
+
+    cv::Mat frame1 = readImage("opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+
+    cv::Size block_size(16, 16);
+    cv::Size shift_size(1, 1);
+    cv::Size max_range(16, 16); // equal width and height, so only the square-ring part of the GPU search table is exercised
+
+    cv::gpu::GpuMat d_velx, d_vely, buf;
+    cv::gpu::calcOpticalFlowBM(loadMat(frame0), loadMat(frame1),
+                               block_size, shift_size, max_range, false,
+                               d_velx, d_vely, buf); // no Stream argument is passed here
+
+    cv::Mat velx, vely;
+    calcOpticalFlowBM(frame0, frame1, block_size, shift_size, max_range, false, velx, vely); // unqualified call: resolves to the anonymous-namespace CPU wrapper in this file
+
+    EXPECT_MAT_NEAR(velx, d_velx, 0); // tolerance 0: results must match exactly
+    EXPECT_MAT_NEAR(vely, d_vely, 0);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Video, OpticalFlowBM, ALL_DEVICES); // the device is the only test parameter
+
+//////////////////////////////////////////////////////
+// FastOpticalFlowBM
+
+namespace // file-local CPU reference and error metric for the FastOpticalFlowBM test
+{
+    void FastOpticalFlowBM_gold(const cv::Mat_<uchar>& I0, const cv::Mat_<uchar>& I1, cv::Mat_<float>& velx, cv::Mat_<float>& vely, int search_window, int block_window) // exhaustive search: per pixel, the displacement with the smallest sum of absolute differences
+    {
+        velx.create(I0.size());
+        vely.create(I0.size());
+
+        int search_radius = search_window / 2;
+        int block_radius = block_window / 2;
+
+        for (int y = 0; y < I0.rows; ++y)
+        {
+            for (int x = 0; x < I0.cols; ++x)
+            {
+                int bestDist = std::numeric_limits<int>::max();
+                int bestDx = 0;
+                int bestDy = 0;
+
+                for (int dy = -search_radius; dy <= search_radius; ++dy) // candidate displacements span [-search_radius, search_radius] on both axes
+                {
+                    for (int dx = -search_radius; dx <= search_radius; ++dx)
+                    {
+                        int dist = 0; // sum of absolute differences over the block for this displacement
+
+                        for (int by = -block_radius; by <= block_radius; ++by)
+                        {
+                            for (int bx = -block_radius; bx <= block_radius; ++bx)
+                            {
+                                int I0_val = I0(cv::borderInterpolate(y + by, I0.rows, cv::BORDER_DEFAULT), cv::borderInterpolate(x + bx, I0.cols, cv::BORDER_DEFAULT)); // out-of-range coordinates are remapped with BORDER_DEFAULT
+                                int I1_val = I1(cv::borderInterpolate(y + dy + by, I0.rows, cv::BORDER_DEFAULT), cv::borderInterpolate(x + dx + bx, I0.cols, cv::BORDER_DEFAULT));
+
+                                dist += std::abs(I0_val - I1_val);
+                            }
+                        }
+
+                        if (dist < bestDist) // strict comparison: on ties the first displacement in scan order is kept
+                        {
+                            bestDist = dist;
+                            bestDx = dx;
+                            bestDy = dy;
+                        }
+                    }
+                }
+
+                velx(y, x) = (float) bestDx;
+                vely(y, x) = (float) bestDy;
+            }
+        }
+    }
+
+    double calc_rmse(const cv::Mat_<float>& flow1, const cv::Mat_<float>& flow2) // root-mean-square difference; iterates over flow1's size and assumes flow2 is at least as large
+    {
+        double sum = 0.0;
+
+        for (int y = 0; y < flow1.rows; ++y)
+        {
+            for (int x = 0; x < flow1.cols; ++x)
+            {
+                double diff = flow1(y, x) - flow2(y, x);
+                sum += diff * diff;
+            }
+        }
+
+        return std::sqrt(sum / flow1.size().area()); // NOTE(review): an empty flow1 makes the divisor 0
+    }
+}
+
+struct FastOpticalFlowBM : testing::TestWithParam<cv::gpu::DeviceInfo> // empty fixture; the test body reads the device via GetParam()
+{
+};
+
+GPU_TEST_P(FastOpticalFlowBM, Accuracy) // checks that the GPU flow stays within MAX_RMSE of the exhaustive CPU reference on downscaled frames
+{
+    const double MAX_RMSE = 0.6; // allowed root-mean-square difference per flow component
+
+    int search_window = 15;
+    int block_window = 5;
+
+    cv::gpu::DeviceInfo devInfo = GetParam();
+    cv::gpu::setDevice(devInfo.deviceID());
+
+    cv::Mat frame0 = readImage("opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty());
+
+    cv::Mat frame1 = readImage("opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+
+    cv::Size smallSize(320, 240); // frames are resized to 320x240 before both implementations run (presumably to bound the cost of the CPU reference — confirm)
+    cv::Mat frame0_small;
+    cv::Mat frame1_small;
+
+    cv::resize(frame0, frame0_small, smallSize);
+    cv::resize(frame1, frame1_small, smallSize);
+
+    cv::gpu::GpuMat d_flowx;
+    cv::gpu::GpuMat d_flowy;
+    cv::gpu::FastOpticalFlowBM fastBM;
+
+    fastBM(loadMat(frame0_small), loadMat(frame1_small), d_flowx, d_flowy, search_window, block_window);
+
+    cv::Mat_<float> flowx;
+    cv::Mat_<float> flowy;
+    FastOpticalFlowBM_gold(frame0_small, frame1_small, flowx, flowy, search_window, block_window);
+
+    double err;
+
+    err = calc_rmse(flowx, cv::Mat(d_flowx)); // the GPU result is converted to cv::Mat for the host-side comparison
+    EXPECT_LE(err, MAX_RMSE);
+
+    err = calc_rmse(flowy, cv::Mat(d_flowy));
+    EXPECT_LE(err, MAX_RMSE);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Video, FastOpticalFlowBM, ALL_DEVICES); // the device is the only test parameter
+
 #endif // HAVE_CUDA
diff --git a/modules/gpu/test/test_stream.cpp b/modules/gpu/test/test_stream.cpp
new file mode 100644
index 0000000000..4adac41292
--- /dev/null
+++ b/modules/gpu/test/test_stream.cpp
@@ -0,0 +1,130 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+#if CUDA_VERSION >= 5000
+
+struct Async : testing::TestWithParam<cv::gpu::DeviceInfo>
+{ // Fixture shared by the stream tests; the host callbacks receive it through their void* user data.
+    cv::gpu::CudaMem src; // allocated with ALLOC_PAGE_LOCKED and filled in SetUp()
+    cv::gpu::GpuMat d_src;
+
+    cv::gpu::CudaMem dst; // target of the enqueueDownload calls in the tests
+    cv::gpu::GpuMat d_dst;
+
+    virtual void SetUp()
+    {
+        cv::gpu::DeviceInfo devInfo = GetParam();
+        cv::gpu::setDevice(devInfo.deviceID());
+
+        cv::Mat m = randomMat(cv::Size(128, 128), CV_8UC1); // 128x128 single-channel 8-bit input
+        src.create(m.size(), m.type(), cv::gpu::CudaMem::ALLOC_PAGE_LOCKED);
+        m.copyTo(src.createMatHeader()); // copy the data into src through a Mat header
+    }
+};
+
+void checkMemSet(cv::gpu::Stream&, int status, void* userData)
+{ // Host callback for the MemSet test: userData is the Async fixture; dst must be all zeros.
+    ASSERT_EQ(cudaSuccess, status);
+
+    Async* test = reinterpret_cast<Async*>(userData);
+
+    cv::Mat src = test->src;
+    cv::Mat dst = test->dst;
+
+    cv::Mat dst_gold = cv::Mat::zeros(src.size(), src.type()); // expected result: zeros with src's size and type
+
+    ASSERT_MAT_NEAR(dst_gold, dst, 0);
+}
+
+GPU_TEST_P(Async, MemSet)
+{ // Enqueues memset -> download -> host callback on one stream, then waits for completion.
+    cv::gpu::Stream stream;
+
+    d_dst.upload(src); // direct upload, not enqueued on the stream
+
+    stream.enqueueMemSet(d_dst, cv::Scalar::all(0));
+    stream.enqueueDownload(d_dst, dst);
+
+    Async* test = this;
+    stream.enqueueHostCallback(checkMemSet, test); // the check itself runs in checkMemSet
+
+    stream.waitForCompletion();
+}
+
+void checkConvert(cv::gpu::Stream&, int status, void* userData)
+{ // Host callback for the Convert test: dst must equal src converted to CV_32S on the host.
+    ASSERT_EQ(cudaSuccess, status);
+
+    Async* test = reinterpret_cast<Async*>(userData);
+
+    cv::Mat src = test->src;
+    cv::Mat dst = test->dst;
+
+    cv::Mat dst_gold;
+    src.convertTo(dst_gold, CV_32S); // host-side reference conversion
+
+    ASSERT_MAT_NEAR(dst_gold, dst, 0);
+}
+
+GPU_TEST_P(Async, Convert)
+{ // Enqueues upload -> convert to CV_32S -> download -> host callback, then waits for completion.
+    cv::gpu::Stream stream;
+
+    stream.enqueueUpload(src, d_src);
+    stream.enqueueConvert(d_src, d_dst, CV_32S);
+    stream.enqueueDownload(d_dst, dst);
+
+    Async* test = this;
+    stream.enqueueHostCallback(checkConvert, test); // the check itself runs in checkConvert
+
+    stream.waitForCompletion();
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Stream, Async, ALL_DEVICES); // instantiate the suite over the ALL_DEVICES parameter set
+
+#endif
+
+#endif // HAVE_CUDA
diff --git a/modules/video/doc/motion_analysis_and_object_tracking.rst b/modules/video/doc/motion_analysis_and_object_tracking.rst
index 3a5d2bdd41..2c5dcb1a8e 100644
--- a/modules/video/doc/motion_analysis_and_object_tracking.rst
+++ b/modules/video/doc/motion_analysis_and_object_tracking.rst
@@ -641,6 +641,72 @@ Calculate an optical flow using "SimpleFlow" algorithm.
 
 See [Tao2012]_. And site of project - http://graphics.berkeley.edu/papers/Tao-SAN-2012-05/.
 
+
+
+createOptFlow_DualTVL1
+----------------------
+"Dual TV L1" Optical Flow Algorithm.
+
+.. ocv:function:: Ptr<DenseOpticalFlow> createOptFlow_DualTVL1()
+
+
+The class implements the "Dual TV L1" optical flow algorithm described in [Zach2007]_ and [Javier2012]_ .
+
+Here are important members of the class that control the algorithm, which you can set after constructing the class instance:
+
+    .. ocv:member:: double tau
+
+        Time step of the numerical scheme.
+
+    .. ocv:member:: double lambda
+
+        Weight parameter for the data term, attachment parameter. This is the most relevant parameter, which determines the smoothness of the output. The smaller this parameter is, the smoother the solutions we obtain. It depends on the range of motions of the images, so its value should be adapted to each image sequence.
+
+    .. ocv:member:: double theta
+
+        Weight parameter for (u - v)^2, tightness parameter. It serves as a link between the attachment and the regularization terms. In theory, it should have a small value in order to maintain both parts in correspondence. The method is stable for a large range of values of this parameter.
+
+    .. ocv:member:: int nscales
+
+        Number of scales used to create the pyramid of images.
+
+    .. ocv:member:: int warps
+
+        Number of warpings per scale. Represents the number of times that I1(x+u0) and grad( I1(x+u0) ) are computed per scale. This is a parameter that assures the stability of the method. It also affects the running time, so it is a compromise between speed and accuracy.
+
+    .. ocv:member:: double epsilon
+
+        Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time. A small value will yield more accurate solutions at the expense of a slower convergence.
+
+    .. ocv:member:: int iterations
+
+        Stopping criterion iterations number used in the numerical scheme.
+
+
+
+
+DenseOpticalFlow::calc
+--------------------------
+Calculates an optical flow.
+
+.. ocv:function:: void DenseOpticalFlow::calc(InputArray I0, InputArray I1, InputOutputArray flow)
+
+    :param I0: first 8-bit single-channel input image.
+
+    :param I1: second input image of the same size and the same type as ``I0`` .
+
+    :param flow: computed flow image that has the same size as ``I0`` and type ``CV_32FC2`` .
+
+
+
+DenseOpticalFlow::collectGarbage
+--------------------------------
+Releases all inner buffers.
+
+.. ocv:function:: void DenseOpticalFlow::collectGarbage()
+
+
+
 .. [Bouguet00] Jean-Yves Bouguet. Pyramidal Implementation of the Lucas Kanade Feature Tracker.
 
 .. [Bradski98] Bradski, G.R. "Computer Vision Face Tracking for Use in a Perceptual User Interface", Intel, 1998
@@ -658,3 +724,7 @@ See [Tao2012]_. And site of project - http://graphics.berkeley.edu/papers/Tao-SA
 .. [Welch95] Greg Welch and Gary Bishop “An Introduction to the Kalman Filter”, 1995
 
 .. [Tao2012] Michael Tao, Jiamin Bai, Pushmeet Kohli and Sylvain Paris. SimpleFlow: A Non-iterative, Sublinear Optical Flow Algorithm. Computer Graphics Forum (Eurographics 2012)
+
+.. [Zach2007] C. Zach, T. Pock and H. Bischof. "A Duality Based Approach for Realtime TV-L1 Optical Flow", In Proceedings of Pattern Recognition (DAGM), Heidelberg, Germany, pp. 214-223, 2007
+
+.. [Javier2012] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
diff --git a/modules/video/include/opencv2/video/tracking.hpp b/modules/video/include/opencv2/video/tracking.hpp
index b936e12e2f..48e84c8307 100644
--- a/modules/video/include/opencv2/video/tracking.hpp
+++ b/modules/video/include/opencv2/video/tracking.hpp
@@ -352,6 +352,20 @@ CV_EXPORTS_W void calcOpticalFlowSF(Mat& from,
                                     double upscale_sigma_color,
                                     double speed_up_thr);
 
+class CV_EXPORTS DenseOpticalFlow : public Algorithm
+{ // Abstract interface for dense optical flow algorithms; both methods are pure virtual.
+public:
+    virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow) = 0; // computes the flow between I0 and I1 into flow
+    virtual void collectGarbage() = 0; // documented as releasing all inner buffers
+};
+
+// Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method
+//
+// see reference:
+//   [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
+//   [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
+CV_EXPORTS Ptr<DenseOpticalFlow> createOptFlow_DualTVL1(); // factory; the result is used through the DenseOpticalFlow interface
+
 }
 
 #endif
diff --git a/modules/video/perf/perf_tvl1optflow.cpp b/modules/video/perf/perf_tvl1optflow.cpp
new file mode 100644
index 0000000000..ad90915128
--- /dev/null
+++ b/modules/video/perf/perf_tvl1optflow.cpp
@@ -0,0 +1,30 @@
+#include "perf_precomp.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace perf;
+
+typedef TestBaseWithParam< pair<string, string> > ImagePair;
+
+pair<string, string> impair(const char* im1, const char* im2)
+{ // Packs two C-string image paths into a (first, second) pair of strings.
+    return pair<string, string>(string(im1), string(im2));
+}
+
+PERF_TEST_P(ImagePair, OpticalFlowDual_TVL1, testing::Values(impair("cv/optflow/RubberWhale1.png", "cv/optflow/RubberWhale2.png")))
+{ // Times DenseOpticalFlow::calc of the Dual TV-L1 implementation on one grayscale image pair.
+    declare.time(40);
+
+    Mat frame1 = imread(getDataPath(GetParam().first), IMREAD_GRAYSCALE);
+    Mat frame2 = imread(getDataPath(GetParam().second), IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty()); // stop early if the test data could not be read
+    ASSERT_FALSE(frame2.empty());
+
+    Mat flow;
+
+    Ptr<DenseOpticalFlow> tvl1 = createOptFlow_DualTVL1();
+
+    TEST_CYCLE_N(10) tvl1->calc(frame1, frame2, flow); // the measured statement
+
+    SANITY_CHECK(flow, 0.5);
+}
diff --git a/modules/video/src/tvl1flow.cpp b/modules/video/src/tvl1flow.cpp
new file mode 100644
index 0000000000..bff1d7ec0d
--- /dev/null
+++ b/modules/video/src/tvl1flow.cpp
@@ -0,0 +1,937 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+/*
+//
+// This implementation is based on Javier Sánchez Pérez <jsanchez@dis.ulpgc.es> implementation.
+// Original BSD license:
+//
+// Copyright (c) 2011, Javier Sánchez Pérez, Enric Meinhardt Llopis
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice, this
+//   list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+*/
+
+#include "precomp.hpp"
+
+using namespace std;
+using namespace cv;
+
+namespace {
+
+class OpticalFlowDual_TVL1 : public DenseOpticalFlow
+{ // File-local DenseOpticalFlow implementation; calc() runs procOneScale() over an image pyramid.
+public:
+    OpticalFlowDual_TVL1();
+
+    void calc(InputArray I0, InputArray I1, InputOutputArray flow);
+    void collectGarbage();
+
+    AlgorithmInfo* info() const;
+
+protected:
+    double tau; // constructor default 0.25
+    double lambda; // constructor default 0.15
+    double theta; // constructor default 0.3
+    int nscales; // requested number of pyramid levels, constructor default 5
+    int warps; // constructor default 5
+    double epsilon; // constructor default 0.01
+    int iterations; // constructor default 300
+    bool useInitialFlow; // when true, calc() splits the incoming flow into u1s[0]/u2s[0]; default false
+
+private:
+    void procOneScale(const Mat_<float>& I0, const Mat_<float>& I1, Mat_<float>& u1, Mat_<float>& u2);
+
+    std::vector<Mat_<float> > I0s; // per-level pyramid of the first image, built in calc()
+    std::vector<Mat_<float> > I1s; // per-level pyramid of the second image, built in calc()
+    std::vector<Mat_<float> > u1s; // per-level first flow component (merged as channel 0 of the result)
+    std::vector<Mat_<float> > u2s; // per-level second flow component (merged as channel 1 of the result)
+
+    Mat_<float> I1x_buf; // this and every *_buf below is allocated at full input size in calc()
+    Mat_<float> I1y_buf;
+
+    Mat_<float> flowMap1_buf;
+    Mat_<float> flowMap2_buf;
+
+    Mat_<float> I1w_buf;
+    Mat_<float> I1wx_buf;
+    Mat_<float> I1wy_buf;
+
+    Mat_<float> grad_buf;
+    Mat_<float> rho_c_buf;
+
+    Mat_<float> v1_buf;
+    Mat_<float> v2_buf;
+
+    Mat_<float> p11_buf;
+    Mat_<float> p12_buf;
+    Mat_<float> p21_buf;
+    Mat_<float> p22_buf;
+
+    Mat_<float> div_p1_buf;
+    Mat_<float> div_p2_buf;
+
+    Mat_<float> u1x_buf;
+    Mat_<float> u1y_buf;
+    Mat_<float> u2x_buf;
+    Mat_<float> u2y_buf;
+};
+
+OpticalFlowDual_TVL1::OpticalFlowDual_TVL1()
+    : tau(0.25),
+      lambda(0.15),
+      theta(0.3),
+      nscales(5),
+      warps(5),
+      epsilon(0.01),
+      iterations(300),
+      useInitialFlow(false)
+{ // Only the tunable parameters get defaults here; the work buffers are sized in calc().
+}
+
+void OpticalFlowDual_TVL1::calc(InputArray _I0, InputArray _I1, InputOutputArray _flow)
+{ // Coarse-to-fine driver: validates inputs, builds the pyramids, solves each level, merges (u1, u2) into _flow.
+    Mat I0 = _I0.getMat();
+    Mat I1 = _I1.getMat();
+
+    CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 );
+    CV_Assert( I0.size() == I1.size() );
+    CV_Assert( I0.type() == I1.type() );
+    CV_Assert( !useInitialFlow || (_flow.size() == I0.size() && _flow.type() == CV_32FC2) );
+    CV_Assert( nscales > 0 );
+    int levels = nscales; // levels used by this call; kept local so the configured nscales is never overwritten
+    // allocate memory for the pyramid structure
+    I0s.resize(nscales);
+    I1s.resize(nscales);
+    u1s.resize(nscales);
+    u2s.resize(nscales);
+
+    I0.convertTo(I0s[0], I0s[0].depth(), I0.depth() == CV_8U ? 1.0 : 255.0); // non-8-bit (float) input is scaled by 255
+    I1.convertTo(I1s[0], I1s[0].depth(), I1.depth() == CV_8U ? 1.0 : 255.0);
+
+    if (useInitialFlow)
+    {
+        u1s[0].create(I0.size());
+        u2s[0].create(I0.size());
+
+        Mat_<float> mv[] = {u1s[0], u2s[0]};
+
+        split(_flow.getMat(), mv); // channel 0 -> u1, channel 1 -> u2
+    }
+
+    I1x_buf.create(I0.size());
+    I1y_buf.create(I0.size());
+
+    flowMap1_buf.create(I0.size());
+    flowMap2_buf.create(I0.size());
+
+    I1w_buf.create(I0.size());
+    I1wx_buf.create(I0.size());
+    I1wy_buf.create(I0.size());
+
+    grad_buf.create(I0.size());
+    rho_c_buf.create(I0.size());
+
+    v1_buf.create(I0.size());
+    v2_buf.create(I0.size());
+
+    p11_buf.create(I0.size());
+    p12_buf.create(I0.size());
+    p21_buf.create(I0.size());
+    p22_buf.create(I0.size());
+
+    div_p1_buf.create(I0.size());
+    div_p2_buf.create(I0.size());
+
+    u1x_buf.create(I0.size());
+    u1y_buf.create(I0.size());
+    u2x_buf.create(I0.size());
+    u2y_buf.create(I0.size());
+
+    // create the scales
+    for (int s = 1; s < levels; ++s)
+    {
+        pyrDown(I0s[s - 1], I0s[s]);
+        pyrDown(I1s[s - 1], I1s[s]);
+
+        if (I0s[s].cols < 16 || I0s[s].rows < 16)
+        {
+            levels = s; // level s is too small: use only levels [0, s) for this call
+            break;
+        }
+
+        if (useInitialFlow)
+        {
+            pyrDown(u1s[s - 1], u1s[s]);
+            pyrDown(u2s[s - 1], u2s[s]);
+
+            multiply(u1s[s], Scalar::all(0.5), u1s[s]); // halve the flow values at the coarser level
+            multiply(u2s[s], Scalar::all(0.5), u2s[s]);
+        }
+    }
+
+    // pyramidal structure for computing the optical flow
+    for (int s = levels - 1; s >= 0; --s)
+    {
+        // compute the optical flow at the current scale
+        procOneScale(I0s[s], I1s[s], u1s[s], u2s[s]);
+
+        // if this was the last scale, finish now
+        if (s == 0)
+            break;
+
+        // otherwise, upsample the optical flow
+
+        // zoom the optical flow for the next finer scale
+        resize(u1s[s], u1s[s - 1], I0s[s - 1].size());
+        resize(u2s[s], u2s[s - 1], I0s[s - 1].size());
+
+        // scale the optical flow with the appropriate zoom factor
+        multiply(u1s[s - 1], Scalar::all(2), u1s[s - 1]);
+        multiply(u2s[s - 1], Scalar::all(2), u2s[s - 1]);
+    }
+
+    Mat uxy[] = {u1s[0], u2s[0]};
+    merge(uxy, 2, _flow); // result: two-channel flow, u1 in channel 0 and u2 in channel 1
+}
+
+////////////////////////////////////////////////////////////
+// buildFlowMap
+
+struct BuildFlowMapBody : ParallelLoopBody
+{ // Row-range worker used by buildFlowMap(): reads u1/u2, writes map1/map2.
+    void operator() (const Range& range) const;
+
+    Mat_<float> u1;
+    Mat_<float> u2;
+    mutable Mat_<float> map1; // mutable so the const operator() can write the output
+    mutable Mat_<float> map2;
+};
+
+void BuildFlowMapBody::operator() (const Range& range) const
+{ // For every row in [range.start, range.end): map1 = x + u1 and map2 = y + u2.
+    for (int row = range.start; row < range.end; ++row)
+    {
+        const float* du = u1[row];
+        const float* dv = u2[row];
+
+        float* outX = map1[row];
+        float* outY = map2[row];
+        const int width = u1.cols;
+        for (int col = 0; col < width; ++col)
+        {
+            outX[col] = col + du[col];
+            outY[col] = row + dv[col];
+        }
+    }
+}
+
+void buildFlowMap(const Mat_<float>& u1, const Mat_<float>& u2, Mat_<float>& map1, Mat_<float>& map2)
+{ // Fills map1/map2 with pixel coordinates displaced by (u1, u2); outputs must already match u1's size.
+    CV_DbgAssert( u2.size() == u1.size() );
+    CV_DbgAssert( map1.size() == u1.size() );
+    CV_DbgAssert( map2.size() == u1.size() );
+
+    BuildFlowMapBody body;
+
+    body.u1 = u1;
+    body.u2 = u2;
+    body.map1 = map1;
+    body.map2 = map2;
+
+    parallel_for_(Range(0, u1.rows), body); // rows are distributed through parallel_for_
+}
+
+////////////////////////////////////////////////////////////
+// centeredGradient
+
+struct CenteredGradientBody : ParallelLoopBody
+{ // Row-range worker used by centeredGradient() for the interior of the image.
+    void operator() (const Range& range) const;
+
+    Mat_<float> src;
+    mutable Mat_<float> dx; // mutable so the const operator() can write the outputs
+    mutable Mat_<float> dy;
+};
+
+void CenteredGradientBody::operator() (const Range& range) const
+{ // Central differences on interior columns of rows [range.start, range.end); reads rows y-1 and y+1.
+    const int lastCol = src.cols - 1;
+
+    for (int row = range.start; row < range.end; ++row)
+    {
+        const float* above = src[row - 1];
+        const float* here = src[row];
+        const float* below = src[row + 1];
+
+        float* gx = dx[row];
+        float* gy = dy[row];
+
+        for (int col = 1; col < lastCol; ++col)
+        {
+            gx[col] = 0.5f * (here[col + 1] - here[col - 1]);
+            gy[col] = 0.5f * (below[col] - above[col]);
+        }
+    }
+}
+
+void centeredGradient(const Mat_<float>& src, Mat_<float>& dx, Mat_<float>& dy)
+{ // Writes 0.5-scaled differences of src into dx/dy: two-sided in the interior, one-sided across a border.
+    CV_DbgAssert( src.rows > 2 && src.cols > 2 );
+    CV_DbgAssert( dx.size() == src.size() );
+    CV_DbgAssert( dy.size() == src.size() );
+
+    const int last_row = src.rows - 1;
+    const int last_col = src.cols - 1;
+
+    // compute the gradient on the center body of the image
+    {
+        CenteredGradientBody body;
+
+        body.src = src;
+        body.dx = dx;
+        body.dy = dy;
+
+        parallel_for_(Range(1, last_row), body); // interior rows only; the borders are handled below
+    }
+
+    // compute the gradient on the first and last rows
+    for (int x = 1; x < last_col; ++x)
+    {
+        dx(0, x) = 0.5f * (src(0, x + 1) - src(0, x - 1));
+        dy(0, x) = 0.5f * (src(1, x) - src(0, x)); // one-sided: there is no row above
+
+        dx(last_row, x) = 0.5f * (src(last_row, x + 1) - src(last_row, x - 1));
+        dy(last_row, x) = 0.5f * (src(last_row, x) - src(last_row - 1, x)); // one-sided: there is no row below
+    }
+
+    // compute the gradient on the first and last columns
+    for (int y = 1; y < last_row; ++y)
+    {
+        dx(y, 0) = 0.5f * (src(y, 1) - src(y, 0)); // one-sided: there is no column to the left
+        dy(y, 0) = 0.5f * (src(y + 1, 0) - src(y - 1, 0));
+
+        dx(y, last_col) = 0.5f * (src(y, last_col) - src(y, last_col - 1)); // one-sided: there is no column to the right
+        dy(y, last_col) = 0.5f * (src(y + 1, last_col) - src(y - 1, last_col));
+    }
+
+    // compute the gradient at the four corners
+    dx(0, 0) = 0.5f * (src(0, 1) - src(0, 0));
+    dy(0, 0) = 0.5f * (src(1, 0) - src(0, 0));
+
+    dx(0, last_col) = 0.5f * (src(0, last_col) - src(0, last_col - 1));
+    dy(0, last_col) = 0.5f * (src(1, last_col) - src(0, last_col));
+
+    dx(last_row, 0) = 0.5f * (src(last_row, 1) - src(last_row, 0));
+    dy(last_row, 0) = 0.5f * (src(last_row, 0) - src(last_row - 1, 0));
+
+    dx(last_row, last_col) = 0.5f * (src(last_row, last_col) - src(last_row, last_col - 1));
+    dy(last_row, last_col) = 0.5f * (src(last_row, last_col) - src(last_row - 1, last_col));
+}
+
+////////////////////////////////////////////////////////////
+// forwardGradient
+
+struct ForwardGradientBody : ParallelLoopBody
+{ // Row-range worker used by forwardGradient() for all rows except the last.
+    void operator() (const Range& range) const;
+
+    Mat_<float> src;
+    mutable Mat_<float> dx; // mutable so the const operator() can write the outputs
+    mutable Mat_<float> dy;
+};
+
+void ForwardGradientBody::operator() (const Range& range) const
+{ // Forward differences for rows [range.start, range.end), all columns but the last; reads row y+1.
+    const int lastCol = src.cols - 1;
+
+    for (int row = range.start; row < range.end; ++row)
+    {
+        const float* here = src[row];
+        const float* below = src[row + 1];
+
+        float* gx = dx[row];
+        float* gy = dy[row];
+
+        for (int col = 0; col < lastCol; ++col)
+        {
+            gx[col] = here[col + 1] - here[col];
+            gy[col] = below[col] - here[col];
+        }
+    }
+}
+
+void forwardGradient(const Mat_<float>& src, Mat_<float>& dx, Mat_<float>& dy)
+{ // Forward differences of src into dx/dy; a component with no forward neighbour is set to zero.
+    CV_DbgAssert( src.rows > 2 && src.cols > 2 );
+    CV_DbgAssert( dx.size() == src.size() );
+    CV_DbgAssert( dy.size() == src.size() );
+
+    const int last_row = src.rows - 1;
+    const int last_col = src.cols - 1;
+
+    // compute the gradient on the central body of the image
+    {
+        ForwardGradientBody body;
+
+        body.src = src;
+        body.dx = dx;
+        body.dy = dy;
+
+        parallel_for_(Range(0, last_row), body); // every row except the last; the worker skips the last column
+    }
+
+    // compute the gradient on the last row
+    for (int x = 0; x < last_col; ++x)
+    {
+        dx(last_row, x) = src(last_row, x + 1) - src(last_row, x);
+        dy(last_row, x) = 0.0f; // no row below
+    }
+
+    // compute the gradient on the last column
+    for (int y = 0; y < last_row; ++y)
+    {
+        dx(y, last_col) = 0.0f; // no column to the right
+        dy(y, last_col) = src(y + 1, last_col) - src(y, last_col);
+    }
+
+    dx(last_row, last_col) = 0.0f; // bottom-right corner has no forward neighbour in either direction
+    dy(last_row, last_col) = 0.0f;
+}
+
+////////////////////////////////////////////////////////////
+// divergence
+
+struct DivergenceBody : ParallelLoopBody
+{ // Row-range worker used by divergence() for rows >= 1 and columns >= 1.
+    void operator() (const Range& range) const;
+
+    Mat_<float> v1;
+    Mat_<float> v2;
+    mutable Mat_<float> div; // mutable so the const operator() can write the output
+};
+
+void DivergenceBody::operator() (const Range& range) const
+{ // Backward differences: div = (v1[x] - v1[x-1]) + (v2[y] - v2[y-1]) for columns >= 1; reads row y-1.
+    for (int row = range.start; row < range.end; ++row)
+    {
+        const float* horiz = v1[row];
+        const float* vertAbove = v2[row - 1];
+        const float* vertHere = v2[row];
+
+        float* out = div[row];
+
+        for(int col = 1; col < v1.cols; ++col)
+        {
+            const float dHoriz = horiz[col] - horiz[col - 1];
+            const float dVert = vertHere[col] - vertAbove[col];
+
+            out[col] = dHoriz + dVert;
+        }
+    }
+}
+
+void divergence(const Mat_<float>& v1, const Mat_<float>& v2, Mat_<float>& div)
+{ // Backward-difference divergence of (v1, v2) into div; a missing previous neighbour contributes nothing.
+    CV_DbgAssert( v1.rows > 2 && v1.cols > 2 );
+    CV_DbgAssert( v2.size() == v1.size() );
+    CV_DbgAssert( div.size() == v1.size() );
+
+    {
+        DivergenceBody body;
+
+        body.v1 = v1;
+        body.v2 = v2;
+        body.div = div;
+
+        parallel_for_(Range(1, v1.rows), body); // rows >= 1; row 0 and column 0 are handled below
+    }
+
+    // compute the divergence on the first row
+    for(int x = 1; x < v1.cols; ++x)
+        div(0, x) = v1(0, x) - v1(0, x - 1) + v2(0, x);
+
+    // compute the divergence on the first column
+    for (int y = 1; y < v1.rows; ++y)
+        div(y, 0) = v1(y, 0) + v2(y, 0) - v2(y - 1, 0);
+
+    div(0, 0) = v1(0, 0) + v2(0, 0); // top-left corner: no previous neighbour in either direction
+}
+
+////////////////////////////////////////////////////////////
+// calcGradRho
+
+struct CalcGradRhoBody : ParallelLoopBody
+{ // Row-range worker used by calcGradRho(): reads I0, I1w, I1wx, I1wy, u1, u2; writes grad and rho_c.
+    void operator() (const Range& range) const;
+
+    Mat_<float> I0;
+    Mat_<float> I1w;
+    Mat_<float> I1wx;
+    Mat_<float> I1wy;
+    Mat_<float> u1;
+    Mat_<float> u2;
+    mutable Mat_<float> grad; // mutable so the const operator() can write the outputs
+    mutable Mat_<float> rho_c;
+};
+
+void CalcGradRhoBody::operator() (const Range& range) const
+{ // Per pixel of rows [range.start, range.end): grad = I1wx^2 + I1wy^2 and the constant part of rho.
+    for (int row = range.start; row < range.end; ++row)
+    {
+        const float* ref = I0[row];
+        const float* warped = I1w[row];
+        const float* warpedDx = I1wx[row];
+        const float* warpedDy = I1wy[row];
+        const float* flowU = u1[row];
+        const float* flowV = u2[row];
+
+        float* gradOut = grad[row];
+        float* rhoOut = rho_c[row];
+
+        for (int col = 0; col < I0.cols; ++col)
+        {
+            const float dxSq = warpedDx[col] * warpedDx[col];
+            const float dySq = warpedDy[col] * warpedDy[col];
+
+            // squared gradient magnitude of the warped image
+            gradOut[col] = dxSq + dySq;
+
+            // part of rho that stays fixed while u is iterated
+            rhoOut[col] = (warped[col] - warpedDx[col] * flowU[col] - warpedDy[col] * flowV[col] - ref[col]);
+        }
+    }
+}
+
+void calcGradRho(const Mat_<float>& I0, const Mat_<float>& I1w, const Mat_<float>& I1wx, const Mat_<float>& I1wy, const Mat_<float>& u1, const Mat_<float>& u2,
+    Mat_<float>& grad, Mat_<float>& rho_c)
+{ // Fills grad (|Grad(I1)|^2) and rho_c (constant part of rho); every matrix must match I0's size.
+    CV_DbgAssert( I1w.size() == I0.size() );
+    CV_DbgAssert( I1wx.size() == I0.size() );
+    CV_DbgAssert( I1wy.size() == I0.size() );
+    CV_DbgAssert( u1.size() == I0.size() );
+    CV_DbgAssert( u2.size() == I0.size() );
+    CV_DbgAssert( grad.size() == I0.size() );
+    CV_DbgAssert( rho_c.size() == I0.size() );
+
+    CalcGradRhoBody body;
+
+    body.I0 = I0;
+    body.I1w = I1w;
+    body.I1wx = I1wx;
+    body.I1wy = I1wy;
+    body.u1 = u1;
+    body.u2 = u2;
+    body.grad = grad;
+    body.rho_c = rho_c;
+
+    parallel_for_(Range(0, I0.rows), body); // rows are distributed through parallel_for_
+}
+
+////////////////////////////////////////////////////////////
+// estimateV
+
+struct EstimateVBody : ParallelLoopBody
+{ // Row-range worker used by estimateV(): writes v1/v2 from u1/u2, the image gradients, grad and rho_c.
+    void operator() (const Range& range) const;
+
+    Mat_<float> I1wx;
+    Mat_<float> I1wy;
+    Mat_<float> u1;
+    Mat_<float> u2;
+    Mat_<float> grad;
+    Mat_<float> rho_c;
+    mutable Mat_<float> v1; // mutable so the const operator() can write the outputs
+    mutable Mat_<float> v2;
+    float l_t; // threshold factor compared against rho as +/- l_t * grad
+};
+
+void EstimateVBody::operator() (const Range& range) const
+{ // Per pixel: v = u + d, where d is chosen by comparing rho against +/- l_t * grad.
+    for (int row = range.start; row < range.end; ++row)
+    {
+        const float* gxRow = I1wx[row];
+        const float* gyRow = I1wy[row];
+        const float* uRow = u1[row];
+        const float* vRow = u2[row];
+        const float* gradSqRow = grad[row];
+        const float* rhoConstRow = rho_c[row];
+
+        float* out1 = v1[row];
+        float* out2 = v2[row];
+
+        for (int col = 0; col < I1wx.cols; ++col)
+        {
+            const float rho = rhoConstRow[col] + (gxRow[col] * uRow[col] + gyRow[col] * vRow[col]);
+            const float gradSq = gradSqRow[col];
+            float step1 = 0.0f;
+            float step2 = 0.0f;
+
+            if (rho < -l_t * gradSq)
+            {
+                step1 = l_t * gxRow[col];
+                step2 = l_t * gyRow[col];
+            }
+            else if (rho > l_t * gradSq)
+            {
+                step1 = -l_t * gxRow[col];
+                step2 = -l_t * gyRow[col];
+            }
+            else if (gradSq > numeric_limits<float>::epsilon())
+            {
+                float scale = -rho / gradSq;
+                step1 = scale * gxRow[col];
+                step2 = scale * gyRow[col];
+            }
+
+            out1[col] = uRow[col] + step1;
+            out2[col] = vRow[col] + step2;
+        }
+    }
+}
+
+void estimateV(const Mat_<float>& I1wx, const Mat_<float>& I1wy, const Mat_<float>& u1, const Mat_<float>& u2, const Mat_<float>& grad, const Mat_<float>& rho_c,
+               Mat_<float>& v1, Mat_<float>& v2, float l_t)
+{ // Computes v1/v2 from the current u1/u2 by per-pixel thresholding; every matrix must match I1wx's size.
+    CV_DbgAssert( I1wy.size() == I1wx.size() );
+    CV_DbgAssert( u1.size() == I1wx.size() );
+    CV_DbgAssert( u2.size() == I1wx.size() );
+    CV_DbgAssert( grad.size() == I1wx.size() );
+    CV_DbgAssert( rho_c.size() == I1wx.size() );
+    CV_DbgAssert( v1.size() == I1wx.size() );
+    CV_DbgAssert( v2.size() == I1wx.size() );
+
+    EstimateVBody body;
+
+    body.I1wx = I1wx;
+    body.I1wy = I1wy;
+    body.u1 = u1;
+    body.u2 = u2;
+    body.grad = grad;
+    body.rho_c = rho_c;
+    body.v1 = v1;
+    body.v2 = v2;
+    body.l_t = l_t;
+
+    parallel_for_(Range(0, I1wx.rows), body); // rows are distributed through parallel_for_
+}
+
+////////////////////////////////////////////////////////////
+// estimateU
+
+float estimateU(const Mat_<float>& v1, const Mat_<float>& v2, const Mat_<float>& div_p1, const Mat_<float>& div_p2, Mat_<float>& u1, Mat_<float>& u2, float theta)
+{ // Updates u = v + theta * div_p in place and returns the summed squared change of (u1, u2).
+    CV_DbgAssert( v2.size() == v1.size() );
+    CV_DbgAssert( div_p1.size() == v1.size() );
+    CV_DbgAssert( div_p2.size() == v1.size() );
+    CV_DbgAssert( u1.size() == v1.size() );
+    CV_DbgAssert( u2.size() == v1.size() );
+
+    float totalChange = 0.0f;
+    for (int row = 0; row < v1.rows; ++row)
+    {
+        const float* vx = v1[row];
+        const float* vy = v2[row];
+        const float* divX = div_p1[row];
+        const float* divY = div_p2[row];
+
+        float* ux = u1[row];
+        float* uy = u2[row];
+
+        for (int col = 0; col < v1.cols; ++col)
+        {
+            const float prevX = ux[col];
+            const float prevY = uy[col];
+
+            ux[col] = vx[col] + theta * divX[col];
+            uy[col] = vy[col] + theta * divY[col];
+
+            totalChange += (ux[col] - prevX) * (ux[col] - prevX) + (uy[col] - prevY) * (uy[col] - prevY);
+        }
+    }
+
+    return totalChange;
+}
+
+////////////////////////////////////////////////////////////
+// estimateDualVariables
+
+struct EstimateDualVariablesBody : ParallelLoopBody
+{ // Row-range worker used by estimateDualVariables(): updates p11..p22 in place from the u gradients.
+    void operator() (const Range& range) const;
+
+    Mat_<float> u1x;
+    Mat_<float> u1y;
+    Mat_<float> u2x;
+    Mat_<float> u2y;
+    mutable Mat_<float> p11; // mutable so the const operator() can update the dual variables
+    mutable Mat_<float> p12;
+    mutable Mat_<float> p21;
+    mutable Mat_<float> p22;
+    float taut; // step factor applied to the gradients in the update
+};
+
+// For every pixel of the assigned rows: p = (p + taut * grad(u)) / (1 + taut * |grad(u)|),
+// applied to (p11, p12) with the gradient of u1 and to (p21, p22) with the gradient of u2.
+void EstimateDualVariablesBody::operator() (const Range& range) const
+{
+    const int width = u1x.cols;
+
+    for (int row = range.start; row < range.end; ++row)
+    {
+        const float* du1dx = u1x[row];
+        const float* du1dy = u1y[row];
+        const float* du2dx = u2x[row];
+        const float* du2dy = u2y[row];
+        float* p11Ptr = p11[row];
+        float* p12Ptr = p12[row];
+        float* p21Ptr = p21[row];
+        float* p22Ptr = p22[row];
+
+        for (int col = 0; col < width; ++col)
+        {
+            // denominators: 1 + taut * (gradient magnitude of u1, resp. u2, at this pixel)
+            const float norm1 = 1.0f + taut * static_cast<float>(hypot(du1dx[col], du1dy[col]));
+            const float norm2 = 1.0f + taut * static_cast<float>(hypot(du2dx[col], du2dy[col]));
+            p11Ptr[col] = (p11Ptr[col] + taut * du1dx[col]) / norm1;
+            p12Ptr[col] = (p12Ptr[col] + taut * du1dy[col]) / norm1;
+            p21Ptr[col] = (p21Ptr[col] + taut * du2dx[col]) / norm2;
+            p22Ptr[col] = (p22Ptr[col] + taut * du2dy[col]) / norm2;
+        }
+    }
+}
+
+// Updates the dual variables (p11, p12) and (p21, p22) in place from the gradients of u1 and u2.
+void estimateDualVariables(const Mat_<float>& u1x, const Mat_<float>& u1y, const Mat_<float>& u2x, const Mat_<float>& u2y,
+                           Mat_<float>& p11, Mat_<float>& p12, Mat_<float>& p21, Mat_<float>& p22, float taut)
+{
+    CV_DbgAssert( u1y.size() == u1x.size() );
+    CV_DbgAssert( u2x.size() == u1x.size() );
+    CV_DbgAssert( u2y.size() == u1x.size() );
+    CV_DbgAssert( p11.size() == u1x.size() );
+    CV_DbgAssert( p12.size() == u1x.size() );
+    CV_DbgAssert( p21.size() == u1x.size() );
+    CV_DbgAssert( p22.size() == u1x.size() );
+
+    EstimateDualVariablesBody worker;
+    worker.taut = taut;
+    worker.u1x = u1x;
+    worker.u1y = u1y;
+    worker.u2x = u2x;
+    worker.u2y = u2y;
+    worker.p11 = p11;
+    worker.p12 = p12;
+    worker.p21 = p21;
+    worker.p22 = p22;
+    // the image rows are split between the invocations of the worker
+    parallel_for_(Range(0, u1x.rows), worker);
+}
+
+// Refines the flow (u1, u2) from I0 to I1 in place for one scale; an empty u1 makes both components start from zero.
+void OpticalFlowDual_TVL1::procOneScale(const Mat_<float>& I0, const Mat_<float>& I1, Mat_<float>& u1, Mat_<float>& u2)
+{
+    CV_DbgAssert( I1.size() == I0.size() );
+    CV_DbgAssert( I1.type() == I0.type() );
+    CV_DbgAssert( u1.empty() || u1.size() == I0.size() );
+    CV_DbgAssert( u2.size() == u1.size() );
+
+    const Size frameSize = I0.size();
+    const Rect roi(0, 0, I0.cols, I0.rows); // every work buffer is used through a view of this size
+
+    const float scaledEpsilon = static_cast<float>(epsilon * epsilon * frameSize.area());
+    const float l_t = static_cast<float>(lambda * theta);
+    const float taut = static_cast<float>(tau / theta);
+
+    if (u1.empty())
+    {
+        u1.create(frameSize);
+        u1.setTo(Scalar::all(0));
+        u2.create(frameSize);
+        u2.setTo(Scalar::all(0));
+    }
+
+    // spatial derivatives of the target image
+    Mat_<float> I1x = I1x_buf(roi);
+    Mat_<float> I1y = I1y_buf(roi);
+    centeredGradient(I1, I1x, I1y);
+
+    // remap coordinates, the warped image and its warped derivatives
+    Mat_<float> flowMap1 = flowMap1_buf(roi);
+    Mat_<float> flowMap2 = flowMap2_buf(roi);
+    Mat_<float> I1w = I1w_buf(roi);
+    Mat_<float> I1wx = I1wx_buf(roi);
+    Mat_<float> I1wy = I1wy_buf(roi);
+    Mat_<float> grad = grad_buf(roi);
+    Mat_<float> rho_c = rho_c_buf(roi);
+    Mat_<float> v1 = v1_buf(roi);
+    Mat_<float> v2 = v2_buf(roi);
+
+    // the dual variables are reset to zero on every call
+    Mat_<float> p11 = p11_buf(roi);
+    Mat_<float> p12 = p12_buf(roi);
+    Mat_<float> p21 = p21_buf(roi);
+    Mat_<float> p22 = p22_buf(roi);
+    p11.setTo(Scalar::all(0));
+    p12.setTo(Scalar::all(0));
+    p21.setTo(Scalar::all(0));
+    p22.setTo(Scalar::all(0));
+    Mat_<float> div_p1 = div_p1_buf(roi);
+    Mat_<float> div_p2 = div_p2_buf(roi);
+    Mat_<float> u1x = u1x_buf(roi);
+    Mat_<float> u1y = u1y_buf(roi);
+    Mat_<float> u2x = u2x_buf(roi);
+    Mat_<float> u2y = u2y_buf(roi);
+
+    for (int warpIdx = 0; warpIdx < warps; ++warpIdx)
+    {
+        // compute the warping of the target image and its derivatives
+        buildFlowMap(u1, u2, flowMap1, flowMap2);
+        remap(I1, I1w, flowMap1, flowMap2, INTER_CUBIC);
+        remap(I1x, I1wx, flowMap1, flowMap2, INTER_CUBIC);
+        remap(I1y, I1wy, flowMap1, flowMap2, INTER_CUBIC);
+
+        calcGradRho(I0, I1w, I1wx, I1wy, u1, u2, grad, rho_c);
+
+        float error = numeric_limits<float>::max();
+        for (int iter = 0; error > scaledEpsilon && iter < iterations; ++iter)
+        {
+            // estimate the values of the variable (v1, v2) (thresholding operator TH)
+            estimateV(I1wx, I1wy, u1, u2, grad, rho_c, v1, v2, l_t);
+
+            // compute the divergence of the dual variable (p1, p2)
+            divergence(p11, p12, div_p1);
+            divergence(p21, p22, div_p2);
+
+            // estimate the values of the optical flow (u1, u2)
+            error = estimateU(v1, v2, div_p1, div_p2, u1, u2, static_cast<float>(theta));
+
+            // compute the gradient of the optical flow (Du1, Du2)
+            forwardGradient(u1, u1x, u1y);
+            forwardGradient(u2, u2x, u2y);
+
+            // estimate the values of the dual variable (p1, p2)
+            estimateDualVariables(u1x, u1y, u2x, u2y, p11, p12, p21, p22, taut);
+        }
+    }
+}
+
+void OpticalFlowDual_TVL1::collectGarbage() // empties the I0s/I1s/u1s/u2s containers and releases every *_buf matrix
+{
+    I0s.clear();
+    I1s.clear();
+    u1s.clear();
+    u2s.clear();
+    // derivative buffers filled by centeredGradient in procOneScale
+    I1x_buf.release();
+    I1y_buf.release();
+    // coordinate maps handed to remap
+    flowMap1_buf.release();
+    flowMap2_buf.release();
+    // warped image and warped derivatives (remap outputs)
+    I1w_buf.release();
+    I1wx_buf.release();
+    I1wy_buf.release();
+    // outputs of calcGradRho
+    grad_buf.release();
+    rho_c_buf.release();
+    // outputs of estimateV
+    v1_buf.release();
+    v2_buf.release();
+    // dual variables
+    p11_buf.release();
+    p12_buf.release();
+    p21_buf.release();
+    p22_buf.release();
+    // divergences of the dual variables
+    div_p1_buf.release();
+    div_p2_buf.release();
+    // forward gradients of the flow components
+    u1x_buf.release();
+    u1y_buf.release();
+    u2x_buf.release();
+    u2y_buf.release();
+}
+
+CV_INIT_ALGORITHM(OpticalFlowDual_TVL1, "DenseOpticalFlow.DualTVL1",
+                  obj.info()->addParam(obj, "tau", obj.tau, false, 0, 0,
+                                       "Time step of the numerical scheme");
+                  obj.info()->addParam(obj, "lambda", obj.lambda, false, 0, 0,
+                                       "Weight parameter for the data term, attachment parameter");
+                  obj.info()->addParam(obj, "theta", obj.theta, false, 0, 0,
+                                       "Weight parameter for (u - v)^2, tightness parameter");
+                  obj.info()->addParam(obj, "nscales", obj.nscales, false, 0, 0,
+                                       "Number of scales used to create the pyramid of images");
+                  obj.info()->addParam(obj, "warps", obj.warps, false, 0, 0,
+                                       "Number of warpings per scale");
+                  obj.info()->addParam(obj, "epsilon", obj.epsilon, false, 0, 0,
+                                       "Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time");
+                  obj.info()->addParam(obj, "iterations", obj.iterations, false, 0, 0,
+                                       "Stopping criterion iterations number used in the numerical scheme");
+                  obj.info()->addParam(obj, "useInitialFlow", obj.useInitialFlow)); // NOTE(review): only parameter added without a help string - confirm this is intended
+
+} // namespace
+
+Ptr<DenseOpticalFlow> cv::createOptFlow_DualTVL1() // factory for the TV-L1 implementation defined above
+{
+    return Ptr<DenseOpticalFlow>(new OpticalFlowDual_TVL1);
+}
diff --git a/modules/video/test/test_tvl1optflow.cpp b/modules/video/test/test_tvl1optflow.cpp
new file mode 100644
index 0000000000..804eae8b62
--- /dev/null
+++ b/modules/video/test/test_tvl1optflow.cpp
@@ -0,0 +1,171 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#include <fstream>
+
+using namespace std;
+using namespace cv;
+using namespace cvtest;
+
+//#define DUMP
+
+namespace
+{
+    // first four bytes, should be the same in little endian
+    const float FLO_TAG_FLOAT = 202021.25f;  // check for this when READING the file
+    const char FLO_TAG_STRING[] = "PIEH";    // use this when WRITING the file
+
+    // binary file format for flow data specified here:
+    // http://vision.middlebury.edu/flow/data/
+    // Dumps `flow` as: tag string, width and height as int, then (x, y) float pairs row by row.
+    void writeOpticalFlowToFile(const Mat_<Point2f>& flow, const string& fileName)
+    {
+        ofstream out(fileName.c_str(), ios_base::binary);
+        out << FLO_TAG_STRING;
+        const int width = flow.cols;
+        const int height = flow.rows;
+        out.write(reinterpret_cast<const char*>(&width), sizeof(int));
+        out.write(reinterpret_cast<const char*>(&height), sizeof(int));
+
+        for (int y = 0; y < height; ++y)
+        {
+            const Point2f* row = flow[y];
+            for (int x = 0; x < width; ++x)
+            {
+                out.write(reinterpret_cast<const char*>(&row[x].x), sizeof(float));
+                out.write(reinterpret_cast<const char*>(&row[x].y), sizeof(float));
+            }
+        }
+    }
+
+    // binary file format for flow data specified here:
+    // http://vision.middlebury.edu/flow/data/
+    // Loads a .flo file into `flow`, which is resized to the stored width x height.
+    // Fails a CV_Assert when the file cannot be opened or does not start with FLO_TAG_FLOAT.
+    void readOpticalFlowFromFile(Mat_<Point2f>& flow, const string& fileName)
+    {
+        ifstream file(fileName.c_str(), ios_base::binary);
+        CV_Assert( file.is_open() );
+
+        // zero-initialised so that a failed read is never compared as an indeterminate value
+        float tag = 0.0f;
+        file.read((char*) &tag, sizeof(float));
+        CV_Assert( tag == FLO_TAG_FLOAT );
+
+        Size size;
+        file.read((char*) &size.width, sizeof(int));
+        file.read((char*) &size.height, sizeof(int));
+
+        flow.create(size);
+        for (int i = 0; i < flow.rows; ++i)
+        {
+            for (int j = 0; j < flow.cols; ++j)
+            {
+                Point2f u;
+                file.read((char*) &u.x, sizeof(float));
+                file.read((char*) &u.y, sizeof(float));
+                flow(i, j) = u;
+            }
+        }
+    }
+
+    bool isFlowCorrect(Point2f u) // true when neither component is NaN and both are below 1e9 in magnitude
+    {
+        return !(cvIsNaN(u.x) || cvIsNaN(u.y) || fabs(u.x) >= 1e9 || fabs(u.y) >= 1e9);
+    }
+
+    // Root-mean-square distance between the two fields over the pixels where both vectors pass
+    // isFlowCorrect; the 1e-9 term keeps the division defined when no pixel qualifies.
+    double calcRMSE(const Mat_<Point2f>& flow1, const Mat_<Point2f>& flow2)
+    {
+        double sqErrSum = 0.0;
+        int validCount = 0;
+
+        for (int y = 0; y < flow1.rows; ++y)
+        {
+            for (int x = 0; x < flow1.cols; ++x)
+            {
+                const Point2f a = flow1(y, x);
+                const Point2f b = flow2(y, x);
+                if (!isFlowCorrect(a) || !isFlowCorrect(b))
+                    continue;
+
+                const Point2f delta = a - b;
+                sqErrSum += delta.ddot(delta);
+                ++validCount;
+            }
+        }
+        return sqrt(sqErrSum / (1e-9 + validCount));
+    }
+}
+
+// Regression check: the flow computed on the RubberWhale pair must stay within MAX_RMSE of the stored flow.
+TEST(Video_calcOpticalFlowDual_TVL1, Regression)
+{
+    const double MAX_RMSE = 0.02;
+
+    const string data_path = TS::ptr()->get_data_path();
+    const string gold_flow_path = data_path + "optflow/tvl1_flow.flo";
+
+    Mat frame1 = imread(data_path + "optflow/RubberWhale1.png", IMREAD_GRAYSCALE);
+    Mat frame2 = imread(data_path + "optflow/RubberWhale2.png", IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+    ASSERT_FALSE(frame2.empty());
+
+    Ptr<DenseOpticalFlow> tvl1 = createOptFlow_DualTVL1();
+    Mat_<Point2f> flow;
+    tvl1->calc(frame1, frame2, flow);
+
+#ifdef DUMP
+    // with DUMP defined the reference file is rewritten instead of being checked
+    writeOpticalFlowToFile(flow, gold_flow_path);
+#else
+    Mat_<Point2f> gold;
+    readOpticalFlowFromFile(gold, gold_flow_path);
+
+    ASSERT_EQ(gold.rows, flow.rows);
+    ASSERT_EQ(gold.cols, flow.cols);
+
+    const double err = calcRMSE(gold, flow);
+    EXPECT_LE(err, MAX_RMSE);
+#endif
+}
diff --git a/samples/cpp/tvl1_optical_flow.cpp b/samples/cpp/tvl1_optical_flow.cpp
new file mode 100644
index 0000000000..4f9a02b5a9
--- /dev/null
+++ b/samples/cpp/tvl1_optical_flow.cpp
@@ -0,0 +1,193 @@
+#include <iostream>
+#include <fstream>
+
+#include "opencv2/video/tracking.hpp"
+#include "opencv2/highgui/highgui.hpp"
+
+using namespace cv;
+using namespace std;
+
+inline bool isFlowCorrect(Point2f u) // true when neither component is NaN and both are below 1e9 in magnitude
+{
+    return !(cvIsNaN(u.x) || cvIsNaN(u.y) || fabs(u.x) >= 1e9 || fabs(u.y) >= 1e9);
+}
+
+// Maps a flow vector (fx, fy) to a BGR colour: the direction picks a hue from a colour wheel built on
+// the first call; the magnitude fades the colour from white (<= 1) or darkens it (> 1).
+static Vec3b computeColor(float fx, float fy)
+{
+    // relative lengths of color transitions:
+    // these are chosen based on perceptual similarity
+    // (e.g. one can distinguish more shades between red and yellow
+    //  than between yellow and green)
+    const int RY = 15;
+    const int YG = 6;
+    const int GC = 4;
+    const int CB = 11;
+    const int BM = 13;
+    const int MR = 6;
+    const int NCOLS = RY + YG + GC + CB + BM + MR;
+
+    static bool wheelReady = false;
+    static Vec3i colorWheel[NCOLS];
+
+    if (!wheelReady)
+    {
+        int idx = 0;
+
+        for (int i = 0; i < RY; ++i)
+            colorWheel[idx++] = Vec3i(255, 255 * i / RY, 0);
+
+        for (int i = 0; i < YG; ++i)
+            colorWheel[idx++] = Vec3i(255 - 255 * i / YG, 255, 0);
+
+        for (int i = 0; i < GC; ++i)
+            colorWheel[idx++] = Vec3i(0, 255, 255 * i / GC);
+
+        for (int i = 0; i < CB; ++i)
+            colorWheel[idx++] = Vec3i(0, 255 - 255 * i / CB, 255);
+
+        for (int i = 0; i < BM; ++i)
+            colorWheel[idx++] = Vec3i(255 * i / BM, 0, 255);
+
+        for (int i = 0; i < MR; ++i)
+            colorWheel[idx++] = Vec3i(255, 0, 255 - 255 * i / MR);
+        wheelReady = true;
+    }
+
+    const float radius = sqrt(fx * fx + fy * fy);
+    const float angle = atan2(-fy, -fx) / (float)CV_PI;
+
+    // fractional position on the wheel and the two neighbouring entries that get blended
+    const float wheelPos = (angle + 1.0f) / 2.0f * (NCOLS - 1);
+    const int lower = static_cast<int>(wheelPos);
+    const int upper = (lower + 1) % NCOLS;
+    const float frac = wheelPos - lower;
+
+    Vec3b pix;
+    for (int c = 0; c < 3; ++c)
+    {
+        const float lowerCol = colorWheel[lower][c] / 255.f;
+        const float upperCol = colorWheel[upper][c] / 255.f;
+        float col = (1 - frac) * lowerCol + frac * upperCol;
+
+        if (radius <= 1)
+            col = 1 - radius * (1 - col); // increase saturation with radius
+        else
+            col *= .75; // out of range
+
+        pix[2 - c] = static_cast<uchar>(255.f * col);
+    }
+
+    return pix;
+}
+
+// Renders `flow` into `dst` (CV_8UC3) with computeColor. Vectors are divided by `maxmotion`, or, when
+// maxmotion <= 0, by the largest valid magnitude found (at least 1). Invalid vectors stay black.
+static void drawOpticalFlow(const Mat_<Point2f>& flow, Mat& dst, float maxmotion = -1)
+{
+    dst.create(flow.size(), CV_8UC3);
+    dst.setTo(Scalar::all(0));
+
+    // determine motion range:
+    float maxrad = maxmotion;
+    if (maxmotion <= 0)
+    {
+        maxrad = 1;
+        for (int r = 0; r < flow.rows; ++r)
+        {
+            const Point2f* flowRow = flow[r];
+            for (int c = 0; c < flow.cols; ++c)
+            {
+                const Point2f& u = flowRow[c];
+                if (isFlowCorrect(u))
+                    maxrad = max(maxrad, sqrt(u.x * u.x + u.y * u.y));
+            }
+        }
+    }
+
+    for (int r = 0; r < flow.rows; ++r)
+    {
+        const Point2f* flowRow = flow[r];
+        Vec3b* dstRow = dst.ptr<Vec3b>(r);
+        for (int c = 0; c < flow.cols; ++c)
+        {
+            const Point2f& u = flowRow[c];
+            if (isFlowCorrect(u))
+                dstRow[c] = computeColor(u.x / maxrad, u.y / maxrad);
+        }
+    }
+}
+
+// binary file format for flow data specified here:
+// http://vision.middlebury.edu/flow/data/
+// Writes `flow` to `fileName`: the tag string, width and height as int, then x, y floats row by row.
+static void writeOpticalFlowToFile(const Mat_<Point2f>& flow, const string& fileName)
+{
+    static const char FLO_TAG_STRING[] = "PIEH";
+    ofstream out(fileName.c_str(), ios_base::binary);
+    out << FLO_TAG_STRING;
+
+    const int width = flow.cols;
+    const int height = flow.rows;
+    out.write(reinterpret_cast<const char*>(&width), sizeof(int));
+    out.write(reinterpret_cast<const char*>(&height), sizeof(int));
+
+    for (int y = 0; y < height; ++y)
+    {
+        const Point2f* row = flow[y];
+        for (int x = 0; x < width; ++x)
+        {
+            out.write(reinterpret_cast<const char*>(&row[x].x), sizeof(float));
+            out.write(reinterpret_cast<const char*>(&row[x].y), sizeof(float));
+        }
+    }
+}
+
+// Computes the Dual TV-L1 flow between two grayscale frames, prints the run time, shows the colour-coded flow and optionally saves it.
+int main(int argc, const char* argv[])
+{
+    if (argc < 3)
+    {
+        cerr << "Usage : " << argv[0] << " <frame0> <frame1> [<output_flow>]" << endl;
+        return -1;
+    }
+
+    Mat frame0 = imread(argv[1], IMREAD_GRAYSCALE);
+    Mat frame1 = imread(argv[2], IMREAD_GRAYSCALE);
+
+    if (frame0.empty())
+    {
+        cerr << "Can't open image ["  << argv[1] << "]" << endl;
+        return -1;
+    }
+    if (frame1.empty())
+    {
+        cerr << "Can't open image ["  << argv[2] << "]" << endl;
+        return -1;
+    }
+    if (frame1.size() != frame0.size())
+    {
+        cerr << "Images should be of equal sizes" << endl;
+        return -1;
+    }
+
+    Mat_<Point2f> flow;
+    Ptr<DenseOpticalFlow> tvl1 = createOptFlow_DualTVL1();
+    // only the calc() call is timed
+    const double start = (double)getTickCount();
+    tvl1->calc(frame0, frame1, flow);
+    const double timeSec = (getTickCount() - start) / getTickFrequency();
+    cout << "calcOpticalFlowDual_TVL1 : " << timeSec << " sec" << endl;
+
+    Mat out;
+    drawOpticalFlow(flow, out);
+    // the optional third argument names the file the flow is written to
+    if (argc == 4)
+        writeOpticalFlowToFile(flow, argv[3]);
+
+    imshow("Flow", out);
+    waitKey();
+
+    return 0;
+}
diff --git a/samples/gpu/houghlines.cpp b/samples/gpu/houghlines.cpp
new file mode 100644
index 0000000000..e98dcc6e7c
--- /dev/null
+++ b/samples/gpu/houghlines.cpp
@@ -0,0 +1,89 @@
+#include <cmath>
+#include <iostream>
+
+#include "opencv2/core/core.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/imgproc/imgproc.hpp"
+#include "opencv2/gpu/gpu.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+static void help() // prints a short description of the sample and its command line
+{
+    cout << "This program demonstrates line finding with the Hough transform." << endl
+         << "Usage:" << endl
+         << "./gpu-example-houghlines <image_name>, Default is pic1.png\n" << endl;
+}
+
+// Detects line segments on the Canny edge map of an image with the CPU and the GPU HoughLinesP,
+// prints the time and segment count of both and shows the segments drawn over the edge map.
+int main(int argc, const char* argv[])
+{
+    string filename = "pic1.png";
+    if (argc >= 2)
+        filename = argv[1];
+
+    Mat src = imread(filename, IMREAD_GRAYSCALE);
+    if (src.empty())
+    {
+        help();
+        cout << "can not open " << filename << endl;
+        return -1;
+    }
+
+    Mat mask;
+    Canny(src, mask, 100, 200, 3);
+
+    // one BGR canvas per implementation, both starting from the edge map
+    Mat dst_cpu;
+    cvtColor(mask, dst_cpu, CV_GRAY2BGR);
+    Mat dst_gpu = dst_cpu.clone();
+
+    vector<Vec4i> lines_cpu;
+    {
+        const int64 t0 = getTickCount();
+        HoughLinesP(mask, lines_cpu, 1, CV_PI / 180, 50, 60, 5);
+        const double elapsedSec = (getTickCount() - t0) / getTickFrequency();
+        cout << "CPU Time : " << elapsedSec * 1000 << " ms" << endl;
+        cout << "CPU Found : " << lines_cpu.size() << endl;
+    }
+    for (size_t i = 0; i < lines_cpu.size(); ++i)
+    {
+        const Vec4i& seg = lines_cpu[i];
+        line(dst_cpu, Point(seg[0], seg[1]), Point(seg[2], seg[3]), Scalar(0, 0, 255), 3, CV_AA);
+    }
+
+    GpuMat d_src(mask);
+    GpuMat d_lines;
+    HoughLinesBuf d_buf;
+    {
+        const int64 t0 = getTickCount();
+        gpu::HoughLinesP(d_src, d_lines, d_buf, 1.0f, (float) (CV_PI / 180.0f), 50, 5);
+        const double elapsedSec = (getTickCount() - t0) / getTickFrequency();
+        cout << "GPU Time : " << elapsedSec * 1000 << " ms" << endl;
+        cout << "GPU Found : " << d_lines.cols << endl;
+    }
+
+    // h_lines is a Mat header over the vector's storage, used as the download target
+    vector<Vec4i> lines_gpu;
+    if (!d_lines.empty())
+    {
+        lines_gpu.resize(d_lines.cols);
+        Mat h_lines(1, d_lines.cols, CV_32SC4, &lines_gpu[0]);
+        d_lines.download(h_lines);
+    }
+    for (size_t i = 0; i < lines_gpu.size(); ++i)
+    {
+        const Vec4i& seg = lines_gpu[i];
+        line(dst_gpu, Point(seg[0], seg[1]), Point(seg[2], seg[3]), Scalar(0, 0, 255), 3, CV_AA);
+    }
+
+    imshow("source", src);
+    imshow("detected lines [CPU]", dst_cpu);
+    imshow("detected lines [GPU]", dst_gpu);
+    waitKey();
+    return 0;
+}
+
diff --git a/samples/gpu/optical_flow.cpp b/samples/gpu/optical_flow.cpp
new file mode 100644
index 0000000000..3f74d1b676
--- /dev/null
+++ b/samples/gpu/optical_flow.cpp
@@ -0,0 +1,253 @@
+#include <iostream>
+#include <fstream>
+
+#include "opencv2/core/core.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/gpu/gpu.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+inline bool isFlowCorrect(Point2f u) // true when neither component is NaN and both are below 1e9 in magnitude
+{
+    return !(cvIsNaN(u.x) || cvIsNaN(u.y) || fabs(u.x) >= 1e9 || fabs(u.y) >= 1e9);
+}
+
+// Maps a flow vector (fx, fy) to a BGR colour: the direction picks a hue from a colour wheel built on
+// the first call; the magnitude fades the colour from white (<= 1) or darkens it (> 1).
+static Vec3b computeColor(float fx, float fy)
+{
+    // relative lengths of color transitions:
+    // these are chosen based on perceptual similarity
+    // (e.g. one can distinguish more shades between red and yellow
+    //  than between yellow and green)
+    const int RY = 15;
+    const int YG = 6;
+    const int GC = 4;
+    const int CB = 11;
+    const int BM = 13;
+    const int MR = 6;
+    const int NCOLS = RY + YG + GC + CB + BM + MR;
+
+    static bool wheelReady = false;
+    static Vec3i colorWheel[NCOLS];
+
+    if (!wheelReady)
+    {
+        int idx = 0;
+
+        for (int i = 0; i < RY; ++i)
+            colorWheel[idx++] = Vec3i(255, 255 * i / RY, 0);
+
+        for (int i = 0; i < YG; ++i)
+            colorWheel[idx++] = Vec3i(255 - 255 * i / YG, 255, 0);
+
+        for (int i = 0; i < GC; ++i)
+            colorWheel[idx++] = Vec3i(0, 255, 255 * i / GC);
+
+        for (int i = 0; i < CB; ++i)
+            colorWheel[idx++] = Vec3i(0, 255 - 255 * i / CB, 255);
+
+        for (int i = 0; i < BM; ++i)
+            colorWheel[idx++] = Vec3i(255 * i / BM, 0, 255);
+
+        for (int i = 0; i < MR; ++i)
+            colorWheel[idx++] = Vec3i(255, 0, 255 - 255 * i / MR);
+        wheelReady = true;
+    }
+
+    const float radius = sqrt(fx * fx + fy * fy);
+    const float angle = atan2(-fy, -fx) / (float) CV_PI;
+
+    // fractional position on the wheel and the two neighbouring entries that get blended
+    const float wheelPos = (angle + 1.0f) / 2.0f * (NCOLS - 1);
+    const int lower = static_cast<int>(wheelPos);
+    const int upper = (lower + 1) % NCOLS;
+    const float frac = wheelPos - lower;
+
+    Vec3b pix;
+    for (int c = 0; c < 3; ++c)
+    {
+        const float lowerCol = colorWheel[lower][c] / 255.0f;
+        const float upperCol = colorWheel[upper][c] / 255.0f;
+        float col = (1 - frac) * lowerCol + frac * upperCol;
+
+        if (radius <= 1)
+            col = 1 - radius * (1 - col); // increase saturation with radius
+        else
+            col *= .75; // out of range
+
+        pix[2 - c] = static_cast<uchar>(255.0 * col);
+    }
+
+    return pix;
+}
+
+// Colour-codes the flow (flowx, flowy) into dst (CV_8UC3); maxmotion <= 0 means: normalise by the largest valid magnitude (at least 1).
+static void drawOpticalFlow(const Mat_<float>& flowx, const Mat_<float>& flowy, Mat& dst, float maxmotion = -1)
+{
+    dst.create(flowx.size(), CV_8UC3);
+    dst.setTo(Scalar::all(0));
+    // determine motion range:
+    float maxrad = maxmotion;
+    if (maxmotion <= 0)
+    {
+        maxrad = 1;
+        for (int r = 0; r < flowx.rows; ++r)
+        {
+            const float* fxRow = flowx[r];
+            const float* fyRow = flowy[r];
+            for (int c = 0; c < flowx.cols; ++c)
+            {
+                const Point2f u(fxRow[c], fyRow[c]);
+                if (isFlowCorrect(u))
+                    maxrad = max(maxrad, sqrt(u.x * u.x + u.y * u.y));
+            }
+        }
+    }
+
+    for (int r = 0; r < flowx.rows; ++r)
+    {
+        const float* fxRow = flowx[r];
+        const float* fyRow = flowy[r];
+        Vec3b* dstRow = dst.ptr<Vec3b>(r);
+        for (int c = 0; c < flowx.cols; ++c)
+        {
+            const Point2f u(fxRow[c], fyRow[c]);
+            if (isFlowCorrect(u))
+                dstRow[c] = computeColor(u.x / maxrad, u.y / maxrad);
+        }
+    }
+}
+
+// Builds host Mats from the two GpuMat flow components, colour-codes them with a fixed
+// maxmotion of 10 and shows the picture in the window `name`.
+static void showFlow(const char* name, const GpuMat& d_flowx, const GpuMat& d_flowy)
+{
+    const Mat hostFlowX(d_flowx);
+    const Mat hostFlowY(d_flowy);
+    Mat rendered;
+    drawOpticalFlow(hostFlowX, hostFlowY, rendered, 10);
+    imshow(name, rendered);
+}
+
+// Runs the Brox, dense PyrLK, Farneback, TV-L1, BM and fast BM GPU optical flow on two frames, printing the time of each and showing each result.
+int main(int argc, const char* argv[])
+{
+    if (argc < 3)
+    {
+        cerr << "Usage : " << argv[0] << " <frame0> <frame1>" << endl;
+        return -1;
+    }
+
+    Mat frame0 = imread(argv[1], IMREAD_GRAYSCALE);
+    Mat frame1 = imread(argv[2], IMREAD_GRAYSCALE);
+
+    if (frame0.empty())
+    {
+        cerr << "Can't open image ["  << argv[1] << "]" << endl;
+        return -1;
+    }
+    if (frame1.empty())
+    {
+        cerr << "Can't open image ["  << argv[2] << "]" << endl;
+        return -1;
+    }
+    if (frame1.size() != frame0.size())
+    {
+        cerr << "Images should be of equal sizes" << endl;
+        return -1;
+    }
+
+    GpuMat d_frame0(frame0);
+    GpuMat d_frame1(frame1);
+
+    GpuMat d_flowx(frame0.size(), CV_32FC1);
+    GpuMat d_flowy(frame0.size(), CV_32FC1);
+
+    BroxOpticalFlow brox(0.197f, 50.0f, 0.8f, 10, 77, 10);
+    PyrLKOpticalFlow lk; lk.winSize = Size(7, 7);
+    FarnebackOpticalFlow farn;
+    OpticalFlowDual_TVL1_GPU tvl1;
+    FastOpticalFlowBM fastBM;
+    // Brox is fed float frames (values scaled by 1/255); the conversion is done before the timer starts
+    {
+        GpuMat d_frame0f;
+        GpuMat d_frame1f;
+
+        d_frame0.convertTo(d_frame0f, CV_32F, 1.0 / 255.0);
+        d_frame1.convertTo(d_frame1f, CV_32F, 1.0 / 255.0);
+
+        const int64 start = getTickCount();
+
+        brox(d_frame0f, d_frame1f, d_flowx, d_flowy);
+
+        const double timeSec = (getTickCount() - start) / getTickFrequency();
+        cout << "Brox : " << timeSec << " sec" << endl;
+
+        showFlow("Brox", d_flowx, d_flowy);
+    }
+
+    {
+        const int64 start = getTickCount();
+
+        lk.dense(d_frame0, d_frame1, d_flowx, d_flowy);
+
+        const double timeSec = (getTickCount() - start) / getTickFrequency();
+        cout << "LK : " << timeSec << " sec" << endl;
+
+        showFlow("LK", d_flowx, d_flowy);
+    }
+
+    {
+        const int64 start = getTickCount();
+
+        farn(d_frame0, d_frame1, d_flowx, d_flowy);
+
+        const double timeSec = (getTickCount() - start) / getTickFrequency();
+        cout << "Farn : " << timeSec << " sec" << endl;
+
+        showFlow("Farn", d_flowx, d_flowy);
+    }
+
+    {
+        const int64 start = getTickCount();
+
+        tvl1(d_frame0, d_frame1, d_flowx, d_flowy);
+
+        const double timeSec = (getTickCount() - start) / getTickFrequency();
+        cout << "TVL1 : " << timeSec << " sec" << endl;
+
+        showFlow("TVL1", d_flowx, d_flowy);
+    }
+
+    {
+        const int64 start = getTickCount();
+
+        GpuMat buf;
+        calcOpticalFlowBM(d_frame0, d_frame1, Size(7, 7), Size(1, 1), Size(21, 21), false, d_flowx, d_flowy, buf);
+
+        const double timeSec = (getTickCount() - start) / getTickFrequency();
+        cout << "BM : " << timeSec << " sec" << endl;
+
+        showFlow("BM", d_flowx, d_flowy);
+    }
+
+    {
+        const int64 start = getTickCount();
+
+        fastBM(d_frame0, d_frame1, d_flowx, d_flowy);
+
+        const double timeSec = (getTickCount() - start) / getTickFrequency();
+        cout << "Fast BM : " << timeSec << " sec" << endl;
+
+        showFlow("Fast BM", d_flowx, d_flowy);
+    }
+
+    imshow("Frame 0", frame0);
+    imshow("Frame 1", frame1);
+    waitKey();
+
+    return 0;
+}