Merge pull request #3635 from jet47:cuda-optflow-refactoring

2025-07-24 14:06:27 +08:00 · 2015-01-22 09:45:19 +00:00 · 2015-01-22 09:45:19 +00:00 · 3f1fb281be
commit 3f1fb281be
parent 9c81338cb9 710617034b
27 changed files with 1577 additions and 1858 deletions
--- a/modules/cudalegacy/include/opencv2/cudalegacy.hpp
+++ b/modules/cudalegacy/include/opencv2/cudalegacy.hpp
@ -71,8 +71,9 @@ public:

 CV_EXPORTS Ptr<ImagePyramid> createImagePyramid(InputArray img, int nLayers = -1, Stream& stream = Stream::Null());

-////////////////////////////////////////////////////
+//
 // GMG
+//

 /** @brief Background/Foreground Segmentation Algorithm.

@ -125,8 +126,9 @@ public:
 CV_EXPORTS Ptr<cuda::BackgroundSubtractorGMG>
    createBackgroundSubtractorGMG(int initializationFrames = 120, double decisionThreshold = 0.8);

-////////////////////////////////////////////////////
+//
 // FGD
+//

 /** @brief The class discriminates between foreground and background pixels by building and maintaining a model
 of the background.
@ -180,6 +182,51 @@ struct CV_EXPORTS FGDParams
 CV_EXPORTS Ptr<cuda::BackgroundSubtractorFGD>
    createBackgroundSubtractorFGD(const FGDParams& params = FGDParams());

+//
+// Optical flow
+//
+
+//! Calculates optical flow for 2 images using block matching algorithm
+CV_EXPORTS void calcOpticalFlowBM(const GpuMat& prev, const GpuMat& curr,
+                                  Size block_size, Size shift_size, Size max_range, bool use_previous,
+                                  GpuMat& velx, GpuMat& vely, GpuMat& buf,
+                                  Stream& stream = Stream::Null());
+
+class CV_EXPORTS FastOpticalFlowBM
+{
+public:
+    void operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, int search_window = 21, int block_window = 7, Stream& s = Stream::Null());
+
+private:
+    GpuMat buffer;
+    GpuMat extended_I0;
+    GpuMat extended_I1;
+};
+
+/** @brief Interpolates frames (images) using provided optical flow (displacement field).
+
+@param frame0 First frame (32-bit floating point images, single channel).
+@param frame1 Second frame. Must have the same type and size as frame0 .
+@param fu Forward horizontal displacement.
+@param fv Forward vertical displacement.
+@param bu Backward horizontal displacement.
+@param bv Backward vertical displacement.
+@param pos New frame position.
+@param newFrame Output image.
+@param buf Temporary buffer, will have width x 6\*height size, CV_32FC1 type and contain 6
+GpuMat: occlusion masks for first frame, occlusion masks for second, interpolated forward
+horizontal flow, interpolated forward vertical flow, interpolated backward horizontal flow,
+interpolated backward vertical flow.
+@param stream Stream for the asynchronous version.
+ */
+CV_EXPORTS void interpolateFrames(const GpuMat& frame0, const GpuMat& frame1,
+                                  const GpuMat& fu, const GpuMat& fv,
+                                  const GpuMat& bu, const GpuMat& bv,
+                                  float pos, GpuMat& newFrame, GpuMat& buf,
+                                  Stream& stream = Stream::Null());
+
+CV_EXPORTS void createOpticalFlowNeedleMap(const GpuMat& u, const GpuMat& v, GpuMat& vertex, GpuMat& colors);
+
 //! @}

 }}
--- a/modules/cudaoptflow/src/bm.cpp
+++ b/modules/cudaoptflow/src/bm.cpp
--- a/modules/cudaoptflow/src/bm_fast.cpp
+++ b/modules/cudaoptflow/src/bm_fast.cpp
--- a/modules/cudaoptflow/src/cuda/bm.cu
+++ b/modules/cudaoptflow/src/cuda/bm.cu
--- a/modules/cudaoptflow/src/cuda/bm_fast.cu
+++ b/modules/cudaoptflow/src/cuda/bm_fast.cu
--- a/modules/cudaoptflow/src/cuda/needle_map.cu
+++ b/modules/cudaoptflow/src/cuda/needle_map.cu
--- a/modules/cudaoptflow/src/interpolate_frames.cpp
+++ b/modules/cudaoptflow/src/interpolate_frames.cpp
--- a/modules/cudaoptflow/src/needle_map.cpp
+++ b/modules/cudaoptflow/src/needle_map.cpp
--- a/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
+++ b/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
@ -61,49 +61,94 @@ namespace cv { namespace cuda {
 //! @addtogroup cudaoptflow
 //! @{

-/** @brief Class computing the optical flow for two images using Brox et al Optical Flow algorithm
-(@cite Brox2004). :
+//
+// Interface
+//
+
+/** @brief Base interface for dense optical flow algorithms.
 */
-class CV_EXPORTS BroxOpticalFlow
+class CV_EXPORTS DenseOpticalFlow : public Algorithm
 {
 public:
-    BroxOpticalFlow(float alpha_, float gamma_, float scale_factor_, int inner_iterations_, int outer_iterations_, int solver_iterations_) :
-        alpha(alpha_), gamma(gamma_), scale_factor(scale_factor_),
-        inner_iterations(inner_iterations_), outer_iterations(outer_iterations_), solver_iterations(solver_iterations_)
-    {
-    }
+    /** @brief Calculates a dense optical flow.

-    //! Compute optical flow
-    //! frame0 - source frame (supports only CV_32FC1 type)
-    //! frame1 - frame to track (with the same size and type as frame0)
-    //! u      - flow horizontal component (along x axis)
-    //! v      - flow vertical component (along y axis)
-    void operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& stream = Stream::Null());
-
-    //! flow smoothness
-    float alpha;
-
-    //! gradient constancy importance
-    float gamma;
-
-    //! pyramid scale factor
-    float scale_factor;
-
-    //! number of lagged non-linearity iterations (inner loop)
-    int inner_iterations;
-
-    //! number of warping iterations (number of pyramid levels)
-    int outer_iterations;
-
-    //! number of linear system solver iterations
-    int solver_iterations;
-
-    GpuMat buf;
+    @param I0 first input image.
+    @param I1 second input image of the same size and the same type as I0.
+    @param flow computed flow image that has the same size as I0 and type CV_32FC2.
+    @param stream Stream for the asynchronous version.
+     */
+    virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow, Stream& stream = Stream::Null()) = 0;
 };

-/** @brief Class used for calculating an optical flow.
+/** @brief Base interface for sparse optical flow algorithms.
+ */
+class CV_EXPORTS SparseOpticalFlow : public Algorithm
+{
+public:
+    /** @brief Calculates a sparse optical flow.

-The class can calculate an optical flow for a sparse feature set or dense optical flow using the
+    @param prevImg First input image.
+    @param nextImg Second input image of the same size and the same type as prevImg.
+    @param prevPts Vector of 2D points for which the flow needs to be found.
+    @param nextPts Output vector of 2D points containing the calculated new positions of input features in the second image.
+    @param status Output status vector. Each element of the vector is set to 1 if the
+                  flow for the corresponding features has been found. Otherwise, it is set to 0.
+    @param err Optional output vector that contains error response for each point (inverse confidence).
+    @param stream Stream for the asynchronous version.
+     */
+    virtual void calc(InputArray prevImg, InputArray nextImg,
+                      InputArray prevPts, InputOutputArray nextPts,
+                      OutputArray status,
+                      OutputArray err = cv::noArray(),
+                      Stream& stream = Stream::Null()) = 0;
+};
+
+//
+// BroxOpticalFlow
+//
+
+/** @brief Class computing the optical flow for two images using the Brox et al. optical flow algorithm (@cite Brox2004).
+ */
+class CV_EXPORTS BroxOpticalFlow : public DenseOpticalFlow
+{
+public:
+    virtual double getFlowSmoothness() const = 0;
+    virtual void setFlowSmoothness(double alpha) = 0;
+
+    virtual double getGradientConstancyImportance() const = 0;
+    virtual void setGradientConstancyImportance(double gamma) = 0;
+
+    virtual double getPyramidScaleFactor() const = 0;
+    virtual void setPyramidScaleFactor(double scale_factor) = 0;
+
+    //! number of lagged non-linearity iterations (inner loop)
+    virtual int getInnerIterations() const = 0;
+    virtual void setInnerIterations(int inner_iterations) = 0;
+
+    //! number of warping iterations (number of pyramid levels)
+    virtual int getOuterIterations() const = 0;
+    virtual void setOuterIterations(int outer_iterations) = 0;
+
+    //! number of linear system solver iterations
+    virtual int getSolverIterations() const = 0;
+    virtual void setSolverIterations(int solver_iterations) = 0;
+
+    static Ptr<BroxOpticalFlow> create(
+            double alpha = 0.197,
+            double gamma = 50.0,
+            double scale_factor = 0.8,
+            int inner_iterations = 5,
+            int outer_iterations = 150,
+            int solver_iterations = 10);
+};
+
+//
+// PyrLKOpticalFlow
+//
+
+/** @brief Class used for calculating a sparse optical flow.
+
+The class can calculate an optical flow for a sparse feature set using the
 iterative Lucas-Kanade method with pyramids.

@sa calcOpticalFlowPyrLK
@ -112,158 +157,116 @@ iterative Lucas-Kanade method with pyramids.
   -   An example of the Lucas Kanade optical flow algorithm can be found at
        opencv_source_code/samples/gpu/pyrlk_optical_flow.cpp
 */
-class CV_EXPORTS PyrLKOpticalFlow
+class CV_EXPORTS SparsePyrLKOpticalFlow : public SparseOpticalFlow
 {
 public:
-    PyrLKOpticalFlow();
+    virtual Size getWinSize() const = 0;
+    virtual void setWinSize(Size winSize) = 0;

-    /** @brief Calculate an optical flow for a sparse feature set.
+    virtual int getMaxLevel() const = 0;
+    virtual void setMaxLevel(int maxLevel) = 0;

-    @param prevImg First 8-bit input image (supports both grayscale and color images).
-    @param nextImg Second input image of the same size and the same type as prevImg .
-    @param prevPts Vector of 2D points for which the flow needs to be found. It must be one row matrix
-    with CV_32FC2 type.
-    @param nextPts Output vector of 2D points (with single-precision floating-point coordinates)
-    containing the calculated new positions of input features in the second image. When useInitialFlow
-    is true, the vector must have the same size as in the input.
-    @param status Output status vector (CV_8UC1 type). Each element of the vector is set to 1 if the
-    flow for the corresponding features has been found. Otherwise, it is set to 0.
-    @param err Output vector (CV_32FC1 type) that contains the difference between patches around the
-    original and moved points or min eigen value if getMinEigenVals is checked. It can be NULL, if not
-    needed.
+    virtual int getNumIters() const = 0;
+    virtual void setNumIters(int iters) = 0;

-    @sa calcOpticalFlowPyrLK
-     */
-    void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
-        GpuMat& status, GpuMat* err = 0);
+    virtual bool getUseInitialFlow() const = 0;
+    virtual void setUseInitialFlow(bool useInitialFlow) = 0;

-    /** @brief Calculate dense optical flow.
-
-    @param prevImg First 8-bit grayscale input image.
-    @param nextImg Second input image of the same size and the same type as prevImg .
-    @param u Horizontal component of the optical flow of the same size as input images, 32-bit
-    floating-point, single-channel
-    @param v Vertical component of the optical flow of the same size as input images, 32-bit
-    floating-point, single-channel
-    @param err Output vector (CV_32FC1 type) that contains the difference between patches around the
-    original and moved points or min eigen value if getMinEigenVals is checked. It can be NULL, if not
-    needed.
-     */
-    void dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err = 0);
-
-    /** @brief Releases inner buffers memory.
-    */
-    void releaseMemory();
-
-    Size winSize;
-    int maxLevel;
-    int iters;
-    bool useInitialFlow;
-
-private:
-    std::vector<GpuMat> prevPyr_;
-    std::vector<GpuMat> nextPyr_;
-
-    GpuMat buf_;
-
-    GpuMat uPyr_[2];
-    GpuMat vPyr_[2];
+    static Ptr<SparsePyrLKOpticalFlow> create(
+            Size winSize = Size(21, 21),
+            int maxLevel = 3,
+            int iters = 30,
+            bool useInitialFlow = false);
 };

-/** @brief Class computing a dense optical flow using the Gunnar Farneback’s algorithm. :
+/** @brief Class used for calculating a dense optical flow.
+
+The class can calculate a dense optical flow using the
+iterative Lucas-Kanade method with pyramids.
 */
-class CV_EXPORTS FarnebackOpticalFlow
+class CV_EXPORTS DensePyrLKOpticalFlow : public DenseOpticalFlow
 {
 public:
-    FarnebackOpticalFlow()
-    {
-        numLevels = 5;
-        pyrScale = 0.5;
-        fastPyramids = false;
-        winSize = 13;
-        numIters = 10;
-        polyN = 5;
-        polySigma = 1.1;
-        flags = 0;
-    }
+    virtual Size getWinSize() const = 0;
+    virtual void setWinSize(Size winSize) = 0;

-    int numLevels;
-    double pyrScale;
-    bool fastPyramids;
-    int winSize;
-    int numIters;
-    int polyN;
-    double polySigma;
-    int flags;
+    virtual int getMaxLevel() const = 0;
+    virtual void setMaxLevel(int maxLevel) = 0;

-    /** @brief Computes a dense optical flow using the Gunnar Farneback’s algorithm.
+    virtual int getNumIters() const = 0;
+    virtual void setNumIters(int iters) = 0;

-    @param frame0 First 8-bit gray-scale input image
-    @param frame1 Second 8-bit gray-scale input image
-    @param flowx Flow horizontal component
-    @param flowy Flow vertical component
-    @param s Stream
+    virtual bool getUseInitialFlow() const = 0;
+    virtual void setUseInitialFlow(bool useInitialFlow) = 0;

-    @sa calcOpticalFlowFarneback
-     */
-    void operator ()(const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s = Stream::Null());
-
-    /** @brief Releases unused auxiliary memory buffers.
-     */
-    void releaseMemory()
-    {
-        frames_[0].release();
-        frames_[1].release();
-        pyrLevel_[0].release();
-        pyrLevel_[1].release();
-        M_.release();
-        bufM_.release();
-        R_[0].release();
-        R_[1].release();
-        blurredFrame_[0].release();
-        blurredFrame_[1].release();
-        pyramid0_.clear();
-        pyramid1_.clear();
-    }
-
-private:
-    void prepareGaussian(
-            int n, double sigma, float *g, float *xg, float *xxg,
-            double &ig11, double &ig03, double &ig33, double &ig55);
-
-    void setPolynomialExpansionConsts(int n, double sigma);
-
-    void updateFlow_boxFilter(
-            const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
-            GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
-
-    void updateFlow_gaussianBlur(
-            const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
-            GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
-
-    GpuMat frames_[2];
-    GpuMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2];
-    std::vector<GpuMat> pyramid0_, pyramid1_;
+    static Ptr<DensePyrLKOpticalFlow> create(
+            Size winSize = Size(13, 13),
+            int maxLevel = 3,
+            int iters = 30,
+            bool useInitialFlow = false);
 };

-// Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method
 //
-// see reference:
-//   [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
-//   [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
-class CV_EXPORTS OpticalFlowDual_TVL1_CUDA
+// FarnebackOpticalFlow
+//
+
+/** @brief Class computing a dense optical flow using the Gunnar Farneback’s algorithm.
+ */
+class CV_EXPORTS FarnebackOpticalFlow : public DenseOpticalFlow
 {
 public:
-    OpticalFlowDual_TVL1_CUDA();
+    virtual int getNumLevels() const = 0;
+    virtual void setNumLevels(int numLevels) = 0;

-    void operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy);
+    virtual double getPyrScale() const = 0;
+    virtual void setPyrScale(double pyrScale) = 0;

-    void collectGarbage();
+    virtual bool getFastPyramids() const = 0;
+    virtual void setFastPyramids(bool fastPyramids) = 0;

+    virtual int getWinSize() const = 0;
+    virtual void setWinSize(int winSize) = 0;
+
+    virtual int getNumIters() const = 0;
+    virtual void setNumIters(int numIters) = 0;
+
+    virtual int getPolyN() const = 0;
+    virtual void setPolyN(int polyN) = 0;
+
+    virtual double getPolySigma() const = 0;
+    virtual void setPolySigma(double polySigma) = 0;
+
+    virtual int getFlags() const = 0;
+    virtual void setFlags(int flags) = 0;
+
+    static Ptr<FarnebackOpticalFlow> create(
+            int numLevels = 5,
+            double pyrScale = 0.5,
+            bool fastPyramids = false,
+            int winSize = 13,
+            int numIters = 10,
+            int polyN = 5,
+            double polySigma = 1.1,
+            int flags = 0);
+};
+
+//
+// OpticalFlowDual_TVL1
+//
+
+/** @brief Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method.
+ *
+ * @sa C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
+ * @sa Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
+ */
+class CV_EXPORTS OpticalFlowDual_TVL1 : public DenseOpticalFlow
+{
+public:
    /**
     * Time step of the numerical scheme.
     */
-    double tau;
+    virtual double getTau() const = 0;
+    virtual void setTau(double tau) = 0;

    /**
     * Weight parameter for the data term, attachment parameter.
@ -271,7 +274,8 @@ public:
     * The smaller this parameter is, the smoother the solutions we obtain.
     * It depends on the range of motions of the images, so its value should be adapted to each image sequence.
     */
-    double lambda;
+    virtual double getLambda() const = 0;
+    virtual void setLambda(double lambda) = 0;

    /**
     * Weight parameter for (u - v)^2, tightness parameter.
@ -279,20 +283,23 @@ public:
     * In theory, it should have a small value in order to maintain both parts in correspondence.
     * The method is stable for a large range of values of this parameter.
     */
+    virtual double getGamma() const = 0;
+    virtual void setGamma(double gamma) = 0;

-    double gamma;
    /**
-    * parameter used for motion estimation. It adds a variable allowing for illumination variations
-    * Set this parameter to 1. if you have varying illumination.
-    * See: Chambolle et al, A First-Order Primal-Dual Algorithm for Convex Problems with Applications to Imaging
-    * Journal of Mathematical imaging and vision, may 2011 Vol 40 issue 1, pp 120-145
-    */
-    double theta;
+     * Parameter used for motion estimation. It adds a variable allowing for illumination variations.
+     * Set this parameter to 1.0 if you have varying illumination.
+     * See: Chambolle et al., "A First-Order Primal-Dual Algorithm for Convex Problems with Applications to Imaging",
+     * Journal of Mathematical Imaging and Vision, May 2011, Vol. 40, Issue 1, pp. 120-145.
+     */
+    virtual double getTheta() const = 0;
+    virtual void setTheta(double theta) = 0;

    /**
     * Number of scales used to create the pyramid of images.
     */
-    int nscales;
+    virtual int getNumScales() const = 0;
+    virtual void setNumScales(int nscales) = 0;

    /**
     * Number of warpings per scale.
@ -300,94 +307,41 @@ public:
     * This is a parameter that assures the stability of the method.
     * It also affects the running time, so it is a compromise between speed and accuracy.
     */
-    int warps;
+    virtual int getNumWarps() const = 0;
+    virtual void setNumWarps(int warps) = 0;

    /**
     * Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time.
     * A small value will yield more accurate solutions at the expense of a slower convergence.
     */
-    double epsilon;
+    virtual double getEpsilon() const = 0;
+    virtual void setEpsilon(double epsilon) = 0;

    /**
     * Stopping criterion iterations number used in the numerical scheme.
     */
-    int iterations;
+    virtual int getNumIterations() const = 0;
+    virtual void setNumIterations(int iterations) = 0;

-    double scaleStep;
+    virtual double getScaleStep() const = 0;
+    virtual void setScaleStep(double scaleStep) = 0;

-    bool useInitialFlow;
+    virtual bool getUseInitialFlow() const = 0;
+    virtual void setUseInitialFlow(bool useInitialFlow) = 0;

-private:
-    void procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2, GpuMat& u3);
-
-    std::vector<GpuMat> I0s;
-    std::vector<GpuMat> I1s;
-    std::vector<GpuMat> u1s;
-    std::vector<GpuMat> u2s;
-    std::vector<GpuMat> u3s;
-
-    GpuMat I1x_buf;
-    GpuMat I1y_buf;
-
-    GpuMat I1w_buf;
-    GpuMat I1wx_buf;
-    GpuMat I1wy_buf;
-
-    GpuMat grad_buf;
-    GpuMat rho_c_buf;
-
-    GpuMat p11_buf;
-    GpuMat p12_buf;
-    GpuMat p21_buf;
-    GpuMat p22_buf;
-    GpuMat p31_buf;
-    GpuMat p32_buf;
-
-    GpuMat diff_buf;
-    GpuMat norm_buf;
+    static Ptr<OpticalFlowDual_TVL1> create(
+            double tau = 0.25,
+            double lambda = 0.15,
+            double theta = 0.3,
+            int nscales = 5,
+            int warps = 5,
+            double epsilon = 0.01,
+            int iterations = 300,
+            double scaleStep = 0.8,
+            double gamma = 0.0,
+            bool useInitialFlow = false);
 };

-//! Calculates optical flow for 2 images using block matching algorithm */
-CV_EXPORTS void calcOpticalFlowBM(const GpuMat& prev, const GpuMat& curr,
-                                  Size block_size, Size shift_size, Size max_range, bool use_previous,
-                                  GpuMat& velx, GpuMat& vely, GpuMat& buf,
-                                  Stream& stream = Stream::Null());
-
-class CV_EXPORTS FastOpticalFlowBM
-{
-public:
-    void operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, int search_window = 21, int block_window = 7, Stream& s = Stream::Null());
-
-private:
-    GpuMat buffer;
-    GpuMat extended_I0;
-    GpuMat extended_I1;
-};
-
-/** @brief Interpolates frames (images) using provided optical flow (displacement field).
-
-@param frame0 First frame (32-bit floating point images, single channel).
-@param frame1 Second frame. Must have the same type and size as frame0 .
-@param fu Forward horizontal displacement.
-@param fv Forward vertical displacement.
-@param bu Backward horizontal displacement.
-@param bv Backward vertical displacement.
-@param pos New frame position.
-@param newFrame Output image.
-@param buf Temporary buffer, will have width x 6\*height size, CV_32FC1 type and contain 6
-GpuMat: occlusion masks for first frame, occlusion masks for second, interpolated forward
-horizontal flow, interpolated forward vertical flow, interpolated backward horizontal flow,
-interpolated backward vertical flow.
-@param stream Stream for the asynchronous version.
- */
-CV_EXPORTS void interpolateFrames(const GpuMat& frame0, const GpuMat& frame1,
-                                  const GpuMat& fu, const GpuMat& fv,
-                                  const GpuMat& bu, const GpuMat& bv,
-                                  float pos, GpuMat& newFrame, GpuMat& buf,
-                                  Stream& stream = Stream::Null());
-
-CV_EXPORTS void createOpticalFlowNeedleMap(const GpuMat& u, const GpuMat& v, GpuMat& vertex, GpuMat& colors);
-
 //! @}

 }} // namespace cv { namespace cuda {
--- a/modules/cudaoptflow/perf/perf_optflow.cpp
+++ b/modules/cudaoptflow/perf/perf_optflow.cpp
@ -46,91 +46,10 @@ using namespace std;
 using namespace testing;
 using namespace perf;

-//////////////////////////////////////////////////////
-// InterpolateFrames
-
 typedef pair<string, string> pair_string;

 DEF_PARAM_TEST_1(ImagePair, pair_string);

-PERF_TEST_P(ImagePair, InterpolateFrames,
-    Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
-{
-    cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame0.empty());
-
-    cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame1.empty());
-
-    frame0.convertTo(frame0, CV_32FC1, 1.0 / 255.0);
-    frame1.convertTo(frame1, CV_32FC1, 1.0 / 255.0);
-
-    if (PERF_RUN_CUDA())
-    {
-        const cv::cuda::GpuMat d_frame0(frame0);
-        const cv::cuda::GpuMat d_frame1(frame1);
-        cv::cuda::GpuMat d_fu, d_fv;
-        cv::cuda::GpuMat d_bu, d_bv;
-
-        cv::cuda::BroxOpticalFlow d_flow(0.197f /*alpha*/, 50.0f /*gamma*/, 0.8f /*scale_factor*/,
-            10 /*inner_iterations*/, 77 /*outer_iterations*/, 10 /*solver_iterations*/);
-
-        d_flow(d_frame0, d_frame1, d_fu, d_fv);
-        d_flow(d_frame1, d_frame0, d_bu, d_bv);
-
-        cv::cuda::GpuMat newFrame;
-        cv::cuda::GpuMat d_buf;
-
-        TEST_CYCLE() cv::cuda::interpolateFrames(d_frame0, d_frame1, d_fu, d_fv, d_bu, d_bv, 0.5f, newFrame, d_buf);
-
-        CUDA_SANITY_CHECK(newFrame, 1e-4);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-//////////////////////////////////////////////////////
-// CreateOpticalFlowNeedleMap
-
-PERF_TEST_P(ImagePair, CreateOpticalFlowNeedleMap,
-            Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
-{
-    cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame0.empty());
-
-    cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame1.empty());
-
-    frame0.convertTo(frame0, CV_32FC1, 1.0 / 255.0);
-    frame1.convertTo(frame1, CV_32FC1, 1.0 / 255.0);
-
-    if (PERF_RUN_CUDA())
-    {
-        const cv::cuda::GpuMat d_frame0(frame0);
-        const cv::cuda::GpuMat d_frame1(frame1);
-        cv::cuda::GpuMat u;
-        cv::cuda::GpuMat v;
-
-        cv::cuda::BroxOpticalFlow d_flow(0.197f /*alpha*/, 50.0f /*gamma*/, 0.8f /*scale_factor*/,
-                                        10 /*inner_iterations*/, 77 /*outer_iterations*/, 10 /*solver_iterations*/);
-
-        d_flow(d_frame0, d_frame1, u, v);
-
-        cv::cuda::GpuMat vertex, colors;
-
-        TEST_CYCLE() cv::cuda::createOpticalFlowNeedleMap(u, v, vertex, colors);
-
-        CUDA_SANITY_CHECK(vertex, 1e-6);
-        CUDA_SANITY_CHECK(colors);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
 //////////////////////////////////////////////////////
 // BroxOpticalFlow

@ -152,13 +71,19 @@ PERF_TEST_P(ImagePair, BroxOpticalFlow,
    {
        const cv::cuda::GpuMat d_frame0(frame0);
        const cv::cuda::GpuMat d_frame1(frame1);
-        cv::cuda::GpuMat u;
-        cv::cuda::GpuMat v;
+        cv::cuda::GpuMat flow;

-        cv::cuda::BroxOpticalFlow d_flow(0.197f /*alpha*/, 50.0f /*gamma*/, 0.8f /*scale_factor*/,
-                                        10 /*inner_iterations*/, 77 /*outer_iterations*/, 10 /*solver_iterations*/);
+        cv::Ptr<cv::cuda::BroxOpticalFlow> d_alg =
+                cv::cuda::BroxOpticalFlow::create(0.197 /*alpha*/, 50.0 /*gamma*/, 0.8 /*scale_factor*/,
+                                                  10 /*inner_iterations*/, 77 /*outer_iterations*/, 10 /*solver_iterations*/);

-        TEST_CYCLE() d_flow(d_frame0, d_frame1, u, v);
+        TEST_CYCLE() d_alg->calc(d_frame0, d_frame1, flow);
+
+        cv::cuda::GpuMat flows[2];
+        cv::cuda::split(flow, flows);
+
+        cv::cuda::GpuMat u = flows[0];
+        cv::cuda::GpuMat v = flows[1];

        CUDA_SANITY_CHECK(u, 1e-1);
        CUDA_SANITY_CHECK(v, 1e-1);
@ -210,17 +135,17 @@ PERF_TEST_P(ImagePair_Gray_NPts_WinSz_Levels_Iters, PyrLKOpticalFlowSparse,
    {
        const cv::cuda::GpuMat d_pts(pts.reshape(2, 1));

-        cv::cuda::PyrLKOpticalFlow d_pyrLK;
-        d_pyrLK.winSize = cv::Size(winSize, winSize);
-        d_pyrLK.maxLevel = levels - 1;
-        d_pyrLK.iters = iters;
+        cv::Ptr<cv::cuda::SparsePyrLKOpticalFlow> d_pyrLK =
+                cv::cuda::SparsePyrLKOpticalFlow::create(cv::Size(winSize, winSize),
+                                                         levels - 1,
+                                                         iters);

        const cv::cuda::GpuMat d_frame0(frame0);
        const cv::cuda::GpuMat d_frame1(frame1);
        cv::cuda::GpuMat nextPts;
        cv::cuda::GpuMat status;

-        TEST_CYCLE() d_pyrLK.sparse(d_frame0, d_frame1, d_pts, nextPts, status);
+        TEST_CYCLE() d_pyrLK->calc(d_frame0, d_frame1, d_pts, nextPts, status);

        CUDA_SANITY_CHECK(nextPts);
        CUDA_SANITY_CHECK(status);
@ -270,15 +195,20 @@ PERF_TEST_P(ImagePair_WinSz_Levels_Iters, PyrLKOpticalFlowDense,
    {
        const cv::cuda::GpuMat d_frame0(frame0);
        const cv::cuda::GpuMat d_frame1(frame1);
-        cv::cuda::GpuMat u;
-        cv::cuda::GpuMat v;
+        cv::cuda::GpuMat flow;

-        cv::cuda::PyrLKOpticalFlow d_pyrLK;
-        d_pyrLK.winSize = cv::Size(winSize, winSize);
-        d_pyrLK.maxLevel = levels - 1;
-        d_pyrLK.iters = iters;
+        cv::Ptr<cv::cuda::DensePyrLKOpticalFlow> d_pyrLK =
+                cv::cuda::DensePyrLKOpticalFlow::create(cv::Size(winSize, winSize),
+                                                        levels - 1,
+                                                        iters);

-        TEST_CYCLE() d_pyrLK.dense(d_frame0, d_frame1, u, v);
+        TEST_CYCLE() d_pyrLK->calc(d_frame0, d_frame1, flow);
+
+        cv::cuda::GpuMat flows[2];
+        cv::cuda::split(flow, flows);
+
+        cv::cuda::GpuMat u = flows[0];
+        cv::cuda::GpuMat v = flows[1];

        CUDA_SANITY_CHECK(u);
        CUDA_SANITY_CHECK(v);
@ -315,19 +245,19 @@ PERF_TEST_P(ImagePair, FarnebackOpticalFlow,
    {
        const cv::cuda::GpuMat d_frame0(frame0);
        const cv::cuda::GpuMat d_frame1(frame1);
-        cv::cuda::GpuMat u;
-        cv::cuda::GpuMat v;
+        cv::cuda::GpuMat flow;

-        cv::cuda::FarnebackOpticalFlow d_farneback;
-        d_farneback.numLevels = numLevels;
-        d_farneback.pyrScale = pyrScale;
-        d_farneback.winSize = winSize;
-        d_farneback.numIters = numIters;
-        d_farneback.polyN = polyN;
-        d_farneback.polySigma = polySigma;
-        d_farneback.flags = flags;
+        cv::Ptr<cv::cuda::FarnebackOpticalFlow> d_farneback =
+                cv::cuda::FarnebackOpticalFlow::create(numLevels, pyrScale, false, winSize,
+                                                       numIters, polyN, polySigma, flags);

-        TEST_CYCLE() d_farneback(d_frame0, d_frame1, u, v);
+        TEST_CYCLE() d_farneback->calc(d_frame0, d_frame1, flow);
+
+        cv::cuda::GpuMat flows[2];
+        cv::cuda::split(flow, flows);
+
+        cv::cuda::GpuMat u = flows[0];
+        cv::cuda::GpuMat v = flows[1];

        CUDA_SANITY_CHECK(u, 1e-4);
        CUDA_SANITY_CHECK(v, 1e-4);
@ -360,12 +290,18 @@ PERF_TEST_P(ImagePair, OpticalFlowDual_TVL1,
    {
        const cv::cuda::GpuMat d_frame0(frame0);
        const cv::cuda::GpuMat d_frame1(frame1);
-        cv::cuda::GpuMat u;
-        cv::cuda::GpuMat v;
+        cv::cuda::GpuMat flow;

-        cv::cuda::OpticalFlowDual_TVL1_CUDA d_alg;
+        cv::Ptr<cv::cuda::OpticalFlowDual_TVL1> d_alg =
+                cv::cuda::OpticalFlowDual_TVL1::create();

-        TEST_CYCLE() d_alg(d_frame0, d_frame1, u, v);
+        TEST_CYCLE() d_alg->calc(d_frame0, d_frame1, flow);
+
+        cv::cuda::GpuMat flows[2];
+        cv::cuda::split(flow, flows);
+
+        cv::cuda::GpuMat u = flows[0];
+        cv::cuda::GpuMat v = flows[1];

        CUDA_SANITY_CHECK(u, 1e-1);
        CUDA_SANITY_CHECK(v, 1e-1);
@ -383,72 +319,3 @@ PERF_TEST_P(ImagePair, OpticalFlowDual_TVL1,
        CPU_SANITY_CHECK(flow);
    }
 }
-
-//////////////////////////////////////////////////////
-// OpticalFlowBM
-
-PERF_TEST_P(ImagePair, OpticalFlowBM,
-    Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
-{
-    declare.time(400);
-
-    const cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame0.empty());
-
-    const cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame1.empty());
-
-    const cv::Size block_size(16, 16);
-    const cv::Size shift_size(1, 1);
-    const cv::Size max_range(16, 16);
-
-    if (PERF_RUN_CUDA())
-    {
-        const cv::cuda::GpuMat d_frame0(frame0);
-        const cv::cuda::GpuMat d_frame1(frame1);
-        cv::cuda::GpuMat u, v, buf;
-
-        TEST_CYCLE() cv::cuda::calcOpticalFlowBM(d_frame0, d_frame1, block_size, shift_size, max_range, false, u, v, buf);
-
-        CUDA_SANITY_CHECK(u);
-        CUDA_SANITY_CHECK(v);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-PERF_TEST_P(ImagePair, DISABLED_FastOpticalFlowBM,
-    Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
-{
-    declare.time(400);
-
-    const cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame0.empty());
-
-    const cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame1.empty());
-
-    const cv::Size block_size(16, 16);
-    const cv::Size shift_size(1, 1);
-    const cv::Size max_range(16, 16);
-
-    if (PERF_RUN_CUDA())
-    {
-        const cv::cuda::GpuMat d_frame0(frame0);
-        const cv::cuda::GpuMat d_frame1(frame1);
-        cv::cuda::GpuMat u, v;
-
-        cv::cuda::FastOpticalFlowBM fastBM;
-
-        TEST_CYCLE() fastBM(d_frame0, d_frame1, u, v, max_range.width, block_size.width);
-
-        CUDA_SANITY_CHECK(u, 2);
-        CUDA_SANITY_CHECK(v, 2);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
--- a/modules/cudaoptflow/perf/perf_precomp.hpp
+++ b/modules/cudaoptflow/perf/perf_precomp.hpp
@ -55,6 +55,7 @@
 #include "opencv2/ts/cuda_perf.hpp"

 #include "opencv2/cudaoptflow.hpp"
+#include "opencv2/cudaarithm.hpp"
 #include "opencv2/video.hpp"

 #ifdef GTEST_CREATE_SHARED_LIBRARY
--- a/modules/cudaoptflow/src/brox.cpp
+++ b/modules/cudaoptflow/src/brox.cpp
@ -47,84 +47,148 @@ using namespace cv::cuda;

 #if !defined (HAVE_CUDA) || !defined (HAVE_OPENCV_CUDALEGACY) || defined (CUDA_DISABLER)

-void cv::cuda::BroxOpticalFlow::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+Ptr<BroxOpticalFlow> cv::cuda::BroxOpticalFlow::create(double, double, double, int, int, int) { throw_no_cuda(); return Ptr<BroxOpticalFlow>(); }

 #else

-namespace
-{
-    size_t getBufSize(const NCVBroxOpticalFlowDescriptor& desc, const NCVMatrix<Ncv32f>& frame0, const NCVMatrix<Ncv32f>& frame1,
-                      NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v, const cudaDeviceProp& devProp)
+namespace {
+
+    class BroxOpticalFlowImpl : public BroxOpticalFlow
    {
-        NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
+    public:
+        BroxOpticalFlowImpl(double alpha, double gamma, double scale_factor, // public API takes double; members are stored as float
+                            int inner_iterations, int outer_iterations, int solver_iterations) :
+            alpha_(static_cast<float>(alpha)), gamma_(static_cast<float>(gamma)), scale_factor_(static_cast<float>(scale_factor)), // explicit narrowing, same as the setters
+            inner_iterations_(inner_iterations), outer_iterations_(outer_iterations),
+            solver_iterations_(solver_iterations)
+        {
+        }
+
+        virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow, Stream& stream);
+
+        virtual double getFlowSmoothness() const { return alpha_; }
+        virtual void setFlowSmoothness(double alpha) { alpha_ = static_cast<float>(alpha); }
+
+        virtual double getGradientConstancyImportance() const { return gamma_; }
+        virtual void setGradientConstancyImportance(double gamma) { gamma_ = static_cast<float>(gamma); }
+
+        virtual double getPyramidScaleFactor() const { return scale_factor_; }
+        virtual void setPyramidScaleFactor(double scale_factor) { scale_factor_ = static_cast<float>(scale_factor); }
+
+        //! number of lagged non-linearity iterations (inner loop)
+        virtual int getInnerIterations() const { return inner_iterations_; }
+        virtual void setInnerIterations(int inner_iterations) { inner_iterations_ = inner_iterations; }
+
+        //! number of warping iterations (number of pyramid levels)
+        virtual int getOuterIterations() const { return outer_iterations_; }
+        virtual void setOuterIterations(int outer_iterations) { outer_iterations_ = outer_iterations; }
+
+        //! number of linear system solver iterations
+        virtual int getSolverIterations() const { return solver_iterations_; }
+        virtual void setSolverIterations(int solver_iterations) { solver_iterations_ = solver_iterations; }
+
+    private:
+        //! flow smoothness
+        float alpha_;
+
+        //! gradient constancy importance
+        float gamma_;
+
+        //! pyramid scale factor
+        float scale_factor_;
+
+        //! number of lagged non-linearity iterations (inner loop)
+        int inner_iterations_;
+
+        //! number of warping iterations (number of pyramid levels)
+        int outer_iterations_;
+
+        //! number of linear system solver iterations
+        int solver_iterations_;
+    };
+
+    static size_t getBufSize(const NCVBroxOpticalFlowDescriptor& desc,
+                             const NCVMatrix<Ncv32f>& frame0, const NCVMatrix<Ncv32f>& frame1,
+                             NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v,
+                             size_t textureAlignment)
+    {
+        NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(textureAlignment));

        ncvSafeCall( NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0) );

        return gpuCounter.maxSize();
    }
+
+    static void outputHandler(const String &msg)
+    {
+        CV_Error(cv::Error::GpuApiCallError, msg.c_str());
+    }
+
+    void BroxOpticalFlowImpl::calc(InputArray _I0, InputArray _I1, InputOutputArray _flow, Stream& stream) // runs NCVBroxOpticalFlow on two frames and merges the resulting u/v planes into _flow
+    {
+        const GpuMat frame0 = _I0.getGpuMat();
+        const GpuMat frame1 = _I1.getGpuMat();
+
+        CV_Assert( frame0.type() == CV_32FC1 ); // only single-channel float input is accepted
+        CV_Assert( frame1.size() == frame0.size() && frame1.type() == frame0.type() ); // both frames must match in size and type
+
+        ncvSetDebugOutputHandler(outputHandler); // outputHandler turns NCV messages into CV_Error(GpuApiCallError)
+
+        BufferPool pool(stream); // temporaries below are taken from the pool associated with `stream`
+        GpuMat u = pool.getBuffer(frame0.size(), CV_32FC1); // first flow component, later merged as channel 0
+        GpuMat v = pool.getBuffer(frame0.size(), CV_32FC1); // second flow component, later merged as channel 1
+
+        NCVBroxOpticalFlowDescriptor desc; // copy the algorithm parameters held by this object into the NCV descriptor
+        desc.alpha = alpha_;
+        desc.gamma = gamma_;
+        desc.scale_factor = scale_factor_;
+        desc.number_of_inner_iterations = inner_iterations_;
+        desc.number_of_outer_iterations = outer_iterations_;
+        desc.number_of_solver_iterations = solver_iterations_;
+
+        NCVMemSegment frame0MemSeg; // describes frame0's existing device memory (pointer + byte size) for NCV
+        frame0MemSeg.begin.memtype = NCVMemoryTypeDevice;
+        frame0MemSeg.begin.ptr = const_cast<uchar*>(frame0.data); // const is cast away only to fit the NCV struct
+        frame0MemSeg.size = frame0.step * frame0.rows;
+
+        NCVMemSegment frame1MemSeg; // same for frame1
+        frame1MemSeg.begin.memtype = NCVMemoryTypeDevice;
+        frame1MemSeg.begin.ptr = const_cast<uchar*>(frame1.data);
+        frame1MemSeg.size = frame1.step * frame1.rows;
+
+        NCVMemSegment uMemSeg; // same for the u output buffer
+        uMemSeg.begin.memtype = NCVMemoryTypeDevice;
+        uMemSeg.begin.ptr = u.ptr();
+        uMemSeg.size = u.step * u.rows;
+
+        NCVMemSegment vMemSeg; // same for the v output buffer
+        vMemSeg.begin.memtype = NCVMemoryTypeDevice;
+        vMemSeg.begin.ptr = v.ptr();
+        vMemSeg.size = v.step * v.rows;
+
+        DeviceInfo devInfo; // NOTE(review): default-constructed; presumably refers to the current device - confirm
+        size_t textureAlignment = devInfo.textureAlignment(); // alignment passed to every NCV matrix/allocator below
+
+        NCVMatrixReuse<Ncv32f> frame0Mat(frame0MemSeg, static_cast<Ncv32u>(textureAlignment), frame0.cols, frame0.rows, static_cast<Ncv32u>(frame0.step));
+        NCVMatrixReuse<Ncv32f> frame1Mat(frame1MemSeg, static_cast<Ncv32u>(textureAlignment), frame1.cols, frame1.rows, static_cast<Ncv32u>(frame1.step));
+        NCVMatrixReuse<Ncv32f> uMat(uMemSeg, static_cast<Ncv32u>(textureAlignment), u.cols, u.rows, static_cast<Ncv32u>(u.step));
+        NCVMatrixReuse<Ncv32f> vMat(vMemSeg, static_cast<Ncv32u>(textureAlignment), v.cols, v.rows, static_cast<Ncv32u>(v.step));
+
+        size_t bufSize = getBufSize(desc, frame0Mat, frame1Mat, uMat, vMat, textureAlignment); // first pass: getBufSize() reports the scratch size NCVBroxOpticalFlow needs
+        GpuMat buf = pool.getBuffer(1, static_cast<int>(bufSize), CV_8UC1); // scratch memory as a single row of bufSize bytes
+
+        NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(textureAlignment), buf.ptr()); // allocator constructed over buf's memory
+
+        ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, StreamAccessor::getStream(stream)) ); // second pass: the actual computation, issued on the caller's stream
+
+        GpuMat flows[] = {u, v};
+        cuda::merge(flows, 2, _flow, stream); // interleave u and v into the 2-channel output
+    }
 }

-namespace
+Ptr<BroxOpticalFlow> cv::cuda::BroxOpticalFlow::create(double alpha, double gamma, double scale_factor, int inner_iterations, int outer_iterations, int solver_iterations)
 {
-    static void outputHandler(const String &msg) { CV_Error(cv::Error::GpuApiCallError, msg.c_str()); }
-}
-
-void cv::cuda::BroxOpticalFlow::operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& s)
-{
-    ncvSetDebugOutputHandler(outputHandler);
-
-    CV_Assert(frame0.type() == CV_32FC1);
-    CV_Assert(frame1.size() == frame0.size() && frame1.type() == frame0.type());
-
-    u.create(frame0.size(), CV_32FC1);
-    v.create(frame0.size(), CV_32FC1);
-
-    cudaDeviceProp devProp;
-    cudaSafeCall( cudaGetDeviceProperties(&devProp, getDevice()) );
-
-    NCVBroxOpticalFlowDescriptor desc;
-
-    desc.alpha = alpha;
-    desc.gamma = gamma;
-    desc.scale_factor = scale_factor;
-    desc.number_of_inner_iterations = inner_iterations;
-    desc.number_of_outer_iterations = outer_iterations;
-    desc.number_of_solver_iterations = solver_iterations;
-
-    NCVMemSegment frame0MemSeg;
-    frame0MemSeg.begin.memtype = NCVMemoryTypeDevice;
-    frame0MemSeg.begin.ptr = const_cast<uchar*>(frame0.data);
-    frame0MemSeg.size = frame0.step * frame0.rows;
-
-    NCVMemSegment frame1MemSeg;
-    frame1MemSeg.begin.memtype = NCVMemoryTypeDevice;
-    frame1MemSeg.begin.ptr = const_cast<uchar*>(frame1.data);
-    frame1MemSeg.size = frame1.step * frame1.rows;
-
-    NCVMemSegment uMemSeg;
-    uMemSeg.begin.memtype = NCVMemoryTypeDevice;
-    uMemSeg.begin.ptr = u.ptr();
-    uMemSeg.size = u.step * u.rows;
-
-    NCVMemSegment vMemSeg;
-    vMemSeg.begin.memtype = NCVMemoryTypeDevice;
-    vMemSeg.begin.ptr = v.ptr();
-    vMemSeg.size = v.step * v.rows;
-
-    NCVMatrixReuse<Ncv32f> frame0Mat(frame0MemSeg, static_cast<Ncv32u>(devProp.textureAlignment), frame0.cols, frame0.rows, static_cast<Ncv32u>(frame0.step));
-    NCVMatrixReuse<Ncv32f> frame1Mat(frame1MemSeg, static_cast<Ncv32u>(devProp.textureAlignment), frame1.cols, frame1.rows, static_cast<Ncv32u>(frame1.step));
-    NCVMatrixReuse<Ncv32f> uMat(uMemSeg, static_cast<Ncv32u>(devProp.textureAlignment), u.cols, u.rows, static_cast<Ncv32u>(u.step));
-    NCVMatrixReuse<Ncv32f> vMat(vMemSeg, static_cast<Ncv32u>(devProp.textureAlignment), v.cols, v.rows, static_cast<Ncv32u>(v.step));
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    size_t bufSize = getBufSize(desc, frame0Mat, frame1Mat, uMat, vMat, devProp);
-
-    ensureSizeIsEnough(1, static_cast<int>(bufSize), CV_8UC1, buf);
-
-    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), buf.ptr());
-
-    ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream) );
+    return makePtr<BroxOpticalFlowImpl>(alpha, gamma, scale_factor, inner_iterations, outer_iterations, solver_iterations);
 }

 #endif /* HAVE_CUDA */
--- a/modules/cudaoptflow/src/cuda/pyrlk.cu
+++ b/modules/cudaoptflow/src/cuda/pyrlk.cu
@ -472,16 +472,16 @@ namespace pyrlk
        }
    }

-    void loadConstants(int2 winSize, int iters)
+    void loadConstants(int2 winSize, int iters, cudaStream_t stream)
    {
-        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) );
-        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_y, &winSize.y, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_x, &winSize.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_y, &winSize.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );

        int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
-        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_x, &halfWin.x, sizeof(int)) );
-        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_y, &halfWin.y, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_x, &halfWin.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_y, &halfWin.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );

-        cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_iters, &iters, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
    }

    void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
--- a/modules/cudaoptflow/src/cuda/tvl1flow.cu
+++ b/modules/cudaoptflow/src/cuda/tvl1flow.cu
@ -66,15 +66,16 @@ namespace tvl1flow
        dy(y, x) = 0.5f * (src(::min(y + 1, src.rows - 1), x) - src(::max(y - 1, 0), x));
    }

-    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy)
+    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy, cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));

-        centeredGradientKernel<<<grid, block>>>(src, dx, dy);
+        centeredGradientKernel<<<grid, block, 0, stream>>>(src, dx, dy);
        cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
    }
 }

@ -164,7 +165,10 @@ namespace tvl1flow
        rho(y, x) = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
    }

-    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho)
+    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y,
+                      PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx,
+                      PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho,
+                      cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
@ -173,10 +177,11 @@ namespace tvl1flow
        bindTexture(&tex_I1x, I1x);
        bindTexture(&tex_I1y, I1y);

-        warpBackwardKernel<<<grid, block>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
+        warpBackwardKernel<<<grid, block, 0, stream>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
        cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
    }
 }

@ -292,15 +297,17 @@ namespace tvl1flow
                   PtrStepSzf grad, PtrStepSzf rho_c,
                   PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
                   PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf error,
-                   float l_t, float theta, float gamma, bool calcError)
+                   float l_t, float theta, float gamma, bool calcError,
+                   cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(I1wx.cols, block.x), divUp(I1wx.rows, block.y));

-        estimateUKernel<<<grid, block>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, error, l_t, theta, gamma, calcError);
+        estimateUKernel<<<grid, block, 0, stream>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, error, l_t, theta, gamma, calcError);
        cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
    }
 }

@ -346,15 +353,19 @@ namespace tvl1flow
        }
    }

-    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32, float taut, float gamma)
+    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3,
+                               PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
+                               float taut, float gamma,
+                               cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(u1.cols, block.x), divUp(u1.rows, block.y));

-        estimateDualVariablesKernel<<<grid, block>>>(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma);
+        estimateDualVariablesKernel<<<grid, block, 0, stream>>>(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma);
        cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
    }
 }

--- a/modules/cudaoptflow/src/farneback.cpp
+++ b/modules/cudaoptflow/src/farneback.cpp
@ -42,23 +42,21 @@

 #include "precomp.hpp"

-#define MIN_SIZE 32
-
-#define S(x) StreamAccessor::getStream(x)
-
-// CUDA resize() is fast, but it differs from the CPU analog. Disabling this flag
-// leads to an inefficient code. It's for debug purposes only.
-#define ENABLE_CUDA_RESIZE 1
-
 using namespace cv;
 using namespace cv::cuda;

 #if !defined HAVE_CUDA || defined(CUDA_DISABLER)

-void cv::cuda::FarnebackOpticalFlow::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+Ptr<FarnebackOpticalFlow> cv::cuda::FarnebackOpticalFlow::create(int, double, bool, int, int, int, double, int) { throw_no_cuda(); return Ptr<FarnebackOpticalFlow>(); } // stub for builds without CUDA; the return only satisfies the signature

 #else

+#define MIN_SIZE 32
+
+// CUDA resize() is fast, but it differs from the CPU analog. Disabling this flag
+// leads to inefficient code. It's for debugging purposes only.
+#define ENABLE_CUDA_RESIZE 1
+
 namespace cv { namespace cuda { namespace device { namespace optflow_farneback
 {
    void setPolynomialExpansionConsts(
@ -76,8 +74,6 @@ namespace cv { namespace cuda { namespace device { namespace optflow_farneback
    void updateFlowGpu(
            const PtrStepSzf M, PtrStepSzf flowx, PtrStepSzf flowy, cudaStream_t stream);

-    /*void boxFilterGpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);*/
-
    void boxFilter5Gpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);

    void boxFilter5Gpu_CC11(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);
@ -93,10 +89,93 @@ namespace cv { namespace cuda { namespace device { namespace optflow_farneback
    void gaussianBlur5Gpu_CC11(
            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderType, cudaStream_t stream);

-}}}} // namespace cv { namespace cuda { namespace cudev { namespace optflow_farneback
+}}}}

 namespace
 {
+    class FarnebackOpticalFlowImpl : public FarnebackOpticalFlow
+    {
+    public:
+        FarnebackOpticalFlowImpl(int numLevels, double pyrScale, bool fastPyramids, int winSize,
+                                 int numIters, int polyN, double polySigma, int flags) :
+            numLevels_(numLevels), pyrScale_(pyrScale), fastPyramids_(fastPyramids), winSize_(winSize),
+            numIters_(numIters), polyN_(polyN), polySigma_(polySigma), flags_(flags)
+        {
+        }
+
+        virtual int getNumLevels() const { return numLevels_; }
+        virtual void setNumLevels(int numLevels) { numLevels_ = numLevels; }
+
+        virtual double getPyrScale() const { return pyrScale_; }
+        virtual void setPyrScale(double pyrScale) { pyrScale_ = pyrScale; }
+
+        virtual bool getFastPyramids() const { return fastPyramids_; }
+        virtual void setFastPyramids(bool fastPyramids) { fastPyramids_ = fastPyramids; }
+
+        virtual int getWinSize() const { return winSize_; }
+        virtual void setWinSize(int winSize) { winSize_ = winSize; }
+
+        virtual int getNumIters() const { return numIters_; }
+        virtual void setNumIters(int numIters) { numIters_ = numIters; }
+
+        virtual int getPolyN() const { return polyN_; }
+        virtual void setPolyN(int polyN) { polyN_ = polyN; }
+
+        virtual double getPolySigma() const { return polySigma_; }
+        virtual void setPolySigma(double polySigma) { polySigma_ = polySigma; }
+
+        virtual int getFlags() const { return flags_; }
+        virtual void setFlags(int flags) { flags_ = flags; }
+
+        virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow, Stream& stream);
+
+    private:
+        int numLevels_;
+        double pyrScale_;
+        bool fastPyramids_;
+        int winSize_;
+        int numIters_;
+        int polyN_;
+        double polySigma_;
+        int flags_;
+
+    private:
+        void prepareGaussian(
+                int n, double sigma, float *g, float *xg, float *xxg,
+                double &ig11, double &ig03, double &ig33, double &ig55);
+
+        void setPolynomialExpansionConsts(int n, double sigma);
+
+        void updateFlow_boxFilter(
+                const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
+                GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
+
+        void updateFlow_gaussianBlur(
+                const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
+                GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
+
+        void calcImpl(const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &stream);
+
+        GpuMat frames_[2];
+        GpuMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2];
+        std::vector<GpuMat> pyramid0_, pyramid1_;
+    };
+
+    void FarnebackOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOutputArray _flow, Stream& stream) // public entry point: runs calcImpl() and packs the two flow planes into _flow
+    {
+        const GpuMat frame0 = _frame0.getGpuMat();
+        const GpuMat frame1 = _frame1.getGpuMat();
+
+        BufferPool pool(stream); // temporaries below are taken from the pool associated with `stream`
+        GpuMat flowx = pool.getBuffer(frame0.size(), CV_32FC1); // NOTE(review): nothing from _flow is copied into flowx/flowy here - confirm how OPTFLOW_USE_INITIAL_FLOW is meant to be seeded
+        GpuMat flowy = pool.getBuffer(frame0.size(), CV_32FC1);
+
+        calcImpl(frame0, frame1, flowx, flowy, stream); // input validation (channels, size, polyN, pyrScale) happens inside calcImpl
+
+        GpuMat flows[] = {flowx, flowy};
+        cuda::merge(flows, 2, _flow, stream); // interleave flowx and flowy into the 2-channel output
+    }
+
    GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat& mat)
    {
        if (!mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols)
@ -104,285 +183,287 @@ namespace

        return mat = GpuMat(rows, cols, type);
    }
-}

-void cv::cuda::FarnebackOpticalFlow::prepareGaussian(
-        int n, double sigma, float *g, float *xg, float *xxg,
-        double &ig11, double &ig03, double &ig33, double &ig55)
-{
-    double s = 0.;
-    for (int x = -n; x <= n; x++)
-    {
-        g[x] = (float)std::exp(-x*x/(2*sigma*sigma));
-        s += g[x];
-    }
-
-    s = 1./s;
-    for (int x = -n; x <= n; x++)
-    {
-        g[x] = (float)(g[x]*s);
-        xg[x] = (float)(x*g[x]);
-        xxg[x] = (float)(x*x*g[x]);
-    }
-
-    Mat_<double> G(6, 6);
-    G.setTo(0);
-
-    for (int y = -n; y <= n; y++)
+    void FarnebackOpticalFlowImpl::prepareGaussian(
+            int n, double sigma, float *g, float *xg, float *xxg,
+            double &ig11, double &ig03, double &ig33, double &ig55)
    {
+        double s = 0.;
        for (int x = -n; x <= n; x++)
        {
-            G(0,0) += g[y]*g[x];
-            G(1,1) += g[y]*g[x]*x*x;
-            G(3,3) += g[y]*g[x]*x*x*x*x;
-            G(5,5) += g[y]*g[x]*x*x*y*y;
-        }
-    }
-
-    //G[0][0] = 1.;
-    G(2,2) = G(0,3) = G(0,4) = G(3,0) = G(4,0) = G(1,1);
-    G(4,4) = G(3,3);
-    G(3,4) = G(4,3) = G(5,5);
-
-    // invG:
-    // [ x        e  e    ]
-    // [    y             ]
-    // [       y          ]
-    // [ e        z       ]
-    // [ e           z    ]
-    // [                u ]
-    Mat_<double> invG = G.inv(DECOMP_CHOLESKY);
-
-    ig11 = invG(1,1);
-    ig03 = invG(0,3);
-    ig33 = invG(3,3);
-    ig55 = invG(5,5);
-}
-
-
-void cv::cuda::FarnebackOpticalFlow::setPolynomialExpansionConsts(int n, double sigma)
-{
-    std::vector<float> buf(n*6 + 3);
-    float* g = &buf[0] + n;
-    float* xg = g + n*2 + 1;
-    float* xxg = xg + n*2 + 1;
-
-    if (sigma < FLT_EPSILON)
-        sigma = n*0.3;
-
-    double ig11, ig03, ig33, ig55;
-    prepareGaussian(n, sigma, g, xg, xxg, ig11, ig03, ig33, ig55);
-
-    device::optflow_farneback::setPolynomialExpansionConsts(n, g, xg, xxg, static_cast<float>(ig11), static_cast<float>(ig03), static_cast<float>(ig33), static_cast<float>(ig55));
-}
-
-
-void cv::cuda::FarnebackOpticalFlow::updateFlow_boxFilter(
-        const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
-        GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
-{
-    if (deviceSupports(FEATURE_SET_COMPUTE_12))
-        device::optflow_farneback::boxFilter5Gpu(M, blockSize/2, bufM, S(streams[0]));
-    else
-        device::optflow_farneback::boxFilter5Gpu_CC11(M, blockSize/2, bufM, S(streams[0]));
-    swap(M, bufM);
-
-    for (int i = 1; i < 5; ++i)
-        streams[i].waitForCompletion();
-    device::optflow_farneback::updateFlowGpu(M, flowx, flowy, S(streams[0]));
-
-    if (updateMatrices)
-        device::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, S(streams[0]));
-}
-
-
-void cv::cuda::FarnebackOpticalFlow::updateFlow_gaussianBlur(
-        const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
-        GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
-{
-    if (deviceSupports(FEATURE_SET_COMPUTE_12))
-        device::optflow_farneback::gaussianBlur5Gpu(
-                    M, blockSize/2, bufM, BORDER_REPLICATE, S(streams[0]));
-    else
-        device::optflow_farneback::gaussianBlur5Gpu_CC11(
-                    M, blockSize/2, bufM, BORDER_REPLICATE, S(streams[0]));
-    swap(M, bufM);
-
-    device::optflow_farneback::updateFlowGpu(M, flowx, flowy, S(streams[0]));
-
-    if (updateMatrices)
-        device::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, S(streams[0]));
-}
-
-
-void cv::cuda::FarnebackOpticalFlow::operator ()(
-        const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s)
-{
-    CV_Assert(frame0.channels() == 1 && frame1.channels() == 1);
-    CV_Assert(frame0.size() == frame1.size());
-    CV_Assert(polyN == 5 || polyN == 7);
-    CV_Assert(!fastPyramids || std::abs(pyrScale - 0.5) < 1e-6);
-
-    Stream streams[5];
-    if (S(s))
-        streams[0] = s;
-
-    Size size = frame0.size();
-    GpuMat prevFlowX, prevFlowY, curFlowX, curFlowY;
-
-    flowx.create(size, CV_32F);
-    flowy.create(size, CV_32F);
-    GpuMat flowx0 = flowx;
-    GpuMat flowy0 = flowy;
-
-    // Crop unnecessary levels
-    double scale = 1;
-    int numLevelsCropped = 0;
-    for (; numLevelsCropped < numLevels; numLevelsCropped++)
-    {
-        scale *= pyrScale;
-        if (size.width*scale < MIN_SIZE || size.height*scale < MIN_SIZE)
-            break;
-    }
-
-    frame0.convertTo(frames_[0], CV_32F, streams[0]);
-    frame1.convertTo(frames_[1], CV_32F, streams[1]);
-
-    if (fastPyramids)
-    {
-        // Build Gaussian pyramids using pyrDown()
-        pyramid0_.resize(numLevelsCropped + 1);
-        pyramid1_.resize(numLevelsCropped + 1);
-        pyramid0_[0] = frames_[0];
-        pyramid1_[0] = frames_[1];
-        for (int i = 1; i <= numLevelsCropped; ++i)
-        {
-            cuda::pyrDown(pyramid0_[i - 1], pyramid0_[i], streams[0]);
-            cuda::pyrDown(pyramid1_[i - 1], pyramid1_[i], streams[1]);
-        }
-    }
-
-    setPolynomialExpansionConsts(polyN, polySigma);
-    device::optflow_farneback::setUpdateMatricesConsts();
-
-    for (int k = numLevelsCropped; k >= 0; k--)
-    {
-        streams[0].waitForCompletion();
-
-        scale = 1;
-        for (int i = 0; i < k; i++)
-            scale *= pyrScale;
-
-        double sigma = (1./scale - 1) * 0.5;
-        int smoothSize = cvRound(sigma*5) | 1;
-        smoothSize = std::max(smoothSize, 3);
-
-        int width = cvRound(size.width*scale);
-        int height = cvRound(size.height*scale);
-
-        if (fastPyramids)
-        {
-            width = pyramid0_[k].cols;
-            height = pyramid0_[k].rows;
+            g[x] = (float)std::exp(-x*x/(2*sigma*sigma));
+            s += g[x];
        }

-        if (k > 0)
+        s = 1./s;
+        for (int x = -n; x <= n; x++)
        {
-            curFlowX.create(height, width, CV_32F);
-            curFlowY.create(height, width, CV_32F);
-        }
-        else
-        {
-            curFlowX = flowx0;
-            curFlowY = flowy0;
+            g[x] = (float)(g[x]*s);
+            xg[x] = (float)(x*g[x]);
+            xxg[x] = (float)(x*x*g[x]);
        }

-        if (!prevFlowX.data)
+        Mat_<double> G(6, 6);
+        G.setTo(0);
+
+        for (int y = -n; y <= n; y++)
        {
-            if (flags & OPTFLOW_USE_INITIAL_FLOW)
+            for (int x = -n; x <= n; x++)
            {
-                cuda::resize(flowx0, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
-                cuda::resize(flowy0, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
-                curFlowX.convertTo(curFlowX, curFlowX.depth(), scale, streams[0]);
-                curFlowY.convertTo(curFlowY, curFlowY.depth(), scale, streams[1]);
+                G(0,0) += g[y]*g[x];
+                G(1,1) += g[y]*g[x]*x*x;
+                G(3,3) += g[y]*g[x]*x*x*x*x;
+                G(5,5) += g[y]*g[x]*x*x*y*y;
+            }
+        }
+
+        //G[0][0] = 1.;
+        G(2,2) = G(0,3) = G(0,4) = G(3,0) = G(4,0) = G(1,1);
+        G(4,4) = G(3,3);
+        G(3,4) = G(4,3) = G(5,5);
+
+        // invG:
+        // [ x        e  e    ]
+        // [    y             ]
+        // [       y          ]
+        // [ e        z       ]
+        // [ e           z    ]
+        // [                u ]
+        Mat_<double> invG = G.inv(DECOMP_CHOLESKY);
+
+        ig11 = invG(1,1);
+        ig03 = invG(0,3);
+        ig33 = invG(3,3);
+        ig55 = invG(5,5);
+    }
+
+    void FarnebackOpticalFlowImpl::setPolynomialExpansionConsts(int n, double sigma)
+    {
+        std::vector<float> buf(n*6 + 3);
+        float* g = &buf[0] + n;
+        float* xg = g + n*2 + 1;
+        float* xxg = xg + n*2 + 1;
+
+        if (sigma < FLT_EPSILON)
+            sigma = n*0.3;
+
+        double ig11, ig03, ig33, ig55;
+        prepareGaussian(n, sigma, g, xg, xxg, ig11, ig03, ig33, ig55);
+
+        device::optflow_farneback::setPolynomialExpansionConsts(n, g, xg, xxg, static_cast<float>(ig11), static_cast<float>(ig03), static_cast<float>(ig33), static_cast<float>(ig55));
+    }
+
+    // One flow-update iteration that smooths with a box filter.
+    // The filter is launched on streams[0] with M and bufM, the two buffers
+    // are exchanged, and streams[1]..streams[4] are waited on before
+    // updateFlowGpu is launched. When updateMatrices is true,
+    // updateMatricesGpu is launched afterwards with the new flow, R0, R1 and M.
+    void FarnebackOpticalFlowImpl::updateFlow_boxFilter(
+            const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
+            GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
+    {
+        const int radius = blockSize/2;
+
+        // Use the _CC11 variant when FEATURE_SET_COMPUTE_12 is not supported.
+        if (!deviceSupports(FEATURE_SET_COMPUTE_12))
+            device::optflow_farneback::boxFilter5Gpu_CC11(M, radius, bufM, StreamAccessor::getStream(streams[0]));
+        else
+            device::optflow_farneback::boxFilter5Gpu(M, radius, bufM, StreamAccessor::getStream(streams[0]));
+
+        // From here on M names the buffer that was passed to the filter as bufM.
+        swap(M, bufM);
+
+        // Wait for every stream except streams[0].
+        int s = 1;
+        while (s < 5)
+            streams[s++].waitForCompletion();
+
+        device::optflow_farneback::updateFlowGpu(M, flowx, flowy, StreamAccessor::getStream(streams[0]));
+
+        if (!updateMatrices)
+            return;
+
+        device::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, StreamAccessor::getStream(streams[0]));
+    }
+
+    // One flow-update iteration that smooths with a Gaussian blur.
+    // The blur is launched on streams[0] with M, bufM and BORDER_REPLICATE,
+    // the two buffers are exchanged, and updateFlowGpu is launched on M.
+    // When updateMatrices is true, updateMatricesGpu is launched afterwards
+    // with the new flow, R0, R1 and M. Only streams[0] is used here.
+    void FarnebackOpticalFlowImpl::updateFlow_gaussianBlur(
+            const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
+            GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
+    {
+        const int radius = blockSize/2;
+
+        // Use the _CC11 variant when FEATURE_SET_COMPUTE_12 is not supported.
+        if (!deviceSupports(FEATURE_SET_COMPUTE_12))
+        {
+            device::optflow_farneback::gaussianBlur5Gpu_CC11(
+                        M, radius, bufM, BORDER_REPLICATE, StreamAccessor::getStream(streams[0]));
+        }
+        else
+        {
+            device::optflow_farneback::gaussianBlur5Gpu(
+                        M, radius, bufM, BORDER_REPLICATE, StreamAccessor::getStream(streams[0]));
+        }
+
+        // From here on M names the buffer that was passed to the blur as bufM.
+        swap(M, bufM);
+
+        device::optflow_farneback::updateFlowGpu(M, flowx, flowy, StreamAccessor::getStream(streams[0]));
+
+        if (!updateMatrices)
+            return;
+
+        device::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, StreamAccessor::getStream(streams[0]));
+    }
+
+    void FarnebackOpticalFlowImpl::calcImpl(const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &stream)
+    {
+        CV_Assert(frame0.channels() == 1 && frame1.channels() == 1);
+        CV_Assert(frame0.size() == frame1.size());
+        CV_Assert(polyN_ == 5 || polyN_ == 7);
+        CV_Assert(!fastPyramids_ || std::abs(pyrScale_ - 0.5) < 1e-6);
+
+        Stream streams[5];
+        if (stream)
+            streams[0] = stream;
+
+        Size size = frame0.size();
+        GpuMat prevFlowX, prevFlowY, curFlowX, curFlowY;
+
+        flowx.create(size, CV_32F);
+        flowy.create(size, CV_32F);
+        GpuMat flowx0 = flowx;
+        GpuMat flowy0 = flowy;
+
+        // Crop unnecessary levels
+        double scale = 1;
+        int numLevelsCropped = 0;
+        for (; numLevelsCropped < numLevels_; numLevelsCropped++)
+        {
+            scale *= pyrScale_;
+            if (size.width*scale < MIN_SIZE || size.height*scale < MIN_SIZE)
+                break;
+        }
+
+        frame0.convertTo(frames_[0], CV_32F, streams[0]);
+        frame1.convertTo(frames_[1], CV_32F, streams[1]);
+
+        if (fastPyramids_)
+        {
+            // Build Gaussian pyramids using pyrDown()
+            pyramid0_.resize(numLevelsCropped + 1);
+            pyramid1_.resize(numLevelsCropped + 1);
+            pyramid0_[0] = frames_[0];
+            pyramid1_[0] = frames_[1];
+            for (int i = 1; i <= numLevelsCropped; ++i)
+            {
+                cuda::pyrDown(pyramid0_[i - 1], pyramid0_[i], streams[0]);
+                cuda::pyrDown(pyramid1_[i - 1], pyramid1_[i], streams[1]);
+            }
+        }
+
+        setPolynomialExpansionConsts(polyN_, polySigma_);
+        device::optflow_farneback::setUpdateMatricesConsts();
+
+        for (int k = numLevelsCropped; k >= 0; k--)
+        {
+            streams[0].waitForCompletion();
+
+            scale = 1;
+            for (int i = 0; i < k; i++)
+                scale *= pyrScale_;
+
+            double sigma = (1./scale - 1) * 0.5;
+            int smoothSize = cvRound(sigma*5) | 1;
+            smoothSize = std::max(smoothSize, 3);
+
+            int width = cvRound(size.width*scale);
+            int height = cvRound(size.height*scale);
+
+            if (fastPyramids_)
+            {
+                width = pyramid0_[k].cols;
+                height = pyramid0_[k].rows;
+            }
+
+            if (k > 0)
+            {
+                curFlowX.create(height, width, CV_32F);
+                curFlowY.create(height, width, CV_32F);
            }
            else
            {
-                curFlowX.setTo(0, streams[0]);
-                curFlowY.setTo(0, streams[1]);
+                curFlowX = flowx0;
+                curFlowY = flowy0;
            }
-        }
-        else
-        {
-            cuda::resize(prevFlowX, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
-            cuda::resize(prevFlowY, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
-            curFlowX.convertTo(curFlowX, curFlowX.depth(), 1./pyrScale, streams[0]);
-            curFlowY.convertTo(curFlowY, curFlowY.depth(), 1./pyrScale, streams[1]);
-        }

-        GpuMat M = allocMatFromBuf(5*height, width, CV_32F, M_);
-        GpuMat bufM = allocMatFromBuf(5*height, width, CV_32F, bufM_);
-        GpuMat R[2] =
-        {
-            allocMatFromBuf(5*height, width, CV_32F, R_[0]),
-            allocMatFromBuf(5*height, width, CV_32F, R_[1])
-        };
-
-        if (fastPyramids)
-        {
-            device::optflow_farneback::polynomialExpansionGpu(pyramid0_[k], polyN, R[0], S(streams[0]));
-            device::optflow_farneback::polynomialExpansionGpu(pyramid1_[k], polyN, R[1], S(streams[1]));
-        }
-        else
-        {
-            GpuMat blurredFrame[2] =
+            if (!prevFlowX.data)
            {
-                allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[0]),
-                allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[1])
-            };
-            GpuMat pyrLevel[2] =
-            {
-                allocMatFromBuf(height, width, CV_32F, pyrLevel_[0]),
-                allocMatFromBuf(height, width, CV_32F, pyrLevel_[1])
-            };
-
-            Mat g = getGaussianKernel(smoothSize, sigma, CV_32F);
-            device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(smoothSize/2), smoothSize/2);
-
-            for (int i = 0; i < 2; i++)
-            {
-                device::optflow_farneback::gaussianBlurGpu(
-                        frames_[i], smoothSize/2, blurredFrame[i], BORDER_REFLECT101, S(streams[i]));
-                cuda::resize(blurredFrame[i], pyrLevel[i], Size(width, height), 0.0, 0.0, INTER_LINEAR, streams[i]);
-                device::optflow_farneback::polynomialExpansionGpu(pyrLevel[i], polyN, R[i], S(streams[i]));
+                if (flags_ & OPTFLOW_USE_INITIAL_FLOW)
+                {
+                    cuda::resize(flowx0, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
+                    cuda::resize(flowy0, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
+                    curFlowX.convertTo(curFlowX, curFlowX.depth(), scale, streams[0]);
+                    curFlowY.convertTo(curFlowY, curFlowY.depth(), scale, streams[1]);
+                }
+                else
+                {
+                    curFlowX.setTo(0, streams[0]);
+                    curFlowY.setTo(0, streams[1]);
+                }
            }
-        }
-
-        streams[1].waitForCompletion();
-        device::optflow_farneback::updateMatricesGpu(curFlowX, curFlowY, R[0], R[1], M, S(streams[0]));
-
-        if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
-        {
-            Mat g = getGaussianKernel(winSize, winSize/2*0.3f, CV_32F);
-            device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(winSize/2), winSize/2);
-        }
-        for (int i = 0; i < numIters; i++)
-        {
-            if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
-                updateFlow_gaussianBlur(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams);
            else
-                updateFlow_boxFilter(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams);
+            {
+                cuda::resize(prevFlowX, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
+                cuda::resize(prevFlowY, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
+                curFlowX.convertTo(curFlowX, curFlowX.depth(), 1./pyrScale_, streams[0]);
+                curFlowY.convertTo(curFlowY, curFlowY.depth(), 1./pyrScale_, streams[1]);
+            }
+
+            GpuMat M = allocMatFromBuf(5*height, width, CV_32F, M_);
+            GpuMat bufM = allocMatFromBuf(5*height, width, CV_32F, bufM_);
+            GpuMat R[2] =
+            {
+                allocMatFromBuf(5*height, width, CV_32F, R_[0]),
+                allocMatFromBuf(5*height, width, CV_32F, R_[1])
+            };
+
+            if (fastPyramids_)
+            {
+                device::optflow_farneback::polynomialExpansionGpu(pyramid0_[k], polyN_, R[0], StreamAccessor::getStream(streams[0]));
+                device::optflow_farneback::polynomialExpansionGpu(pyramid1_[k], polyN_, R[1], StreamAccessor::getStream(streams[1]));
+            }
+            else
+            {
+                GpuMat blurredFrame[2] =
+                {
+                    allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[0]),
+                    allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[1])
+                };
+                GpuMat pyrLevel[2] =
+                {
+                    allocMatFromBuf(height, width, CV_32F, pyrLevel_[0]),
+                    allocMatFromBuf(height, width, CV_32F, pyrLevel_[1])
+                };
+
+                Mat g = getGaussianKernel(smoothSize, sigma, CV_32F);
+                device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(smoothSize/2), smoothSize/2);
+
+                for (int i = 0; i < 2; i++)
+                {
+                    device::optflow_farneback::gaussianBlurGpu(
+                            frames_[i], smoothSize/2, blurredFrame[i], BORDER_REFLECT101, StreamAccessor::getStream(streams[i]));
+                    cuda::resize(blurredFrame[i], pyrLevel[i], Size(width, height), 0.0, 0.0, INTER_LINEAR, streams[i]);
+                    device::optflow_farneback::polynomialExpansionGpu(pyrLevel[i], polyN_, R[i], StreamAccessor::getStream(streams[i]));
+                }
+            }
+
+            streams[1].waitForCompletion();
+            device::optflow_farneback::updateMatricesGpu(curFlowX, curFlowY, R[0], R[1], M, StreamAccessor::getStream(streams[0]));
+
+            if (flags_ & OPTFLOW_FARNEBACK_GAUSSIAN)
+            {
+                Mat g = getGaussianKernel(winSize_, winSize_/2*0.3f, CV_32F);
+                device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(winSize_/2), winSize_/2);
+            }
+            for (int i = 0; i < numIters_; i++)
+            {
+                if (flags_ & OPTFLOW_FARNEBACK_GAUSSIAN)
+                    updateFlow_gaussianBlur(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize_, i < numIters_-1, streams);
+                else
+                    updateFlow_boxFilter(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize_, i < numIters_-1, streams);
+            }
+
+            prevFlowX = curFlowX;
+            prevFlowY = curFlowY;
        }

-        prevFlowX = curFlowX;
-        prevFlowY = curFlowY;
+        flowx = curFlowX;
+        flowy = curFlowY;
+
+        if (!stream)
+            streams[0].waitForCompletion();
    }
+}

-    flowx = curFlowX;
-    flowy = curFlowY;
-
-    if (!S(s))
-        streams[0].waitForCompletion();
+// Factory for the Farneback optical flow algorithm: forwards every argument
+// unchanged to the FarnebackOpticalFlowImpl constructor and returns the new
+// object through the public interface pointer.
+Ptr<FarnebackOpticalFlow> cv::cuda::FarnebackOpticalFlow::create(int numLevels, double pyrScale, bool fastPyramids, int winSize,
+                                                                 int numIters, int polyN, double polySigma, int flags)
+{
+    Ptr<FarnebackOpticalFlowImpl> impl =
+        makePtr<FarnebackOpticalFlowImpl>(numLevels, pyrScale, fastPyramids, winSize,
+                                          numIters, polyN, polySigma, flags);
+    return impl;
 }

 #endif
--- a/modules/cudaoptflow/src/pyrlk.cpp
+++ b/modules/cudaoptflow/src/pyrlk.cpp
@ -47,37 +47,54 @@ using namespace cv::cuda;

 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)

-cv::cuda::PyrLKOpticalFlow::PyrLKOpticalFlow() { throw_no_cuda(); }
-void cv::cuda::PyrLKOpticalFlow::sparse(const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat*) { throw_no_cuda(); }
-void cv::cuda::PyrLKOpticalFlow::dense(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat*) { throw_no_cuda(); }
-void cv::cuda::PyrLKOpticalFlow::releaseMemory() {}
+Ptr<SparsePyrLKOpticalFlow> cv::cuda::SparsePyrLKOpticalFlow::create(Size, int, int, bool) { throw_no_cuda(); return Ptr<SparsePyrLKOpticalFlow>(); } // no-CUDA stub: calls throw_no_cuda(); the empty Ptr is returned only if that call returns
+
+Ptr<DensePyrLKOpticalFlow> cv::cuda::DensePyrLKOpticalFlow::create(Size, int, int, bool) { throw_no_cuda(); return Ptr<DensePyrLKOpticalFlow>(); } // no-CUDA stub: calls throw_no_cuda(); the empty Ptr (of the declared return type) is returned only if that call returns

 #else /* !defined (HAVE_CUDA) */

 namespace pyrlk
 {
-    void loadConstants(int2 winSize, int iters);
+    void loadConstants(int2 winSize, int iters, cudaStream_t stream);

    void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                 int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
+                 int level, dim3 block, dim3 patch, cudaStream_t stream);
    void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                 int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
+                 int level, dim3 block, dim3 patch, cudaStream_t stream);

    void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
-               PtrStepSzf err, int2 winSize, cudaStream_t stream = 0);
-}
-
-cv::cuda::PyrLKOpticalFlow::PyrLKOpticalFlow()
-{
-    winSize = Size(21, 21);
-    maxLevel = 3;
-    iters = 30;
-    useInitialFlow = false;
+               PtrStepSzf err, int2 winSize, cudaStream_t stream);
 }

 namespace
 {
-    void calcPatchSize(cv::Size winSize, dim3& block, dim3& patch)
+    // Parameters, pyramid storage and the sparse/dense routines shared by the
+    // two pyramidal Lucas-Kanade implementation classes in this file.
+    class PyrLKOpticalFlowBase
+    {
+    public:
+        // Stores the four algorithm parameters; no other work is done here.
+        PyrLKOpticalFlowBase(Size winSize, int maxLevel, int iters, bool useInitialFlow) :
+            winSize_(winSize), maxLevel_(maxLevel), iters_(iters), useInitialFlow_(useInitialFlow)
+        {
+        }
+
+        // Sparse variant: works on the point set prevPts and fills nextPts and
+        // status; err may be null.
+        void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
+            GpuMat& status, GpuMat* err, Stream& stream);
+
+        // Dense variant: writes the two flow components to u and v.
+        void dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, Stream& stream);
+
+    protected:
+        // Parameters exposed to derived classes for their getters and setters.
+        Size winSize_;
+        int maxLevel_;
+        int iters_;
+        bool useInitialFlow_;
+
+    private:
+        // Image pyramids of the previous and next frame; resized to
+        // maxLevel_ + 1 entries by sparse() and dense().
+        std::vector<GpuMat> prevPyr_;
+        std::vector<GpuMat> nextPyr_;
+    };
+
+    void calcPatchSize(Size winSize, dim3& block, dim3& patch)
    {
        if (winSize.width > 32 && winSize.width > 2 * winSize.height)
        {
@ -95,156 +112,239 @@ namespace

        block.z = patch.z = 1;
    }
-}

-void cv::cuda::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err)
-{
-    if (prevPts.empty())
+    void PyrLKOpticalFlowBase::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err, Stream& stream)
    {
-        nextPts.release();
-        status.release();
-        if (err) err->release();
-        return;
-    }
-
-    dim3 block, patch;
-    calcPatchSize(winSize, block, patch);
-
-    CV_Assert(prevImg.channels() == 1 || prevImg.channels() == 3 || prevImg.channels() == 4);
-    CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
-    CV_Assert(maxLevel >= 0);
-    CV_Assert(winSize.width > 2 && winSize.height > 2);
-    CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6);
-    CV_Assert(prevPts.rows == 1 && prevPts.type() == CV_32FC2);
-
-    if (useInitialFlow)
-        CV_Assert(nextPts.size() == prevPts.size() && nextPts.type() == CV_32FC2);
-    else
-        ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts);
-
-    GpuMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
-    GpuMat temp2 = nextPts.reshape(1);
-    cuda::multiply(temp1, Scalar::all(1.0 / (1 << maxLevel) / 2.0), temp2);
-
-    ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
-    status.setTo(Scalar::all(1));
-
-    if (err)
-        ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);
-
-    // build the image pyramids.
-
-    prevPyr_.resize(maxLevel + 1);
-    nextPyr_.resize(maxLevel + 1);
-
-    int cn = prevImg.channels();
-
-    if (cn == 1 || cn == 4)
-    {
-        prevImg.convertTo(prevPyr_[0], CV_32F);
-        nextImg.convertTo(nextPyr_[0], CV_32F);
-    }
-    else
-    {
-        cuda::cvtColor(prevImg, buf_, COLOR_BGR2BGRA);
-        buf_.convertTo(prevPyr_[0], CV_32F);
-
-        cuda::cvtColor(nextImg, buf_, COLOR_BGR2BGRA);
-        buf_.convertTo(nextPyr_[0], CV_32F);
-    }
-
-    for (int level = 1; level <= maxLevel; ++level)
-    {
-        cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level]);
-        cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level]);
-    }
-
-    pyrlk::loadConstants(make_int2(winSize.width, winSize.height), iters);
-
-    for (int level = maxLevel; level >= 0; level--)
-    {
-        if (cn == 1)
+        if (prevPts.empty())
        {
-            pyrlk::sparse1(prevPyr_[level], nextPyr_[level],
-                prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
-                level, block, patch);
+            nextPts.release();
+            status.release();
+            if (err) err->release();
+            return;
+        }
+
+        dim3 block, patch;
+        calcPatchSize(winSize_, block, patch);
+
+        CV_Assert( prevImg.channels() == 1 || prevImg.channels() == 3 || prevImg.channels() == 4 );
+        CV_Assert( prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type() );
+        CV_Assert( maxLevel_ >= 0 );
+        CV_Assert( winSize_.width > 2 && winSize_.height > 2 );
+        CV_Assert( patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6 );
+        CV_Assert( prevPts.rows == 1 && prevPts.type() == CV_32FC2 );
+
+        if (useInitialFlow_)
+            CV_Assert( nextPts.size() == prevPts.size() && nextPts.type() == prevPts.type() );
+        else
+            ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts);
+
+        GpuMat temp1 = (useInitialFlow_ ? nextPts : prevPts).reshape(1);
+        GpuMat temp2 = nextPts.reshape(1);
+        cuda::multiply(temp1, Scalar::all(1.0 / (1 << maxLevel_) / 2.0), temp2, 1, -1, stream);
+
+        ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
+        status.setTo(Scalar::all(1), stream);
+
+        if (err)
+            ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);
+
+        // build the image pyramids.
+
+        BufferPool pool(stream);
+
+        prevPyr_.resize(maxLevel_ + 1);
+        nextPyr_.resize(maxLevel_ + 1);
+
+        int cn = prevImg.channels();
+
+        if (cn == 1 || cn == 4)
+        {
+            prevImg.convertTo(prevPyr_[0], CV_32F, stream);
+            nextImg.convertTo(nextPyr_[0], CV_32F, stream);
        }
        else
        {
-            pyrlk::sparse4(prevPyr_[level], nextPyr_[level],
-                prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
-                level, block, patch);
+            GpuMat buf = pool.getBuffer(prevImg.size(), CV_MAKE_TYPE(prevImg.depth(), 4));
+
+            cuda::cvtColor(prevImg, buf, COLOR_BGR2BGRA, 0, stream);
+            buf.convertTo(prevPyr_[0], CV_32F, stream);
+
+            cuda::cvtColor(nextImg, buf, COLOR_BGR2BGRA, 0, stream);
+            buf.convertTo(nextPyr_[0], CV_32F, stream);
+        }
+
+        for (int level = 1; level <= maxLevel_; ++level)
+        {
+            cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level], stream);
+            cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level], stream);
+        }
+
+        pyrlk::loadConstants(make_int2(winSize_.width, winSize_.height), iters_, StreamAccessor::getStream(stream));
+
+        for (int level = maxLevel_; level >= 0; level--)
+        {
+            if (cn == 1)
+            {
+                pyrlk::sparse1(prevPyr_[level], nextPyr_[level],
+                               prevPts.ptr<float2>(), nextPts.ptr<float2>(),
+                               status.ptr(),
+                               level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
+                               level, block, patch,
+                               StreamAccessor::getStream(stream));
+            }
+            else
+            {
+                pyrlk::sparse4(prevPyr_[level], nextPyr_[level],
+                               prevPts.ptr<float2>(), nextPts.ptr<float2>(),
+                               status.ptr(),
+                               level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
+                               level, block, patch,
+                               StreamAccessor::getStream(stream));
+            }
        }
    }
-}

-void cv::cuda::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err)
-{
-    CV_Assert(prevImg.type() == CV_8UC1);
-    CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
-    CV_Assert(maxLevel >= 0);
-    CV_Assert(winSize.width > 2 && winSize.height > 2);
-
-    if (err)
-        err->create(prevImg.size(), CV_32FC1);
-
-    // build the image pyramids.
-
-    prevPyr_.resize(maxLevel + 1);
-    nextPyr_.resize(maxLevel + 1);
-
-    prevPyr_[0] = prevImg;
-    nextImg.convertTo(nextPyr_[0], CV_32F);
-
-    for (int level = 1; level <= maxLevel; ++level)
+    void PyrLKOpticalFlowBase::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, Stream& stream) // dense pyramidal LK on two CV_8UC1 images of equal size; the result is copied into u and v
     {
-        cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level]);
-        cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level]);
+        CV_Assert( prevImg.type() == CV_8UC1 );
+        CV_Assert( prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type() );
+        CV_Assert( maxLevel_ >= 0 );
+        CV_Assert( winSize_.width > 2 && winSize_.height > 2 );
+
+        // Build the image pyramids with maxLevel_ + 1 levels each; every level above 0 is pyrDown of the level below it.
+
+        prevPyr_.resize(maxLevel_ + 1);
+        nextPyr_.resize(maxLevel_ + 1);
+        // Level 0: prevImg is stored without conversion, nextImg is converted to CV_32F.
+        prevPyr_[0] = prevImg;
+        nextImg.convertTo(nextPyr_[0], CV_32F, stream);
+
+        for (int level = 1; level <= maxLevel_; ++level)
+        {
+            cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level], stream);
+            cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level], stream);
+        }
+        // Two u/v buffer pairs, each of the full input size and set to zero; the level loop alternates between them.
+        BufferPool pool(stream);
+
+        GpuMat uPyr[] = {
+            pool.getBuffer(prevImg.size(), CV_32FC1),
+            pool.getBuffer(prevImg.size(), CV_32FC1),
+        };
+        GpuMat vPyr[] = {
+            pool.getBuffer(prevImg.size(), CV_32FC1),
+            pool.getBuffer(prevImg.size(), CV_32FC1),
+        };
+
+        uPyr[0].setTo(Scalar::all(0), stream);
+        vPyr[0].setTo(Scalar::all(0), stream);
+        uPyr[1].setTo(Scalar::all(0), stream);
+        vPyr[1].setTo(Scalar::all(0), stream);
+        // Window size and iteration count are handed to pyrlk::loadConstants once, before the level loop.
+        int2 winSize2i = make_int2(winSize_.width, winSize_.height);
+        pyrlk::loadConstants(winSize2i, iters_, StreamAccessor::getStream(stream));
+
+        int idx = 0;
+        // Walk the pyramid from level maxLevel_ down to 0; idx selects the pair passed first to pyrlk::dense, idx2 the other pair.
+        for (int level = maxLevel_; level >= 0; level--)
+        {
+            int idx2 = (idx + 1) & 1;
+
+            pyrlk::dense(prevPyr_[level], nextPyr_[level],
+                         uPyr[idx], vPyr[idx], uPyr[idx2], vPyr[idx2],
+                         PtrStepSzf(), winSize2i,
+                         StreamAccessor::getStream(stream));
+            // The roles swap after every level except level 0, so idx still names the pair passed first at level 0.
+            if (level > 0)
+                idx = idx2;
+        }
+        // Copy the pair selected by idx into the caller's outputs.
+        uPyr[idx].copyTo(u, stream);
+        vPyr[idx].copyTo(v, stream);
     }

-    ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]);
-    ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]);
-    ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]);
-    ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]);
-    uPyr_[0].setTo(Scalar::all(0));
-    vPyr_[0].setTo(Scalar::all(0));
-    uPyr_[1].setTo(Scalar::all(0));
-    vPyr_[1].setTo(Scalar::all(0));
-
-    int2 winSize2i = make_int2(winSize.width, winSize.height);
-    pyrlk::loadConstants(winSize2i, iters);
-
-    PtrStepSzf derr = err ? *err : PtrStepSzf();
-
-    int idx = 0;
-
-    for (int level = maxLevel; level >= 0; level--)
+    class SparsePyrLKOpticalFlowImpl : public SparsePyrLKOpticalFlow, private PyrLKOpticalFlowBase // public SparsePyrLKOpticalFlow interface backed by the privately inherited PyrLKOpticalFlowBase
     {
-        int idx2 = (idx + 1) & 1;
+    public:
+        SparsePyrLKOpticalFlowImpl(Size winSize, int maxLevel, int iters, bool useInitialFlow) :
+            PyrLKOpticalFlowBase(winSize, maxLevel, iters, useInitialFlow)
+        {
+        }

-        pyrlk::dense(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],
-            level == 0 ? derr : PtrStepSzf(), winSize2i);
+        virtual Size getWinSize() const { return winSize_; }
+        virtual void setWinSize(Size winSize) { winSize_ = winSize; }

-        if (level > 0)
-            idx = idx2;
-    }
+        virtual int getMaxLevel() const { return maxLevel_; }
+        virtual void setMaxLevel(int maxLevel) { maxLevel_ = maxLevel; }

-    uPyr_[idx].copyTo(u);
-    vPyr_[idx].copyTo(v);
+        virtual int getNumIters() const { return iters_; }
+        virtual void setNumIters(int iters) { iters_ = iters; }
+        // Each accessor in this class reads or writes the matching protected field of PyrLKOpticalFlowBase.
+        virtual bool getUseInitialFlow() const { return useInitialFlow_; }
+        virtual void setUseInitialFlow(bool useInitialFlow) { useInitialFlow_ = useInitialFlow; }
+        // calc unwraps the array arguments to GpuMat and delegates to PyrLKOpticalFlowBase::sparse; err is passed as NULL unless _err.needed().
+        virtual void calc(InputArray _prevImg, InputArray _nextImg,
+                          InputArray _prevPts, InputOutputArray _nextPts,
+                          OutputArray _status,
+                          OutputArray _err,
+                          Stream& stream)
+        {
+            const GpuMat prevImg = _prevImg.getGpuMat();
+            const GpuMat nextImg = _nextImg.getGpuMat();
+            const GpuMat prevPts = _prevPts.getGpuMat();
+            GpuMat& nextPts = _nextPts.getGpuMatRef();
+            GpuMat& status = _status.getGpuMatRef();
+            GpuMat* err = _err.needed() ? &(_err.getGpuMatRef()) : NULL;
+
+            sparse(prevImg, nextImg, prevPts, nextPts, status, err, stream);
+        }
+    };
+
+    // Dense pyramidal LK implementation: exposes the DensePyrLKOpticalFlow
+    // interface and reuses the privately inherited PyrLKOpticalFlowBase for
+    // parameter storage and the computation itself.
+    class DensePyrLKOpticalFlowImpl : public DensePyrLKOpticalFlow, private PyrLKOpticalFlowBase
+    {
+    public:
+        DensePyrLKOpticalFlowImpl(Size winSize, int maxLevel, int iters, bool useInitialFlow) :
+            PyrLKOpticalFlowBase(winSize, maxLevel, iters, useInitialFlow)
+        {
+        }
+
+        // Parameter accessors; each reads or writes the matching base-class field.
+        virtual Size getWinSize() const { return winSize_; }
+        virtual void setWinSize(Size value) { winSize_ = value; }
+
+        virtual int getMaxLevel() const { return maxLevel_; }
+        virtual void setMaxLevel(int value) { maxLevel_ = value; }
+
+        virtual int getNumIters() const { return iters_; }
+        virtual void setNumIters(int value) { iters_ = value; }
+
+        virtual bool getUseInitialFlow() const { return useInitialFlow_; }
+        virtual void setUseInitialFlow(bool value) { useInitialFlow_ = value; }
+
+        // Runs the dense routine on the two images and writes the result to
+        // _flow as a two-channel array (first channel u, second channel v).
+        virtual void calc(InputArray _prevImg, InputArray _nextImg, InputOutputArray _flow, Stream& stream)
+        {
+            const GpuMat prevImg = _prevImg.getGpuMat();
+            const GpuMat nextImg = _nextImg.getGpuMat();
+            const Size flowSize = prevImg.size();
+
+            // Temporary single-channel planes obtained from a BufferPool bound to the stream.
+            BufferPool pool(stream);
+            GpuMat flowU = pool.getBuffer(flowSize, CV_32FC1);
+            GpuMat flowV = pool.getBuffer(flowSize, CV_32FC1);
+
+            dense(prevImg, nextImg, flowU, flowV, stream);
+
+            // Combine the two planes into the caller's output array.
+            GpuMat planes[2] = { flowU, flowV };
+            cuda::merge(planes, 2, _flow, stream);
+        }
+    };
 }

-void cv::cuda::PyrLKOpticalFlow::releaseMemory()
+Ptr<SparsePyrLKOpticalFlow> cv::cuda::SparsePyrLKOpticalFlow::create(Size winSize, int maxLevel, int iters, bool useInitialFlow)
 {
-    prevPyr_.clear();
-    nextPyr_.clear();
+    return makePtr<SparsePyrLKOpticalFlowImpl>(winSize, maxLevel, iters, useInitialFlow); // factory: forwards all four arguments unchanged to the implementation's constructor
+}

-    buf_.release();
-
-    uPyr_[0].release();
-    vPyr_[0].release();
-
-    uPyr_[1].release();
-    vPyr_[1].release();
+// Factory for the dense pyramidal LK algorithm: forwards all four arguments
+// unchanged to the DensePyrLKOpticalFlowImpl constructor and returns the new
+// object through the public interface pointer.
+Ptr<DensePyrLKOpticalFlow> cv::cuda::DensePyrLKOpticalFlow::create(Size winSize, int maxLevel, int iters, bool useInitialFlow)
+{
+    Ptr<DensePyrLKOpticalFlowImpl> impl =
+        makePtr<DensePyrLKOpticalFlowImpl>(winSize, maxLevel, iters, useInitialFlow);
+    return impl;
 }

 #endif /* !defined (HAVE_CUDA) */
--- a/modules/cudaoptflow/src/tvl1flow.cpp
+++ b/modules/cudaoptflow/src/tvl1flow.cpp
@ -44,257 +44,338 @@

 #if !defined HAVE_CUDA || defined(CUDA_DISABLER)

-cv::cuda::OpticalFlowDual_TVL1_CUDA::OpticalFlowDual_TVL1_CUDA() { throw_no_cuda(); }
-void cv::cuda::OpticalFlowDual_TVL1_CUDA::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
-void cv::cuda::OpticalFlowDual_TVL1_CUDA::collectGarbage() {}
-void cv::cuda::OpticalFlowDual_TVL1_CUDA::procOneScale(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
+// Stub for builds without CUDA: reports the missing support through throw_no_cuda();
+// the trailing return only gives the function a value on every path.
+// The return type is spelled with full qualification because the using-directives for
+// cv and cv::cuda appear only in the CUDA-enabled branch below (the parameter list and
+// body are looked up in class scope, so they can stay unqualified).
+cv::Ptr<cv::cuda::OpticalFlowDual_TVL1> cv::cuda::OpticalFlowDual_TVL1::create(double, double, double, int, int, double, int, double, double, bool) { throw_no_cuda(); return Ptr<OpticalFlowDual_TVL1>(); }

 #else

 using namespace cv;
 using namespace cv::cuda;

-cv::cuda::OpticalFlowDual_TVL1_CUDA::OpticalFlowDual_TVL1_CUDA()
-{
-    tau            = 0.25;
-    lambda         = 0.15;
-    theta          = 0.3;
-    nscales        = 5;
-    warps          = 5;
-    epsilon        = 0.01;
-    iterations     = 300;
-    scaleStep      = 0.8;
-    gamma           = 0.0;
-    useInitialFlow = false;
-}
-
-void cv::cuda::OpticalFlowDual_TVL1_CUDA::operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy)
-{
-    CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 );
-    CV_Assert( I0.size() == I1.size() );
-    CV_Assert( I0.type() == I1.type() );
-    CV_Assert( !useInitialFlow || (flowx.size() == I0.size() && flowx.type() == CV_32FC1 && flowy.size() == flowx.size() && flowy.type() == flowx.type()) );
-    CV_Assert( nscales > 0 );
-
-    // allocate memory for the pyramid structure
-    I0s.resize(nscales);
-    I1s.resize(nscales);
-    u1s.resize(nscales);
-    u2s.resize(nscales);
-    u3s.resize(nscales);
-
-    I0.convertTo(I0s[0], CV_32F, I0.depth() == CV_8U ? 1.0 : 255.0);
-    I1.convertTo(I1s[0], CV_32F, I1.depth() == CV_8U ? 1.0 : 255.0);
-
-    if (!useInitialFlow)
-    {
-        flowx.create(I0.size(), CV_32FC1);
-        flowy.create(I0.size(), CV_32FC1);
-    }
-
-    u1s[0] = flowx;
-    u2s[0] = flowy;
-    if (gamma)
-        u3s[0].create(I0.size(), CV_32FC1);
-
-    I1x_buf.create(I0.size(), CV_32FC1);
-    I1y_buf.create(I0.size(), CV_32FC1);
-
-    I1w_buf.create(I0.size(), CV_32FC1);
-    I1wx_buf.create(I0.size(), CV_32FC1);
-    I1wy_buf.create(I0.size(), CV_32FC1);
-
-    grad_buf.create(I0.size(), CV_32FC1);
-    rho_c_buf.create(I0.size(), CV_32FC1);
-
-    p11_buf.create(I0.size(), CV_32FC1);
-    p12_buf.create(I0.size(), CV_32FC1);
-    p21_buf.create(I0.size(), CV_32FC1);
-    p22_buf.create(I0.size(), CV_32FC1);
-    if (gamma)
-    {
-        p31_buf.create(I0.size(), CV_32FC1);
-        p32_buf.create(I0.size(), CV_32FC1);
-    }
-    diff_buf.create(I0.size(), CV_32FC1);
-
-    // create the scales
-    for (int s = 1; s < nscales; ++s)
-    {
-        cuda::resize(I0s[s-1], I0s[s], Size(), scaleStep, scaleStep);
-        cuda::resize(I1s[s-1], I1s[s], Size(), scaleStep, scaleStep);
-
-        if (I0s[s].cols < 16 || I0s[s].rows < 16)
-        {
-            nscales = s;
-            break;
-        }
-
-        if (useInitialFlow)
-        {
-            cuda::resize(u1s[s-1], u1s[s], Size(), scaleStep, scaleStep);
-            cuda::resize(u2s[s-1], u2s[s], Size(), scaleStep, scaleStep);
-
-            cuda::multiply(u1s[s], Scalar::all(scaleStep), u1s[s]);
-            cuda::multiply(u2s[s], Scalar::all(scaleStep), u2s[s]);
-        }
-        else
-        {
-            u1s[s].create(I0s[s].size(), CV_32FC1);
-            u2s[s].create(I0s[s].size(), CV_32FC1);
-        }
-        if (gamma)
-            u3s[s].create(I0s[s].size(), CV_32FC1);
-    }
-
-    if (!useInitialFlow)
-    {
-        u1s[nscales-1].setTo(Scalar::all(0));
-        u2s[nscales-1].setTo(Scalar::all(0));
-    }
-    if (gamma)
-        u3s[nscales - 1].setTo(Scalar::all(0));
-
-    // pyramidal structure for computing the optical flow
-    for (int s = nscales - 1; s >= 0; --s)
-    {
-        // compute the optical flow at the current scale
-        procOneScale(I0s[s], I1s[s], u1s[s], u2s[s], u3s[s]);
-
-        // if this was the last scale, finish now
-        if (s == 0)
-            break;
-
-        // otherwise, upsample the optical flow
-
-        // zoom the optical flow for the next finer scale
-        cuda::resize(u1s[s], u1s[s - 1], I0s[s - 1].size());
-        cuda::resize(u2s[s], u2s[s - 1], I0s[s - 1].size());
-        if (gamma)
-            cuda::resize(u3s[s], u3s[s - 1], I0s[s - 1].size());
-
-        // scale the optical flow with the appropriate zoom factor
-        cuda::multiply(u1s[s - 1], Scalar::all(1/scaleStep), u1s[s - 1]);
-        cuda::multiply(u2s[s - 1], Scalar::all(1/scaleStep), u2s[s - 1]);
-    }
-}
-
 namespace tvl1flow
 {
-    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy);
-    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho);
+    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy, cudaStream_t stream);
+    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y,
+                      PtrStepSzf u1, PtrStepSzf u2,
+                      PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy,
+                      PtrStepSzf grad, PtrStepSzf rho,
+                      cudaStream_t stream);
    void estimateU(PtrStepSzf I1wx, PtrStepSzf I1wy,
                   PtrStepSzf grad, PtrStepSzf rho_c,
                   PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
                   PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf error,
-                   float l_t, float theta, float gamma, bool calcError);
-    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32, float taut, const float gamma);
+                   float l_t, float theta, float gamma, bool calcError,
+                   cudaStream_t stream);
+    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3,
+                               PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
+                               float taut, float gamma,
+                               cudaStream_t stream);
 }

-void cv::cuda::OpticalFlowDual_TVL1_CUDA::procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2, GpuMat& u3)
+namespace
 {
-    using namespace tvl1flow;
-
-    const double scaledEpsilon = epsilon * epsilon * I0.size().area();
-
-    CV_DbgAssert( I1.size() == I0.size() );
-    CV_DbgAssert( I1.type() == I0.type() );
-    CV_DbgAssert( u1.size() == I0.size() );
-    CV_DbgAssert( u2.size() == u1.size() );
-
-    GpuMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows));
-    centeredGradient(I1, I1x, I1y);
-
-    GpuMat I1w = I1w_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat I1wx = I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat I1wy = I1wy_buf(Rect(0, 0, I0.cols, I0.rows));
-
-    GpuMat grad = grad_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat rho_c = rho_c_buf(Rect(0, 0, I0.cols, I0.rows));
-
-    GpuMat p11 = p11_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat p22 = p22_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat p31, p32;
-    if (gamma)
+    class OpticalFlowDual_TVL1_Impl : public OpticalFlowDual_TVL1
    {
-        p31 = p31_buf(Rect(0, 0, I0.cols, I0.rows));
-        p32 = p32_buf(Rect(0, 0, I0.cols, I0.rows));
-    }
-    p11.setTo(Scalar::all(0));
-    p12.setTo(Scalar::all(0));
-    p21.setTo(Scalar::all(0));
-    p22.setTo(Scalar::all(0));
-    if (gamma)
-    {
-        p31.setTo(Scalar::all(0));
-        p32.setTo(Scalar::all(0));
-    }
-
-    GpuMat diff = diff_buf(Rect(0, 0, I0.cols, I0.rows));
-
-    const float l_t = static_cast<float>(lambda * theta);
-    const float taut = static_cast<float>(tau / theta);
-
-    for (int warpings = 0; warpings < warps; ++warpings)
-    {
-        warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c);
-
-        double error = std::numeric_limits<double>::max();
-        double prevError = 0.0;
-        for (int n = 0; error > scaledEpsilon && n < iterations; ++n)
+    public:
+        OpticalFlowDual_TVL1_Impl(double tau, double lambda, double theta, int nscales, int warps, double epsilon,
+                                  int iterations, double scaleStep, double gamma, bool useInitialFlow) :
+            tau_(tau), lambda_(lambda), gamma_(gamma), theta_(theta), nscales_(nscales), warps_(warps),
+            epsilon_(epsilon), iterations_(iterations), scaleStep_(scaleStep), useInitialFlow_(useInitialFlow)
        {
-            // some tweaks to make sum operation less frequently
-            bool calcError = (epsilon > 0) && (n & 0x1) && (prevError < scaledEpsilon);
-            cv::Mat m1(u3);
-            estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, diff, l_t, static_cast<float>(theta), gamma, calcError);
-            if (calcError)
+        }
+
+        virtual double getTau() const { return tau_; }
+        virtual void setTau(double tau) { tau_ = tau; }
+
+        virtual double getLambda() const { return lambda_; }
+        virtual void setLambda(double lambda) { lambda_ = lambda; }
+
+        virtual double getGamma() const { return gamma_; }
+        virtual void setGamma(double gamma) { gamma_ = gamma; }
+
+        virtual double getTheta() const { return theta_; }
+        virtual void setTheta(double theta) { theta_ = theta; }
+
+        virtual int getNumScales() const { return nscales_; }
+        virtual void setNumScales(int nscales) { nscales_ = nscales; }
+
+        virtual int getNumWarps() const { return warps_; }
+        virtual void setNumWarps(int warps) { warps_ = warps; }
+
+        virtual double getEpsilon() const { return epsilon_; }
+        virtual void setEpsilon(double epsilon) { epsilon_ = epsilon; }
+
+        virtual int getNumIterations() const { return iterations_; }
+        virtual void setNumIterations(int iterations) { iterations_ = iterations; }
+
+        virtual double getScaleStep() const { return scaleStep_; }
+        virtual void setScaleStep(double scaleStep) { scaleStep_ = scaleStep; }
+
+        virtual bool getUseInitialFlow() const { return useInitialFlow_; }
+        virtual void setUseInitialFlow(bool useInitialFlow) { useInitialFlow_ = useInitialFlow; }
+
+        virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow, Stream& stream);
+
+    private:
+        double tau_;
+        double lambda_;
+        double gamma_;
+        double theta_;
+        int nscales_;
+        int warps_;
+        double epsilon_;
+        int iterations_;
+        double scaleStep_;
+        bool useInitialFlow_;
+
+    private:
+        void calcImpl(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, Stream& stream);
+        void procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2, GpuMat& u3, Stream& stream);
+
+        std::vector<GpuMat> I0s;
+        std::vector<GpuMat> I1s;
+        std::vector<GpuMat> u1s;
+        std::vector<GpuMat> u2s;
+        std::vector<GpuMat> u3s;
+
+        GpuMat I1x_buf;
+        GpuMat I1y_buf;
+
+        GpuMat I1w_buf;
+        GpuMat I1wx_buf;
+        GpuMat I1wy_buf;
+
+        GpuMat grad_buf;
+        GpuMat rho_c_buf;
+
+        GpuMat p11_buf;
+        GpuMat p12_buf;
+        GpuMat p21_buf;
+        GpuMat p22_buf;
+        GpuMat p31_buf;
+        GpuMat p32_buf;
+
+        GpuMat diff_buf;
+        GpuMat norm_buf;
+    };
+
+    // Public entry point: computes the flow between _frame0 and _frame1 and stores
+    // it in _flow as a two-channel image (x plane merged with y plane).
+    //
+    // When useInitialFlow_ is set, _flow must already hold a CV_32FC2 field of the
+    // same size as _frame0. It is split into the working planes so that calcImpl()
+    // starts from the caller's estimate instead of from whatever the pooled
+    // buffers happen to contain.
+    void OpticalFlowDual_TVL1_Impl::calc(InputArray _frame0, InputArray _frame1, InputOutputArray _flow, Stream& stream)
+    {
+        const GpuMat frame0 = _frame0.getGpuMat();
+        const GpuMat frame1 = _frame1.getGpuMat();
+
+        BufferPool pool(stream);
+        GpuMat flowx = pool.getBuffer(frame0.size(), CV_32FC1);
+        GpuMat flowy = pool.getBuffer(frame0.size(), CV_32FC1);
+
+        if (useInitialFlow_)
+        {
+            const GpuMat initialFlow = _flow.getGpuMat();
+            CV_Assert( initialFlow.size() == frame0.size() && initialFlow.type() == CV_32FC2 );
+
+            // the headers below share their data with flowx / flowy, which already
+            // have the size and type split() asks for
+            GpuMat initialPlanes[] = {flowx, flowy};
+            cuda::split(initialFlow, initialPlanes, stream);
+        }
+
+        calcImpl(frame0, frame1, flowx, flowy, stream);
+
+        GpuMat flows[] = {flowx, flowy};
+        cuda::merge(flows, 2, _flow, stream);
+    }
+
+    // Coarse-to-fine driver. Builds image pyramids for I0 / I1 (converted to
+    // CV_32F), runs procOneScale() from the coarsest level to level 0 and
+    // upsamples the flow between levels. flowx / flowy are used as level 0 of the
+    // flow pyramids, so the result ends up in them; when useInitialFlow_ is set
+    // their incoming content is downsampled and used as the starting estimate.
+    //
+    // The pyramid is cut short as soon as a level gets narrower or lower than
+    // 16 pixels. That reduced level count is kept in a local variable so the
+    // configured nscales_ (and getNumScales()) is not changed by a call.
+    void OpticalFlowDual_TVL1_Impl::calcImpl(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, Stream& stream)
+    {
+        CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 );
+        CV_Assert( I0.size() == I1.size() );
+        CV_Assert( I0.type() == I1.type() );
+        CV_Assert( !useInitialFlow_ || (flowx.size() == I0.size() && flowx.type() == CV_32FC1 && flowy.size() == flowx.size() && flowy.type() == flowx.type()) );
+        CV_Assert( nscales_ > 0 );
+
+        // number of pyramid levels used by this call; may shrink below nscales_
+        int nscales = nscales_;
+
+        // allocate memory for the pyramid structure
+        I0s.resize(nscales);
+        I1s.resize(nscales);
+        u1s.resize(nscales);
+        u2s.resize(nscales);
+        u3s.resize(nscales);
+
+        I0.convertTo(I0s[0], CV_32F, I0.depth() == CV_8U ? 1.0 : 255.0, stream);
+        I1.convertTo(I1s[0], CV_32F, I1.depth() == CV_8U ? 1.0 : 255.0, stream);
+
+        if (!useInitialFlow_)
+        {
+            flowx.create(I0.size(), CV_32FC1);
+            flowy.create(I0.size(), CV_32FC1);
+        }
+
+        u1s[0] = flowx;
+        u2s[0] = flowy;
+        if (gamma_)
+        {
+            u3s[0].create(I0.size(), CV_32FC1);
+        }
+
+        // full-resolution scratch buffers; procOneScale() works on ROIs of them
+        I1x_buf.create(I0.size(), CV_32FC1);
+        I1y_buf.create(I0.size(), CV_32FC1);
+
+        I1w_buf.create(I0.size(), CV_32FC1);
+        I1wx_buf.create(I0.size(), CV_32FC1);
+        I1wy_buf.create(I0.size(), CV_32FC1);
+
+        grad_buf.create(I0.size(), CV_32FC1);
+        rho_c_buf.create(I0.size(), CV_32FC1);
+
+        p11_buf.create(I0.size(), CV_32FC1);
+        p12_buf.create(I0.size(), CV_32FC1);
+        p21_buf.create(I0.size(), CV_32FC1);
+        p22_buf.create(I0.size(), CV_32FC1);
+        if (gamma_)
+        {
+            p31_buf.create(I0.size(), CV_32FC1);
+            p32_buf.create(I0.size(), CV_32FC1);
+        }
+        diff_buf.create(I0.size(), CV_32FC1);
+
+        // create the scales
+        for (int s = 1; s < nscales; ++s)
+        {
+            cuda::resize(I0s[s-1], I0s[s], Size(), scaleStep_, scaleStep_, INTER_LINEAR, stream);
+            cuda::resize(I1s[s-1], I1s[s], Size(), scaleStep_, scaleStep_, INTER_LINEAR, stream);
+
+            if (I0s[s].cols < 16 || I0s[s].rows < 16)
            {
-                error = cuda::sum(diff, norm_buf)[0];
-                prevError = error;
+                // level s is too small: use levels [0, s) only, for this call only
+                nscales = s;
+                break;
+            }
+
+            if (useInitialFlow_)
+            {
+                cuda::resize(u1s[s-1], u1s[s], Size(), scaleStep_, scaleStep_, INTER_LINEAR, stream);
+                cuda::resize(u2s[s-1], u2s[s], Size(), scaleStep_, scaleStep_, INTER_LINEAR, stream);
+
+                cuda::multiply(u1s[s], Scalar::all(scaleStep_), u1s[s], 1, -1, stream);
+                cuda::multiply(u2s[s], Scalar::all(scaleStep_), u2s[s], 1, -1, stream);
            }
            else
            {
-                error = std::numeric_limits<double>::max();
-                prevError -= scaledEpsilon;
+                u1s[s].create(I0s[s].size(), CV_32FC1);
+                u2s[s].create(I0s[s].size(), CV_32FC1);
+            }
+            if (gamma_)
+            {
+                u3s[s].create(I0s[s].size(), CV_32FC1);
+            }
+        }
+
+        // without an initial flow the coarsest level starts from zero
+        if (!useInitialFlow_)
+        {
+            u1s[nscales-1].setTo(Scalar::all(0), stream);
+            u2s[nscales-1].setTo(Scalar::all(0), stream);
+        }
+        if (gamma_)
+        {
+            u3s[nscales - 1].setTo(Scalar::all(0), stream);
+        }
+
+        // pyramidal structure for computing the optical flow
+        for (int s = nscales - 1; s >= 0; --s)
+        {
+            // compute the optical flow at the current scale
+            procOneScale(I0s[s], I1s[s], u1s[s], u2s[s], u3s[s], stream);
+
+            // if this was the last scale, finish now
+            if (s == 0)
+                break;
+
+            // otherwise, upsample the optical flow
+
+            // zoom the optical flow for the next finer scale
+            cuda::resize(u1s[s], u1s[s - 1], I0s[s - 1].size(), 0, 0, INTER_LINEAR, stream);
+            cuda::resize(u2s[s], u2s[s - 1], I0s[s - 1].size(), 0, 0, INTER_LINEAR, stream);
+            if (gamma_)
+            {
+                cuda::resize(u3s[s], u3s[s - 1], I0s[s - 1].size(), 0, 0, INTER_LINEAR, stream);
            }

-            estimateDualVariables(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma);
+            // scale the optical flow with the appropriate zoom factor
+            cuda::multiply(u1s[s - 1], Scalar::all(1/scaleStep_), u1s[s - 1], 1, -1, stream);
+            cuda::multiply(u2s[s - 1], Scalar::all(1/scaleStep_), u2s[s - 1], 1, -1, stream);
+        }
+    }
+
+    // Processes one pyramid level: refines u1 / u2 (and u3 when gamma_ is non-zero)
+    // in place for the image pair I0 / I1.
+    //
+    // All scratch images are ROIs of the member *_buf buffers cropped to the size
+    // of I0. For each of the warps_ warpings, warpBackward() is run once and then
+    // estimateU() / estimateDualVariables() are alternated for at most iterations_
+    // steps, stopping early once the summed error drops to scaledEpsilon or below.
+    void OpticalFlowDual_TVL1_Impl::procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2, GpuMat& u3, Stream& _stream)
+    {
+        using namespace tvl1flow;
+
+        // raw handle for the kernel wrappers declared in namespace tvl1flow
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        const double scaledEpsilon = epsilon_ * epsilon_ * I0.size().area();
+
+        CV_DbgAssert( I1.size() == I0.size() );
+        CV_DbgAssert( I1.type() == I0.type() );
+        CV_DbgAssert( u1.size() == I0.size() );
+        CV_DbgAssert( u2.size() == u1.size() );
+
+        GpuMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows));
+        centeredGradient(I1, I1x, I1y, stream);
+
+        GpuMat I1w = I1w_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat I1wx = I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat I1wy = I1wy_buf(Rect(0, 0, I0.cols, I0.rows));
+
+        GpuMat grad = grad_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat rho_c = rho_c_buf(Rect(0, 0, I0.cols, I0.rows));
+
+        GpuMat p11 = p11_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat p22 = p22_buf(Rect(0, 0, I0.cols, I0.rows));
+        // p31 / p32 stay empty unless gamma_ is non-zero
+        GpuMat p31, p32;
+        if (gamma_)
+        {
+            p31 = p31_buf(Rect(0, 0, I0.cols, I0.rows));
+            p32 = p32_buf(Rect(0, 0, I0.cols, I0.rows));
+        }
+        // the dual variables start from zero on every level
+        p11.setTo(Scalar::all(0), _stream);
+        p12.setTo(Scalar::all(0), _stream);
+        p21.setTo(Scalar::all(0), _stream);
+        p22.setTo(Scalar::all(0), _stream);
+        if (gamma_)
+        {
+            p31.setTo(Scalar::all(0), _stream);
+            p32.setTo(Scalar::all(0), _stream);
+        }
+
+        GpuMat diff = diff_buf(Rect(0, 0, I0.cols, I0.rows));
+
+        // the kernel wrappers take float parameters: narrow the double-precision
+        // settings once, explicitly, instead of at every call inside the loops
+        const float l_t = static_cast<float>(lambda_ * theta_);
+        const float taut = static_cast<float>(tau_ / theta_);
+        const float theta = static_cast<float>(theta_);
+        const float gamma = static_cast<float>(gamma_);
+
+        for (int warpings = 0; warpings < warps_; ++warpings)
+        {
+            warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c, stream);
+
+            double error = std::numeric_limits<double>::max();
+            double prevError = 0.0;
+            for (int n = 0; error > scaledEpsilon && n < iterations_; ++n)
+            {
+                // some tweaks to make sum operation less frequently
+                bool calcError = (epsilon_ > 0) && (n & 0x1) && (prevError < scaledEpsilon);
+                estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, diff, l_t, theta, gamma, calcError, stream);
+                if (calcError)
+                {
+                    // cuda::sum is called without a stream argument and returns its
+                    // result to the host, so wait for the work queued on _stream first
+                    _stream.waitForCompletion();
+                    error = cuda::sum(diff, norm_buf)[0];
+                    prevError = error;
+                }
+                else
+                {
+                    error = std::numeric_limits<double>::max();
+                    prevError -= scaledEpsilon;
+                }
+
+                estimateDualVariables(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma, stream);
+            }
        }
    }
 }

-void cv::cuda::OpticalFlowDual_TVL1_CUDA::collectGarbage()
+Ptr<OpticalFlowDual_TVL1> cv::cuda::OpticalFlowDual_TVL1::create(
+            double tau, double lambda, double theta, int nscales, int warps,
+            double epsilon, int iterations, double scaleStep, double gamma, bool useInitialFlow)
 {
-    I0s.clear();
-    I1s.clear();
-    u1s.clear();
-    u2s.clear();
-    u3s.clear();
-
-    I1x_buf.release();
-    I1y_buf.release();
-
-    I1w_buf.release();
-    I1wx_buf.release();
-    I1wy_buf.release();
-
-    grad_buf.release();
-    rho_c_buf.release();
-
-    p11_buf.release();
-    p12_buf.release();
-    p21_buf.release();
-    p22_buf.release();
-    if (gamma)
-    {
-        p31_buf.release();
-        p32_buf.release();
-    }
-    diff_buf.release();
-    norm_buf.release();
+    return makePtr<OpticalFlowDual_TVL1_Impl>(tau, lambda, theta, nscales, warps,
+                                              epsilon, iterations, scaleStep, gamma, useInitialFlow);
 }

 #endif // !defined HAVE_CUDA || defined(CUDA_DISABLER)
--- a/modules/cudaoptflow/test/test_optflow.cpp
+++ b/modules/cudaoptflow/test/test_optflow.cpp
@ -71,12 +71,18 @@ CUDA_TEST_P(BroxOpticalFlow, Regression)
    cv::Mat frame1 = readImageType("opticalflow/frame1.png", CV_32FC1);
    ASSERT_FALSE(frame1.empty());

-    cv::cuda::BroxOpticalFlow brox(0.197f /*alpha*/, 50.0f /*gamma*/, 0.8f /*scale_factor*/,
-                                  10 /*inner_iterations*/, 77 /*outer_iterations*/, 10 /*solver_iterations*/);
+    cv::Ptr<cv::cuda::BroxOpticalFlow> brox =
+            cv::cuda::BroxOpticalFlow::create(0.197 /*alpha*/, 50.0 /*gamma*/, 0.8 /*scale_factor*/,
+                                              10 /*inner_iterations*/, 77 /*outer_iterations*/, 10 /*solver_iterations*/);

-    cv::cuda::GpuMat u;
-    cv::cuda::GpuMat v;
-    brox(loadMat(frame0), loadMat(frame1), u, v);
+    cv::cuda::GpuMat flow;
+    brox->calc(loadMat(frame0), loadMat(frame1), flow);
+
+    cv::cuda::GpuMat flows[2];
+    cv::cuda::split(flow, flows);
+
+    cv::cuda::GpuMat u = flows[0];
+    cv::cuda::GpuMat v = flows[1];

    std::string fname(cvtest::TS::ptr()->get_data_path());
    if (devInfo.majorVersion() >= 2)
@ -133,12 +139,18 @@ CUDA_TEST_P(BroxOpticalFlow, OpticalFlowNan)
    cv::resize(frame0, r_frame0, cv::Size(1380,1000));
    cv::resize(frame1, r_frame1, cv::Size(1380,1000));

-    cv::cuda::BroxOpticalFlow brox(0.197f /*alpha*/, 50.0f /*gamma*/, 0.8f /*scale_factor*/,
-                                  5 /*inner_iterations*/, 150 /*outer_iterations*/, 10 /*solver_iterations*/);
+    // same solver settings as before the port to the factory API (5 inner / 150 outer iterations)
+    cv::Ptr<cv::cuda::BroxOpticalFlow> brox =
+            cv::cuda::BroxOpticalFlow::create(0.197 /*alpha*/, 50.0 /*gamma*/, 0.8 /*scale_factor*/,
+                                              5 /*inner_iterations*/, 150 /*outer_iterations*/, 10 /*solver_iterations*/);

-    cv::cuda::GpuMat u;
-    cv::cuda::GpuMat v;
-    brox(loadMat(r_frame0), loadMat(r_frame1), u, v);
+    cv::cuda::GpuMat flow;
+    // run on the resized 1380x1000 frames prepared above, not on the original frames
+    brox->calc(loadMat(r_frame0), loadMat(r_frame1), flow);
+
+    cv::cuda::GpuMat flows[2];
+    cv::cuda::split(flow, flows);
+
+    cv::cuda::GpuMat u = flows[0];
+    cv::cuda::GpuMat v = flows[1];

    cv::Mat h_u, h_v;
    u.download(h_u);
@ -193,11 +205,12 @@ CUDA_TEST_P(PyrLKOpticalFlow, Sparse)
    cv::Mat pts_mat(1, (int) pts.size(), CV_32FC2, (void*) &pts[0]);
    d_pts.upload(pts_mat);

-    cv::cuda::PyrLKOpticalFlow pyrLK;
+    cv::Ptr<cv::cuda::SparsePyrLKOpticalFlow> pyrLK =
+            cv::cuda::SparsePyrLKOpticalFlow::create();

    cv::cuda::GpuMat d_nextPts;
    cv::cuda::GpuMat d_status;
-    pyrLK.sparse(loadMat(frame0), loadMat(frame1), d_pts, d_nextPts, d_status);
+    pyrLK->calc(loadMat(frame0), loadMat(frame1), d_pts, d_nextPts, d_status);

    std::vector<cv::Point2f> nextPts(d_nextPts.cols);
    cv::Mat nextPts_mat(1, d_nextPts.cols, CV_32FC2, (void*) &nextPts[0]);
@ -285,34 +298,30 @@ CUDA_TEST_P(FarnebackOpticalFlow, Accuracy)

    double polySigma = polyN <= 5 ? 1.1 : 1.5;

-    cv::cuda::FarnebackOpticalFlow farn;
-    farn.pyrScale = pyrScale;
-    farn.polyN = polyN;
-    farn.polySigma = polySigma;
-    farn.flags = flags;
+    cv::Ptr<cv::cuda::FarnebackOpticalFlow> farn =
+            cv::cuda::FarnebackOpticalFlow::create();
+    farn->setPyrScale(pyrScale);
+    farn->setPolyN(polyN);
+    farn->setPolySigma(polySigma);
+    farn->setFlags(flags);

-    cv::cuda::GpuMat d_flowx, d_flowy;
-    farn(loadMat(frame0), loadMat(frame1), d_flowx, d_flowy);
+    cv::cuda::GpuMat d_flow;
+    farn->calc(loadMat(frame0), loadMat(frame1), d_flow);

    cv::Mat flow;
    if (useInitFlow)
    {
-        cv::Mat flowxy[] = {cv::Mat(d_flowx), cv::Mat(d_flowy)};
-        cv::merge(flowxy, 2, flow);
+        d_flow.download(flow);

-        farn.flags |= cv::OPTFLOW_USE_INITIAL_FLOW;
-        farn(loadMat(frame0), loadMat(frame1), d_flowx, d_flowy);
+        farn->setFlags(farn->getFlags() | cv::OPTFLOW_USE_INITIAL_FLOW);
+        farn->calc(loadMat(frame0), loadMat(frame1), d_flow);
    }

    cv::calcOpticalFlowFarneback(
-        frame0, frame1, flow, farn.pyrScale, farn.numLevels, farn.winSize,
-        farn.numIters, farn.polyN, farn.polySigma, farn.flags);
+        frame0, frame1, flow, farn->getPyrScale(), farn->getNumLevels(), farn->getWinSize(),
+        farn->getNumIters(), farn->getPolyN(), farn->getPolySigma(), farn->getFlags());

-    std::vector<cv::Mat> flowxy;
-    cv::split(flow, flowxy);
-
-    EXPECT_MAT_SIMILAR(flowxy[0], d_flowx, 0.1);
-    EXPECT_MAT_SIMILAR(flowxy[1], d_flowy, 0.1);
+    EXPECT_MAT_SIMILAR(flow, d_flow, 0.1);
 }

 INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, FarnebackOpticalFlow, testing::Combine(
@ -325,15 +334,20 @@ INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, FarnebackOpticalFlow, testing::Combine(
 //////////////////////////////////////////////////////
 // OpticalFlowDual_TVL1

-PARAM_TEST_CASE(OpticalFlowDual_TVL1, cv::cuda::DeviceInfo, UseRoi)
+namespace
+{
+    IMPLEMENT_PARAM_CLASS(Gamma, double)
+}
+
+PARAM_TEST_CASE(OpticalFlowDual_TVL1, cv::cuda::DeviceInfo, Gamma)
 {
    cv::cuda::DeviceInfo devInfo;
-    bool useRoi;
+    double gamma;

    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
-        useRoi = GET_PARAM(1);
+        gamma = GET_PARAM(1);

        cv::cuda::setDevice(devInfo.deviceID());
    }
@ -347,156 +361,28 @@ CUDA_TEST_P(OpticalFlowDual_TVL1, Accuracy)
    cv::Mat frame1 = readImage("opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(frame1.empty());

-    cv::cuda::OpticalFlowDual_TVL1_CUDA d_alg;
-    cv::cuda::GpuMat d_flowx = createMat(frame0.size(), CV_32FC1, useRoi);
-    cv::cuda::GpuMat d_flowy = createMat(frame0.size(), CV_32FC1, useRoi);
-    d_alg(loadMat(frame0, useRoi), loadMat(frame1, useRoi), d_flowx, d_flowy);
+    cv::Ptr<cv::cuda::OpticalFlowDual_TVL1> d_alg =
+            cv::cuda::OpticalFlowDual_TVL1::create();
+    d_alg->setNumIterations(10);
+    d_alg->setGamma(gamma);
+
+    cv::cuda::GpuMat d_flow;
+    d_alg->calc(loadMat(frame0), loadMat(frame1), d_flow);

    cv::Ptr<cv::DenseOpticalFlow> alg = cv::createOptFlow_DualTVL1();
    alg->set("medianFiltering", 1);
    alg->set("innerIterations", 1);
-    alg->set("outerIterations", d_alg.iterations);
+    alg->set("outerIterations", d_alg->getNumIterations());
+    alg->set("gamma", gamma);
+
    cv::Mat flow;
    alg->calc(frame0, frame1, flow);
-    cv::Mat gold[2];
-    cv::split(flow, gold);
-    cv::Mat mx(d_flowx);
-    cv::Mat my(d_flowx);

-    EXPECT_MAT_SIMILAR(gold[0], d_flowx, 4e-3);
-    EXPECT_MAT_SIMILAR(gold[1], d_flowy, 4e-3);
-    d_alg.gamma = 1;
-    alg->set("gamma", 1);
-    d_alg(loadMat(frame0, useRoi), loadMat(frame1, useRoi), d_flowx, d_flowy);
-    alg->calc(frame0, frame1, flow);
-    cv::split(flow, gold);
-    mx = cv::Mat(d_flowx);
-    my = cv::Mat(d_flowx);
-
-    EXPECT_MAT_SIMILAR(gold[0], d_flowx, 4e-3);
-    EXPECT_MAT_SIMILAR(gold[1], d_flowy, 4e-3);
+    EXPECT_MAT_SIMILAR(flow, d_flow, 4e-3);
 }

 INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, OpticalFlowDual_TVL1, testing::Combine(
    ALL_DEVICES,
-    WHOLE_SUBMAT));
-
-//////////////////////////////////////////////////////
-// FastOpticalFlowBM
-
-namespace
-{
-    void FastOpticalFlowBM_gold(const cv::Mat_<uchar>& I0, const cv::Mat_<uchar>& I1, cv::Mat_<float>& velx, cv::Mat_<float>& vely, int search_window, int block_window)
-    {
-        velx.create(I0.size());
-        vely.create(I0.size());
-
-        int search_radius = search_window / 2;
-        int block_radius = block_window / 2;
-
-        for (int y = 0; y < I0.rows; ++y)
-        {
-            for (int x = 0; x < I0.cols; ++x)
-            {
-                int bestDist = std::numeric_limits<int>::max();
-                int bestDx = 0;
-                int bestDy = 0;
-
-                for (int dy = -search_radius; dy <= search_radius; ++dy)
-                {
-                    for (int dx = -search_radius; dx <= search_radius; ++dx)
-                    {
-                        int dist = 0;
-
-                        for (int by = -block_radius; by <= block_radius; ++by)
-                        {
-                            for (int bx = -block_radius; bx <= block_radius; ++bx)
-                            {
-                                int I0_val = I0(cv::borderInterpolate(y + by, I0.rows, cv::BORDER_DEFAULT), cv::borderInterpolate(x + bx, I0.cols, cv::BORDER_DEFAULT));
-                                int I1_val = I1(cv::borderInterpolate(y + dy + by, I0.rows, cv::BORDER_DEFAULT), cv::borderInterpolate(x + dx + bx, I0.cols, cv::BORDER_DEFAULT));
-
-                                dist += std::abs(I0_val - I1_val);
-                            }
-                        }
-
-                        if (dist < bestDist)
-                        {
-                            bestDist = dist;
-                            bestDx = dx;
-                            bestDy = dy;
-                        }
-                    }
-                }
-
-                velx(y, x) = (float) bestDx;
-                vely(y, x) = (float) bestDy;
-            }
-        }
-    }
-
-    double calc_rmse(const cv::Mat_<float>& flow1, const cv::Mat_<float>& flow2)
-    {
-        double sum = 0.0;
-
-        for (int y = 0; y < flow1.rows; ++y)
-        {
-            for (int x = 0; x < flow1.cols; ++x)
-            {
-                double diff = flow1(y, x) - flow2(y, x);
-                sum += diff * diff;
-            }
-        }
-
-        return std::sqrt(sum / flow1.size().area());
-    }
-}
-
-struct FastOpticalFlowBM : testing::TestWithParam<cv::cuda::DeviceInfo>
-{
-};
-
-CUDA_TEST_P(FastOpticalFlowBM, Accuracy)
-{
-    const double MAX_RMSE = 0.6;
-
-    int search_window = 15;
-    int block_window = 5;
-
-    cv::cuda::DeviceInfo devInfo = GetParam();
-    cv::cuda::setDevice(devInfo.deviceID());
-
-    cv::Mat frame0 = readImage("opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame0.empty());
-
-    cv::Mat frame1 = readImage("opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame1.empty());
-
-    cv::Size smallSize(320, 240);
-    cv::Mat frame0_small;
-    cv::Mat frame1_small;
-
-    cv::resize(frame0, frame0_small, smallSize);
-    cv::resize(frame1, frame1_small, smallSize);
-
-    cv::cuda::GpuMat d_flowx;
-    cv::cuda::GpuMat d_flowy;
-    cv::cuda::FastOpticalFlowBM fastBM;
-
-    fastBM(loadMat(frame0_small), loadMat(frame1_small), d_flowx, d_flowy, search_window, block_window);
-
-    cv::Mat_<float> flowx;
-    cv::Mat_<float> flowy;
-    FastOpticalFlowBM_gold(frame0_small, frame1_small, flowx, flowy, search_window, block_window);
-
-    double err;
-
-    err = calc_rmse(flowx, cv::Mat(d_flowx));
-    EXPECT_LE(err, MAX_RMSE);
-
-    err = calc_rmse(flowy, cv::Mat(d_flowy));
-    EXPECT_LE(err, MAX_RMSE);
-}
-
-INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, FastOpticalFlowBM, ALL_DEVICES);
+    testing::Values(Gamma(0.0), Gamma(1.0))));

 #endif // HAVE_CUDA
--- a/modules/cudaoptflow/test/test_precomp.hpp
+++ b/modules/cudaoptflow/test/test_precomp.hpp
@ -57,6 +57,7 @@
 #include "opencv2/ts/cuda_test.hpp"

 #include "opencv2/cudaoptflow.hpp"
+#include "opencv2/cudaarithm.hpp"
 #include "opencv2/video.hpp"

 #include "cvconfig.h"
--- a/modules/superres/src/optical_flow.cpp
+++ b/modules/superres/src/optical_flow.cpp
@ -341,7 +341,7 @@ namespace
        int iterations_;
        bool useInitialFlow_;

-        Ptr<DenseOpticalFlow> alg_;
+        Ptr<cv::DenseOpticalFlow> alg_;
    };

    CV_INIT_ALGORITHM(DualTVL1, "DenseOpticalFlowExt.DualTVL1",
@ -514,7 +514,7 @@ namespace
        int outerIterations_;
        int solverIterations_;

-        BroxOpticalFlow alg_;
+        Ptr<cuda::BroxOpticalFlow> alg_;
    };

    CV_INIT_ALGORITHM(Brox_CUDA, "DenseOpticalFlowExt.Brox_CUDA",
@ -525,31 +525,40 @@ namespace
                      obj.info()->addParam(obj, "outerIterations", obj.outerIterations_, false, 0, 0, "Number of warping iterations (number of pyramid levels)");
                      obj.info()->addParam(obj, "solverIterations", obj.solverIterations_, false, 0, 0, "Number of linear system solver iterations"))

-    Brox_CUDA::Brox_CUDA() : GpuOpticalFlow(CV_32FC1), alg_(0.197f, 50.0f, 0.8f, 10, 77, 10)
+    Brox_CUDA::Brox_CUDA() : GpuOpticalFlow(CV_32FC1)
    {
-        alpha_ = alg_.alpha;
-        gamma_ = alg_.gamma;
-        scaleFactor_ = alg_.scale_factor;
-        innerIterations_ = alg_.inner_iterations;
-        outerIterations_ = alg_.outer_iterations;
-        solverIterations_ = alg_.solver_iterations;
+        alg_ = cuda::BroxOpticalFlow::create(0.197f, 50.0f, 0.8f, 10, 77, 10);
+
+        alpha_ = alg_->getFlowSmoothness();
+        gamma_ = alg_->getGradientConstancyImportance();
+        scaleFactor_ = alg_->getPyramidScaleFactor();
+        innerIterations_ = alg_->getInnerIterations();
+        outerIterations_ = alg_->getOuterIterations();
+        solverIterations_ = alg_->getSolverIterations();
    }

    void Brox_CUDA::impl(const GpuMat& input0, const GpuMat& input1, GpuMat& dst1, GpuMat& dst2)
    {
-        alg_.alpha = static_cast<float>(alpha_);
-        alg_.gamma = static_cast<float>(gamma_);
-        alg_.scale_factor = static_cast<float>(scaleFactor_);
-        alg_.inner_iterations = innerIterations_;
-        alg_.outer_iterations = outerIterations_;
-        alg_.solver_iterations = solverIterations_;
+        alg_->setFlowSmoothness(alpha_);
+        alg_->setGradientConstancyImportance(gamma_);
+        alg_->setPyramidScaleFactor(scaleFactor_);
+        alg_->setInnerIterations(innerIterations_);
+        alg_->setOuterIterations(outerIterations_);
+        alg_->setSolverIterations(solverIterations_);

-        alg_(input0, input1, dst1, dst2);
+        GpuMat flow;
+        alg_->calc(input0, input1, flow);
+
+        GpuMat flows[2];
+        cuda::split(flow, flows);
+
+        dst1 = flows[0];
+        dst2 = flows[1];
    }

    void Brox_CUDA::collectGarbage()
    {
-        alg_.buf.release();
+        alg_ = cuda::BroxOpticalFlow::create(alpha_, gamma_, scaleFactor_, innerIterations_, outerIterations_, solverIterations_);
        GpuOpticalFlow::collectGarbage();
    }
 }
@ -581,7 +590,7 @@ namespace
        int maxLevel_;
        int iterations_;

-        PyrLKOpticalFlow alg_;
+        Ptr<cuda::DensePyrLKOpticalFlow> alg_;
    };

    CV_INIT_ALGORITHM(PyrLK_CUDA, "DenseOpticalFlowExt.PyrLK_CUDA",
@ -591,24 +600,32 @@ namespace

    PyrLK_CUDA::PyrLK_CUDA() : GpuOpticalFlow(CV_8UC1)
    {
-        winSize_ = alg_.winSize.width;
-        maxLevel_ = alg_.maxLevel;
-        iterations_ = alg_.iters;
+        alg_ = cuda::DensePyrLKOpticalFlow::create();
+
+        winSize_ = alg_->getWinSize().width;
+        maxLevel_ = alg_->getMaxLevel();
+        iterations_ = alg_->getNumIters();
    }

    void PyrLK_CUDA::impl(const GpuMat& input0, const GpuMat& input1, GpuMat& dst1, GpuMat& dst2)
    {
-        alg_.winSize.width = winSize_;
-        alg_.winSize.height = winSize_;
-        alg_.maxLevel = maxLevel_;
-        alg_.iters = iterations_;
+        alg_->setWinSize(Size(winSize_, winSize_));
+        alg_->setMaxLevel(maxLevel_);
+        alg_->setNumIters(iterations_);

-        alg_.dense(input0, input1, dst1, dst2);
+        GpuMat flow;
+        alg_->calc(input0, input1, flow);
+
+        GpuMat flows[2];
+        cuda::split(flow, flows);
+
+        dst1 = flows[0];
+        dst2 = flows[1];
    }

    void PyrLK_CUDA::collectGarbage()
    {
-        alg_.releaseMemory();
+        alg_ = cuda::DensePyrLKOpticalFlow::create();
        GpuOpticalFlow::collectGarbage();
    }
 }
@ -644,7 +661,7 @@ namespace
        double polySigma_;
        int flags_;

-        FarnebackOpticalFlow alg_;
+        Ptr<cuda::FarnebackOpticalFlow> alg_;
    };

    CV_INIT_ALGORITHM(Farneback_CUDA, "DenseOpticalFlowExt.Farneback_CUDA",
@ -658,31 +675,40 @@ namespace

    Farneback_CUDA::Farneback_CUDA() : GpuOpticalFlow(CV_8UC1)
    {
-        pyrScale_ = alg_.pyrScale;
-        numLevels_ = alg_.numLevels;
-        winSize_ = alg_.winSize;
-        numIters_ = alg_.numIters;
-        polyN_ = alg_.polyN;
-        polySigma_ = alg_.polySigma;
-        flags_ = alg_.flags;
+        alg_ = cuda::FarnebackOpticalFlow::create();
+
+        pyrScale_ = alg_->getPyrScale();
+        numLevels_ = alg_->getNumLevels();
+        winSize_ = alg_->getWinSize();
+        numIters_ = alg_->getNumIters();
+        polyN_ = alg_->getPolyN();
+        polySigma_ = alg_->getPolySigma();
+        flags_ = alg_->getFlags();
    }

    void Farneback_CUDA::impl(const GpuMat& input0, const GpuMat& input1, GpuMat& dst1, GpuMat& dst2)
    {
-        alg_.pyrScale = pyrScale_;
-        alg_.numLevels = numLevels_;
-        alg_.winSize = winSize_;
-        alg_.numIters = numIters_;
-        alg_.polyN = polyN_;
-        alg_.polySigma = polySigma_;
-        alg_.flags = flags_;
+        alg_->setPyrScale(pyrScale_);
+        alg_->setNumLevels(numLevels_);
+        alg_->setWinSize(winSize_);
+        alg_->setNumIters(numIters_);
+        alg_->setPolyN(polyN_);
+        alg_->setPolySigma(polySigma_);
+        alg_->setFlags(flags_);

-        alg_(input0, input1, dst1, dst2);
+        GpuMat flow;
+        alg_->calc(input0, input1, flow);
+
+        GpuMat flows[2];
+        cuda::split(flow, flows);
+
+        dst1 = flows[0];
+        dst2 = flows[1];
    }

    void Farneback_CUDA::collectGarbage()
    {
-        alg_.releaseMemory();
+        alg_ = cuda::FarnebackOpticalFlow::create();
        GpuOpticalFlow::collectGarbage();
    }
 }
@ -719,7 +745,7 @@ namespace
        int iterations_;
        bool useInitialFlow_;

-        OpticalFlowDual_TVL1_CUDA alg_;
+        Ptr<cuda::OpticalFlowDual_TVL1> alg_;
    };

    CV_INIT_ALGORITHM(DualTVL1_CUDA, "DenseOpticalFlowExt.DualTVL1_CUDA",
@ -734,33 +760,42 @@ namespace

    DualTVL1_CUDA::DualTVL1_CUDA() : GpuOpticalFlow(CV_8UC1)
    {
-        tau_ = alg_.tau;
-        lambda_ = alg_.lambda;
-        theta_ = alg_.theta;
-        nscales_ = alg_.nscales;
-        warps_ = alg_.warps;
-        epsilon_ = alg_.epsilon;
-        iterations_ = alg_.iterations;
-        useInitialFlow_ = alg_.useInitialFlow;
+        alg_ = cuda::OpticalFlowDual_TVL1::create();
+
+        tau_ = alg_->getTau();
+        lambda_ = alg_->getLambda();
+        theta_ = alg_->getTheta();
+        nscales_ = alg_->getNumScales();
+        warps_ = alg_->getNumWarps();
+        epsilon_ = alg_->getEpsilon();
+        iterations_ = alg_->getNumIterations();
+        useInitialFlow_ = alg_->getUseInitialFlow();
    }

    void DualTVL1_CUDA::impl(const GpuMat& input0, const GpuMat& input1, GpuMat& dst1, GpuMat& dst2)
    {
-        alg_.tau = tau_;
-        alg_.lambda = lambda_;
-        alg_.theta = theta_;
-        alg_.nscales = nscales_;
-        alg_.warps = warps_;
-        alg_.epsilon = epsilon_;
-        alg_.iterations = iterations_;
-        alg_.useInitialFlow = useInitialFlow_;
+        alg_->setTau(tau_);
+        alg_->setLambda(lambda_);
+        alg_->setTheta(theta_);
+        alg_->setNumScales(nscales_);
+        alg_->setNumWarps(warps_);
+        alg_->setEpsilon(epsilon_);
+        alg_->setNumIterations(iterations_);
+        alg_->setUseInitialFlow(useInitialFlow_);

-        alg_(input0, input1, dst1, dst2);
+        GpuMat flow;
+        alg_->calc(input0, input1, flow);
+
+        GpuMat flows[2];
+        cuda::split(flow, flows);
+
+        dst1 = flows[0];
+        dst2 = flows[1];
    }

    void DualTVL1_CUDA::collectGarbage()
    {
-        alg_.collectGarbage();
+        alg_ = cuda::OpticalFlowDual_TVL1::create();
        GpuOpticalFlow::collectGarbage();
    }
 }
--- a/modules/videostab/include/opencv2/videostab/optical_flow.hpp
+++ b/modules/videostab/include/opencv2/videostab/optical_flow.hpp
@ -121,7 +121,7 @@ public:
             cuda::GpuMat &status);

 private:
-    cuda::PyrLKOpticalFlow optFlowEstimator_;
+    Ptr<cuda::SparsePyrLKOpticalFlow> optFlowEstimator_;
    cuda::GpuMat frame0_, frame1_, points0_, points1_, status_, errors_;
 };

@ -136,7 +136,7 @@ public:
            OutputArray errors);

 private:
-    cuda::PyrLKOpticalFlow optFlowEstimator_;
+    Ptr<cuda::DensePyrLKOpticalFlow> optFlowEstimator_;
    cuda::GpuMat frame0_, frame1_, flowX_, flowY_, errors_;
 };

--- a/modules/videostab/src/optical_flow.cpp
+++ b/modules/videostab/src/optical_flow.cpp
@ -45,6 +45,10 @@
 #include "opencv2/videostab/optical_flow.hpp"
 #include "opencv2/videostab/ring_buffer.hpp"

+#ifdef HAVE_OPENCV_CUDAARITHM
+  #include "opencv2/cudaarithm.hpp"
+#endif
+
 namespace cv
 {
 namespace videostab
@ -63,6 +67,7 @@ void SparsePyrLkOptFlowEstimator::run(
 SparsePyrLkOptFlowEstimatorGpu::SparsePyrLkOptFlowEstimatorGpu()
 {
    CV_Assert(cuda::getCudaEnabledDeviceCount() > 0);
+    optFlowEstimator_ = cuda::SparsePyrLKOpticalFlow::create();
 }


@ -91,9 +96,9 @@ void SparsePyrLkOptFlowEstimatorGpu::run(
        const cuda::GpuMat &frame0, const cuda::GpuMat &frame1, const cuda::GpuMat &points0,
        cuda::GpuMat &points1, cuda::GpuMat &status, cuda::GpuMat &errors)
 {
-    optFlowEstimator_.winSize = winSize_;
-    optFlowEstimator_.maxLevel = maxLevel_;
-    optFlowEstimator_.sparse(frame0, frame1, points0, points1, status, &errors);
+    optFlowEstimator_->setWinSize(winSize_);
+    optFlowEstimator_->setMaxLevel(maxLevel_);
+    optFlowEstimator_->calc(frame0, frame1, points0, points1, status, errors);
 }


@ -101,15 +106,16 @@ void SparsePyrLkOptFlowEstimatorGpu::run(
        const cuda::GpuMat &frame0, const cuda::GpuMat &frame1, const cuda::GpuMat &points0,
        cuda::GpuMat &points1, cuda::GpuMat &status)
 {
-    optFlowEstimator_.winSize = winSize_;
-    optFlowEstimator_.maxLevel = maxLevel_;
-    optFlowEstimator_.sparse(frame0, frame1, points0, points1, status);
+    optFlowEstimator_->setWinSize(winSize_);
+    optFlowEstimator_->setMaxLevel(maxLevel_);
+    optFlowEstimator_->calc(frame0, frame1, points0, points1, status);
 }


 DensePyrLkOptFlowEstimatorGpu::DensePyrLkOptFlowEstimatorGpu()
 {
    CV_Assert(cuda::getCudaEnabledDeviceCount() > 0);
+    optFlowEstimator_ = cuda::DensePyrLKOpticalFlow::create();
 }


@ -120,16 +126,24 @@ void DensePyrLkOptFlowEstimatorGpu::run(
    frame0_.upload(frame0.getMat());
    frame1_.upload(frame1.getMat());

-    optFlowEstimator_.winSize = winSize_;
-    optFlowEstimator_.maxLevel = maxLevel_;
+    optFlowEstimator_->setWinSize(winSize_);
+    optFlowEstimator_->setMaxLevel(maxLevel_);

    if (errors.needed())
    {
-        optFlowEstimator_.dense(frame0_, frame1_, flowX_, flowY_, &errors_);
-        errors_.download(errors.getMatRef());
+        CV_Error(Error::StsNotImplemented, "DensePyrLkOptFlowEstimatorGpu doesn't support errors calculation");
    }
    else
-        optFlowEstimator_.dense(frame0_, frame1_, flowX_, flowY_);
+    {
+        cuda::GpuMat flow;
+        optFlowEstimator_->calc(frame0_, frame1_, flow);
+
+        cuda::GpuMat flows[2];
+        cuda::split(flow, flows);
+
+        flowX_ = flows[0];
+        flowY_ = flows[1];
+    }

    flowX_.download(flowX.getMatRef());
    flowY_.download(flowY.getMatRef());
--- a/samples/gpu/brox_optical_flow.cpp
+++ b/samples/gpu/brox_optical_flow.cpp
@ -1,270 +0,0 @@
-#include <iostream>
-#include <iomanip>
-#include <string>
-#include <ctype.h>
-
-#include "opencv2/core.hpp"
-#include "opencv2/core/utility.hpp"
-#include "opencv2/highgui.hpp"
-#include "opencv2/imgproc.hpp"
-#include "opencv2/cudaoptflow.hpp"
-#include "opencv2/cudaarithm.hpp"
-
-using namespace std;
-using namespace cv;
-using namespace cv::cuda;
-
-void getFlowField(const Mat& u, const Mat& v, Mat& flowField);
-
-int main(int argc, const char* argv[])
-{
-    try
-    {
-        const char* keys =
-           "{ h   help      |       | print help message }"
-           "{ l   left      |       | specify left image }"
-           "{ r   right     |       | specify right image }"
-           "{ s   scale     | 0.8   | set pyramid scale factor }"
-           "{ a   alpha     | 0.197 | set alpha }"
-           "{ g   gamma     | 50.0  | set gamma }"
-           "{ i   inner     | 10    | set number of inner iterations }"
-           "{ o   outer     | 77    | set number of outer iterations }"
-           "{ si  solver    | 10    | set number of basic solver iterations }"
-           "{ t   time_step | 0.1   | set frame interpolation time step }";
-
-        CommandLineParser cmd(argc, argv, keys);
-
-        if (cmd.has("help") || !cmd.check())
-        {
-            cmd.printMessage();
-            cmd.printErrors();
-            return 0;
-        }
-
-        string frame0Name = cmd.get<string>("left");
-        string frame1Name = cmd.get<string>("right");
-        float scale = cmd.get<float>("scale");
-        float alpha = cmd.get<float>("alpha");
-        float gamma = cmd.get<float>("gamma");
-        int inner_iterations = cmd.get<int>("inner");
-        int outer_iterations = cmd.get<int>("outer");
-        int solver_iterations = cmd.get<int>("solver");
-        float timeStep = cmd.get<float>("time_step");
-
-        if (frame0Name.empty() || frame1Name.empty())
-        {
-            cerr << "Missing input file names" << endl;
-            return -1;
-        }
-
-        Mat frame0Color = imread(frame0Name);
-        Mat frame1Color = imread(frame1Name);
-
-        if (frame0Color.empty() || frame1Color.empty())
-        {
-            cout << "Can't load input images" << endl;
-            return -1;
-        }
-
-        cv::cuda::printShortCudaDeviceInfo(cv::cuda::getDevice());
-
-        cout << "OpenCV / NVIDIA Computer Vision" << endl;
-        cout << "Optical Flow Demo: Frame Interpolation" << endl;
-        cout << "=========================================" << endl;
-
-        namedWindow("Forward flow");
-        namedWindow("Backward flow");
-
-        namedWindow("Interpolated frame");
-
-        cout << "Press:" << endl;
-        cout << "\tESC to quit" << endl;
-        cout << "\t'a' to move to the previous frame" << endl;
-        cout << "\t's' to move to the next frame\n" << endl;
-
-        frame0Color.convertTo(frame0Color, CV_32F, 1.0 / 255.0);
-        frame1Color.convertTo(frame1Color, CV_32F, 1.0 / 255.0);
-
-        Mat frame0Gray, frame1Gray;
-
-        cv::cvtColor(frame0Color, frame0Gray, COLOR_BGR2GRAY);
-        cv::cvtColor(frame1Color, frame1Gray, COLOR_BGR2GRAY);
-
-        GpuMat d_frame0(frame0Gray);
-        GpuMat d_frame1(frame1Gray);
-
-        cout << "Estimating optical flow" << endl;
-
-        BroxOpticalFlow d_flow(alpha, gamma, scale, inner_iterations, outer_iterations, solver_iterations);
-
-        cout << "\tForward..." << endl;
-
-        GpuMat d_fu, d_fv;
-
-        d_flow(d_frame0, d_frame1, d_fu, d_fv);
-
-        Mat flowFieldForward;
-        getFlowField(Mat(d_fu), Mat(d_fv), flowFieldForward);
-
-        cout << "\tBackward..." << endl;
-
-        GpuMat d_bu, d_bv;
-
-        d_flow(d_frame1, d_frame0, d_bu, d_bv);
-
-        Mat flowFieldBackward;
-        getFlowField(Mat(d_bu), Mat(d_bv), flowFieldBackward);
-
-        cout << "Interpolating..." << endl;
-
-        // first frame color components
-        GpuMat d_b, d_g, d_r;
-
-        // second frame color components
-        GpuMat d_bt, d_gt, d_rt;
-
-        // prepare color components on host and copy them to device memory
-        Mat channels[3];
-        cv::split(frame0Color, channels);
-
-        d_b.upload(channels[0]);
-        d_g.upload(channels[1]);
-        d_r.upload(channels[2]);
-
-        cv::split(frame1Color, channels);
-
-        d_bt.upload(channels[0]);
-        d_gt.upload(channels[1]);
-        d_rt.upload(channels[2]);
-
-        // temporary buffer
-        GpuMat d_buf;
-
-        // intermediate frame color components (GPU memory)
-        GpuMat d_rNew, d_gNew, d_bNew;
-
-        GpuMat d_newFrame;
-
-        vector<Mat> frames;
-        frames.reserve(static_cast<int>(1.0f / timeStep) + 2);
-
-        frames.push_back(frame0Color);
-
-        // compute interpolated frames
-        for (float timePos = timeStep; timePos < 1.0f; timePos += timeStep)
-        {
-            // interpolate blue channel
-            interpolateFrames(d_b, d_bt, d_fu, d_fv, d_bu, d_bv, timePos, d_bNew, d_buf);
-
-            // interpolate green channel
-            interpolateFrames(d_g, d_gt, d_fu, d_fv, d_bu, d_bv, timePos, d_gNew, d_buf);
-
-            // interpolate red channel
-            interpolateFrames(d_r, d_rt, d_fu, d_fv, d_bu, d_bv, timePos, d_rNew, d_buf);
-
-            GpuMat channels3[] = {d_bNew, d_gNew, d_rNew};
-            cuda::merge(channels3, 3, d_newFrame);
-
-            frames.push_back(Mat(d_newFrame));
-
-            cout << setprecision(4) << timePos * 100.0f << "%\r";
-        }
-
-        frames.push_back(frame1Color);
-
-        cout << setw(5) << "100%" << endl;
-
-        cout << "Done" << endl;
-
-        imshow("Forward flow", flowFieldForward);
-        imshow("Backward flow", flowFieldBackward);
-
-        int currentFrame = 0;
-
-        imshow("Interpolated frame", frames[currentFrame]);
-
-        for(;;)
-        {
-            int key = toupper(waitKey(10) & 0xff);
-
-            switch (key)
-            {
-            case 27:
-                return 0;
-
-            case 'A':
-                if (currentFrame > 0)
-                    --currentFrame;
-
-                imshow("Interpolated frame", frames[currentFrame]);
-                break;
-
-            case 'S':
-                if (currentFrame < static_cast<int>(frames.size()) - 1)
-                    ++currentFrame;
-
-                imshow("Interpolated frame", frames[currentFrame]);
-                break;
-            }
-        }
-    }
-    catch (const exception& ex)
-    {
-        cerr << ex.what() << endl;
-        return -1;
-    }
-    catch (...)
-    {
-        cerr << "Unknow error" << endl;
-        return -1;
-    }
-}
-
-template <typename T> inline T clamp (T x, T a, T b)
-{
-    return ((x) > (a) ? ((x) < (b) ? (x) : (b)) : (a));
-}
-
-template <typename T> inline T mapValue(T x, T a, T b, T c, T d)
-{
-    x = clamp(x, a, b);
-    return c + (d - c) * (x - a) / (b - a);
-}
-
-void getFlowField(const Mat& u, const Mat& v, Mat& flowField)
-{
-    float maxDisplacement = 1.0f;
-
-    for (int i = 0; i < u.rows; ++i)
-    {
-        const float* ptr_u = u.ptr<float>(i);
-        const float* ptr_v = v.ptr<float>(i);
-
-        for (int j = 0; j < u.cols; ++j)
-        {
-            float d = max(fabsf(ptr_u[j]), fabsf(ptr_v[j]));
-
-            if (d > maxDisplacement)
-                maxDisplacement = d;
-        }
-    }
-
-    flowField.create(u.size(), CV_8UC4);
-
-    for (int i = 0; i < flowField.rows; ++i)
-    {
-        const float* ptr_u = u.ptr<float>(i);
-        const float* ptr_v = v.ptr<float>(i);
-
-
-        Vec4b* row = flowField.ptr<Vec4b>(i);
-
-        for (int j = 0; j < flowField.cols; ++j)
-        {
-            row[j][0] = 0;
-            row[j][1] = static_cast<unsigned char> (mapValue (-ptr_v[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
-            row[j][2] = static_cast<unsigned char> (mapValue ( ptr_u[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
-            row[j][3] = 255;
-        }
-    }
-}
--- a/samples/gpu/farneback_optical_flow.cpp
+++ b/samples/gpu/farneback_optical_flow.cpp
@ -7,6 +7,7 @@
 #include "opencv2/highgui.hpp"
 #include "opencv2/video.hpp"
 #include "opencv2/cudaoptflow.hpp"
+#include "opencv2/cudaarithm.hpp"

 using namespace std;
 using namespace cv;
@ -70,8 +71,8 @@ int main(int argc, char **argv)
    if (frameL.empty() || frameR.empty()) return -1;

    GpuMat d_frameL(frameL), d_frameR(frameR);
-    GpuMat d_flowx, d_flowy;
-    FarnebackOpticalFlow d_calc;
+    GpuMat d_flow;
+    Ptr<cuda::FarnebackOpticalFlow> d_calc = cuda::FarnebackOpticalFlow::create();
    Mat flowxy, flowx, flowy, image;

    bool running = true, gpuMode = true;
@ -86,17 +87,21 @@ int main(int argc, char **argv)
        if (gpuMode)
        {
            tc0 = getTickCount();
-            d_calc(d_frameL, d_frameR, d_flowx, d_flowy);
+            d_calc->calc(d_frameL, d_frameR, d_flow);
            tc1 = getTickCount();
-            d_flowx.download(flowx);
-            d_flowy.download(flowy);
+
+            GpuMat planes[2];
+            cuda::split(d_flow, planes);
+
+            planes[0].download(flowx);
+            planes[1].download(flowy);
        }
        else
        {
            tc0 = getTickCount();
            calcOpticalFlowFarneback(
-                        frameL, frameR, flowxy, d_calc.pyrScale, d_calc.numLevels, d_calc.winSize,
-                        d_calc.numIters, d_calc.polyN, d_calc.polySigma, d_calc.flags);
+                        frameL, frameR, flowxy, d_calc->getPyrScale(), d_calc->getNumLevels(), d_calc->getWinSize(),
+                        d_calc->getNumIters(), d_calc->getPolyN(), d_calc->getPolySigma(), d_calc->getFlags());
            tc1 = getTickCount();

            Mat planes[] = {flowx, flowy};
--- a/samples/gpu/optical_flow.cpp
+++ b/samples/gpu/optical_flow.cpp
@ -5,6 +5,7 @@
 #include <opencv2/core/utility.hpp>
 #include "opencv2/highgui.hpp"
 #include "opencv2/cudaoptflow.hpp"
+#include "opencv2/cudaarithm.hpp"

 using namespace std;
 using namespace cv;
@ -122,10 +123,13 @@ static void drawOpticalFlow(const Mat_<float>& flowx, const Mat_<float>& flowy,
    }
 }

-static void showFlow(const char* name, const GpuMat& d_flowx, const GpuMat& d_flowy)
+static void showFlow(const char* name, const GpuMat& d_flow)
 {
-    Mat flowx(d_flowx);
-    Mat flowy(d_flowy);
+    GpuMat planes[2];
+    cuda::split(d_flow, planes);
+
+    Mat flowx(planes[0]);
+    Mat flowy(planes[1]);

    Mat out;
    drawOpticalFlow(flowx, flowy, out, 10);
@ -171,14 +175,12 @@ int main(int argc, const char* argv[])
    GpuMat d_frame0(frame0);
    GpuMat d_frame1(frame1);

-    GpuMat d_flowx(frame0.size(), CV_32FC1);
-    GpuMat d_flowy(frame0.size(), CV_32FC1);
+    GpuMat d_flow(frame0.size(), CV_32FC2);

-    BroxOpticalFlow brox(0.197f, 50.0f, 0.8f, 10, 77, 10);
-    PyrLKOpticalFlow lk; lk.winSize = Size(7, 7);
-    FarnebackOpticalFlow farn;
-    OpticalFlowDual_TVL1_CUDA tvl1;
-    FastOpticalFlowBM fastBM;
+    Ptr<cuda::BroxOpticalFlow> brox = cuda::BroxOpticalFlow::create(0.197f, 50.0f, 0.8f, 10, 77, 10);
+    Ptr<cuda::DensePyrLKOpticalFlow> lk = cuda::DensePyrLKOpticalFlow::create(Size(7, 7));
+    Ptr<cuda::FarnebackOpticalFlow> farn = cuda::FarnebackOpticalFlow::create();
+    Ptr<cuda::OpticalFlowDual_TVL1> tvl1 = cuda::OpticalFlowDual_TVL1::create();

    {
        GpuMat d_frame0f;
@ -189,68 +191,45 @@ int main(int argc, const char* argv[])

        const int64 start = getTickCount();

-        brox(d_frame0f, d_frame1f, d_flowx, d_flowy);
+        brox->calc(d_frame0f, d_frame1f, d_flow);

        const double timeSec = (getTickCount() - start) / getTickFrequency();
        cout << "Brox : " << timeSec << " sec" << endl;

-        showFlow("Brox", d_flowx, d_flowy);
+        showFlow("Brox", d_flow);
    }

    {
        const int64 start = getTickCount();

-        lk.dense(d_frame0, d_frame1, d_flowx, d_flowy);
+        lk->calc(d_frame0, d_frame1, d_flow);

        const double timeSec = (getTickCount() - start) / getTickFrequency();
        cout << "LK : " << timeSec << " sec" << endl;

-        showFlow("LK", d_flowx, d_flowy);
+        showFlow("LK", d_flow);
    }

    {
        const int64 start = getTickCount();

-        farn(d_frame0, d_frame1, d_flowx, d_flowy);
+        farn->calc(d_frame0, d_frame1, d_flow);

        const double timeSec = (getTickCount() - start) / getTickFrequency();
        cout << "Farn : " << timeSec << " sec" << endl;

-        showFlow("Farn", d_flowx, d_flowy);
+        showFlow("Farn", d_flow);
    }

    {
        const int64 start = getTickCount();

-        tvl1(d_frame0, d_frame1, d_flowx, d_flowy);
+        tvl1->calc(d_frame0, d_frame1, d_flow);

        const double timeSec = (getTickCount() - start) / getTickFrequency();
        cout << "TVL1 : " << timeSec << " sec" << endl;

-        showFlow("TVL1", d_flowx, d_flowy);
-    }
-
-    {
-        const int64 start = getTickCount();
-
-        GpuMat buf;
-        calcOpticalFlowBM(d_frame0, d_frame1, Size(7, 7), Size(1, 1), Size(21, 21), false, d_flowx, d_flowy, buf);
-
-        const double timeSec = (getTickCount() - start) / getTickFrequency();
-        cout << "BM : " << timeSec << " sec" << endl;
-
-        showFlow("BM", d_flowx, d_flowy);
-    }
-
-    {
-        const int64 start = getTickCount();
-
-        fastBM(d_frame0, d_frame1, d_flowx, d_flowy);
-
-        const double timeSec = (getTickCount() - start) / getTickFrequency();
-        cout << "Fast BM : " << timeSec << " sec" << endl;
-
-        showFlow("Fast BM", d_flowx, d_flowy);
+        showFlow("TVL1", d_flow);
    }

    imshow("Frame 0", frame0);
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@ -1187,87 +1187,6 @@ TEST(GoodFeaturesToTrack)
    CUDA_OFF;
 }

-TEST(PyrLKOpticalFlow)
-{
-    Mat frame0 = imread(abspath("../data/rubberwhale1.png"));
-    if (frame0.empty()) throw runtime_error("can't open ../data/rubberwhale1.png");
-
-    Mat frame1 = imread(abspath("../data/rubberwhale2.png"));
-    if (frame1.empty()) throw runtime_error("can't open ../data/rubberwhale2.png");
-
-    Mat gray_frame;
-    cvtColor(frame0, gray_frame, COLOR_BGR2GRAY);
-
-    for (int points = 1000; points <= 8000; points *= 2)
-    {
-        SUBTEST << points;
-
-        vector<Point2f> pts;
-        goodFeaturesToTrack(gray_frame, pts, points, 0.01, 0.0);
-
-        vector<Point2f> nextPts;
-        vector<unsigned char> status;
-
-        vector<float> err;
-
-        calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
-
-        CPU_ON;
-        calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
-        CPU_OFF;
-
-        cuda::PyrLKOpticalFlow d_pyrLK;
-
-        cuda::GpuMat d_frame0(frame0);
-        cuda::GpuMat d_frame1(frame1);
-
-        cuda::GpuMat d_pts;
-        Mat pts_mat(1, (int)pts.size(), CV_32FC2, (void*)&pts[0]);
-        d_pts.upload(pts_mat);
-
-        cuda::GpuMat d_nextPts;
-        cuda::GpuMat d_status;
-        cuda::GpuMat d_err;
-
-        d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
-
-        CUDA_ON;
-        d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
-        CUDA_OFF;
-    }
-}
-
-
-TEST(FarnebackOpticalFlow)
-{
-    const string datasets[] = {"../data/rubberwhale", "../data/basketball"};
-    for (size_t i = 0; i < sizeof(datasets)/sizeof(*datasets); ++i) {
-    for (int fastPyramids = 0; fastPyramids < 2; ++fastPyramids) {
-    for (int useGaussianBlur = 0; useGaussianBlur < 2; ++useGaussianBlur) {
-
-    SUBTEST << "dataset=" << datasets[i] << ", fastPyramids=" << fastPyramids << ", useGaussianBlur=" << useGaussianBlur;
-    Mat frame0 = imread(abspath(datasets[i] + "1.png"), IMREAD_GRAYSCALE);
-    Mat frame1 = imread(abspath(datasets[i] + "2.png"), IMREAD_GRAYSCALE);
-    if (frame0.empty()) throw runtime_error("can't open " + datasets[i] + "1.png");
-    if (frame1.empty()) throw runtime_error("can't open " + datasets[i] + "2.png");
-
-    cuda::FarnebackOpticalFlow calc;
-    calc.fastPyramids = fastPyramids != 0;
-    calc.flags |= useGaussianBlur ? OPTFLOW_FARNEBACK_GAUSSIAN : 0;
-
-    cuda::GpuMat d_frame0(frame0), d_frame1(frame1), d_flowx, d_flowy;
-    CUDA_ON;
-    calc(d_frame0, d_frame1, d_flowx, d_flowy);
-    CUDA_OFF;
-
-    Mat flow;
-    CPU_ON;
-    calcOpticalFlowFarneback(frame0, frame1, flow, calc.pyrScale, calc.numLevels, calc.winSize, calc.numIters, calc.polyN, calc.polySigma, calc.flags);
-    CPU_OFF;
-
-    }}}
-}
-
 #ifdef HAVE_OPENCV_BGSEGM

 TEST(MOG)
--- a/samples/gpu/pyrlk_optical_flow.cpp
+++ b/samples/gpu/pyrlk_optical_flow.cpp
@ -77,44 +77,6 @@ template <typename T> inline T mapValue(T x, T a, T b, T c, T d)
    return c + (d - c) * (x - a) / (b - a);
 }

-static void getFlowField(const Mat& u, const Mat& v, Mat& flowField)
-{
-    float maxDisplacement = 1.0f;
-
-    for (int i = 0; i < u.rows; ++i)
-    {
-        const float* ptr_u = u.ptr<float>(i);
-        const float* ptr_v = v.ptr<float>(i);
-
-        for (int j = 0; j < u.cols; ++j)
-        {
-            float d = max(fabsf(ptr_u[j]), fabsf(ptr_v[j]));
-
-            if (d > maxDisplacement)
-                maxDisplacement = d;
-        }
-    }
-
-    flowField.create(u.size(), CV_8UC4);
-
-    for (int i = 0; i < flowField.rows; ++i)
-    {
-        const float* ptr_u = u.ptr<float>(i);
-        const float* ptr_v = v.ptr<float>(i);
-
-
-        Vec4b* row = flowField.ptr<Vec4b>(i);
-
-        for (int j = 0; j < flowField.cols; ++j)
-        {
-            row[j][0] = 0;
-            row[j][1] = static_cast<unsigned char> (mapValue (-ptr_v[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
-            row[j][2] = static_cast<unsigned char> (mapValue ( ptr_u[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
-            row[j][3] = 255;
-        }
-    }
-}
-
 int main(int argc, const char* argv[])
 {
    const char* keys =
@ -186,12 +148,8 @@ int main(int argc, const char* argv[])

    // Sparse

-    PyrLKOpticalFlow d_pyrLK;
-
-    d_pyrLK.winSize.width = winSize;
-    d_pyrLK.winSize.height = winSize;
-    d_pyrLK.maxLevel = maxLevel;
-    d_pyrLK.iters = iters;
+    Ptr<cuda::SparsePyrLKOpticalFlow> d_pyrLK = cuda::SparsePyrLKOpticalFlow::create(
+                Size(winSize, winSize), maxLevel, iters);

    GpuMat d_frame0(frame0);
    GpuMat d_frame1(frame1);
@ -199,7 +157,7 @@ int main(int argc, const char* argv[])
    GpuMat d_nextPts;
    GpuMat d_status;

-    d_pyrLK.sparse(useGray ? d_frame0Gray : d_frame0, useGray ? d_frame1Gray : d_frame1, d_prevPts, d_nextPts, d_status);
+    d_pyrLK->calc(useGray ? d_frame0Gray : d_frame0, useGray ? d_frame1Gray : d_frame1, d_prevPts, d_nextPts, d_status);

    // Draw arrows

@ -216,20 +174,6 @@ int main(int argc, const char* argv[])

    imshow("PyrLK [Sparse]", frame0);

-    // Dense
-
-    GpuMat d_u;
-    GpuMat d_v;
-
-    d_pyrLK.dense(d_frame0Gray, d_frame1Gray, d_u, d_v);
-
-    // Draw flow field
-
-    Mat flowField;
-    getFlowField(Mat(d_u), Mat(d_v), flowField);
-
-    imshow("PyrLK [Dense] Flow Field", flowField);
-
    waitKey();

    return 0;