diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp
index d1fd1501ee..1ce35aabf3 100644
--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@@ -214,6 +214,8 @@ namespace cv { namespace gpu
     CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m);
     CV_EXPORTS void ensureSizeIsEnough(Size size, int type, GpuMat& m);
 
+    CV_EXPORTS GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat &mat);
+
     ////////////////////////////////////////////////////////////////////////
     // Error handling
 
@@ -459,6 +461,13 @@ namespace cv { namespace gpu
         else
             m.create(rows, cols, type);
     }
+
+    inline GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat &mat)
+    {
+        if (!mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols)
+            return mat(Rect(0, 0, cols, rows));
+        return mat = GpuMat(rows, cols, type);
+    }
 }}
 
 #endif // __cplusplus
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 39836bea39..20ce90239f 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1819,6 +1819,70 @@ private:
     vector<GpuMat> vPyr_;
 };
 
+
+class CV_EXPORTS FarnebackOpticalFlow
+{
+public:
+    FarnebackOpticalFlow()
+    {
+        numLevels = 5;
+        pyrScale = 0.5;
+        fastPyramids = false;
+        winSize = 13;
+        numIters = 10;
+        polyN = 5;
+        polySigma = 1.1;
+        flags = 0;
+    }
+
+    int numLevels;
+    double pyrScale;
+    bool fastPyramids;
+    int winSize;
+    int numIters;
+    int polyN;
+    double polySigma;
+    int flags;
+
+    void operator ()(const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s = Stream::Null());
+
+    void releaseMemory()
+    {
+        frames_[0].release();
+        frames_[1].release();
+        I_[0].release();
+        I_[1].release();
+        M_.release();
+        bufM_.release();
+        R_[0].release();
+        R_[1].release();
+        tmp_[0].release();
+        tmp_[1].release();
+        pyramid0_.clear();
+        pyramid1_.clear();
+    }
+
+private:
+    void prepareGaussian(
+            int n, double sigma, float *g, float *xg, float *xxg,
+            double &ig11, double &ig03, double &ig33, double &ig55);
+
+    void setPolynomialExpansionConsts(int n, double sigma);
+
+    void updateFlow_boxFilter(
+            const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
+            GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
+
+    void updateFlow_gaussianBlur(
+            const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
+            GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
+
+    GpuMat frames_[2];
+    GpuMat I_[2], M_, bufM_, R_[2], tmp_[2];
+    std::vector<GpuMat> pyramid0_, pyramid1_;
+};
+
+
 //! Interpolate frames (images) using provided optical flow (displacement field).
 //! frame0   - frame 0 (32-bit floating point images, single channel)
 //! frame1   - frame 1 (the same type and size)
diff --git a/modules/gpu/src/cudastream.cpp b/modules/gpu/src/cudastream.cpp
index 11ccda4452..569b460eab 100644
--- a/modules/gpu/src/cudastream.cpp
+++ b/modules/gpu/src/cudastream.cpp
@@ -81,6 +81,7 @@ namespace cv { namespace gpu
 
 struct Stream::Impl
 {
+    static cudaStream_t getStream(const Impl* impl) { return impl ? impl->stream : 0; }
     cudaStream_t stream;
     int ref_counter;
 };
@@ -95,7 +96,10 @@ namespace
     };
 }
 
-CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream) { return stream.impl ? stream.impl->stream : 0; };
+CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream)
+{
+    return Stream::Impl::getStream(stream.impl);
+};
 
 void cv::gpu::Stream::create()
 {
@@ -143,7 +147,7 @@ Stream& cv::gpu::Stream::operator=(const Stream& stream)
 
 bool cv::gpu::Stream::queryIfComplete()
 {
-    cudaError_t err = cudaStreamQuery( impl->stream );
+    cudaError_t err = cudaStreamQuery( Impl::getStream(impl) );
 
     if (err == cudaErrorNotReady || err == cudaSuccess)
         return err == cudaSuccess;
@@ -152,19 +156,19 @@ bool cv::gpu::Stream::queryIfComplete()
     return false;
 }
 
-void cv::gpu::Stream::waitForCompletion() { cudaSafeCall( cudaStreamSynchronize( impl->stream ) ); }
+void cv::gpu::Stream::waitForCompletion() { cudaSafeCall( cudaStreamSynchronize( Impl::getStream(impl) ) ); }
 
 void cv::gpu::Stream::enqueueDownload(const GpuMat& src, Mat& dst)
 {
     // if not -> allocation will be done, but after that dst will not point to page locked memory
     CV_Assert(src.cols == dst.cols && src.rows == dst.rows && src.type() == dst.type() );
-    devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost);
+    devcopy(src, dst, Impl::getStream(impl), cudaMemcpyDeviceToHost);
 }
-void cv::gpu::Stream::enqueueDownload(const GpuMat& src, CudaMem& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost); }
+void cv::gpu::Stream::enqueueDownload(const GpuMat& src, CudaMem& dst) { devcopy(src, dst, Impl::getStream(impl), cudaMemcpyDeviceToHost); }
 
-void cv::gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst){ devcopy(src, dst, impl->stream,   cudaMemcpyHostToDevice); }
-void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst)  { devcopy(src, dst, impl->stream,   cudaMemcpyHostToDevice); }
-void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToDevice); }
+void cv::gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst){ devcopy(src, dst, Impl::getStream(impl),   cudaMemcpyHostToDevice); }
+void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst)  { devcopy(src, dst, Impl::getStream(impl),   cudaMemcpyHostToDevice); }
+void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src, dst, Impl::getStream(impl), cudaMemcpyDeviceToDevice); }
 
 void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar s)
 {
@@ -173,7 +177,7 @@ void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar s)
 
     if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
     {
-        cudaSafeCall( cudaMemset2DAsync(src.data, src.step, 0, src.cols * src.elemSize(), src.rows, impl->stream) );
+        cudaSafeCall( cudaMemset2DAsync(src.data, src.step, 0, src.cols * src.elemSize(), src.rows, Impl::getStream(impl)) );
         return;
     }
     if (src.depth() == CV_8U)
@@ -183,12 +187,12 @@ void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar s)
         if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
         {
             int val = saturate_cast<uchar>(s[0]);
-            cudaSafeCall( cudaMemset2DAsync(src.data, src.step, val, src.cols * src.elemSize(), src.rows, impl->stream) );
+            cudaSafeCall( cudaMemset2DAsync(src.data, src.step, val, src.cols * src.elemSize(), src.rows, Impl::getStream(impl)) );
             return;
         }
     }
 
-    setTo(src, s, impl->stream);
+    setTo(src, s, Impl::getStream(impl));
 }
 
 void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
@@ -198,7 +202,7 @@ void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
 
     CV_Assert(mask.type() == CV_8UC1);
 
-    setTo(src, val, mask, impl->stream);
+    setTo(src, val, mask, Impl::getStream(impl));
 }
 
 void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype, double alpha, double beta)
@@ -226,7 +230,7 @@ void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype,
         psrc = &(temp = src);
 
     dst.create( src.size(), rtype );
-    convertTo(src, dst, alpha, beta, impl->stream);
+    convertTo(src, dst, alpha, beta, Impl::getStream(impl));
 }
 
 cv::gpu::Stream::operator bool() const
diff --git a/modules/gpu/src/precomp.hpp b/modules/gpu/src/precomp.hpp
index b8cf9cf366..d796bc355e 100644
--- a/modules/gpu/src/precomp.hpp
+++ b/modules/gpu/src/precomp.hpp
@@ -65,6 +65,7 @@
 #include "opencv2/imgproc/imgproc.hpp"
 #include "opencv2/calib3d/calib3d.hpp"
 #include "opencv2/core/internal.hpp"
+#include "opencv2/video/video.hpp"
 
 #define OPENCV_GPU_UNUSED(x) (void)x
 
diff --git a/modules/gpu/test/test_precomp.hpp b/modules/gpu/test/test_precomp.hpp
index 8d9cc43f59..8826457162 100644
--- a/modules/gpu/test/test_precomp.hpp
+++ b/modules/gpu/test/test_precomp.hpp
@@ -44,6 +44,7 @@
 
 #include <cmath>
 #include <cstdio>
+#include <iostream>
 #include <fstream>
 #include <sstream>
 #include <limits>
diff --git a/modules/gpu/test/test_video.cpp b/modules/gpu/test/test_video.cpp
index b29036c2b0..9cda85111d 100644
--- a/modules/gpu/test/test_video.cpp
+++ b/modules/gpu/test/test_video.cpp
@@ -423,3 +423,74 @@ TEST_P(PyrLKOpticalFlowSparse, Accuracy)
 INSTANTIATE_TEST_CASE_P(Video, PyrLKOpticalFlowSparse, Combine(ALL_DEVICES, Bool()));
 
 #endif // HAVE_CUDA
+
+
+PARAM_TEST_CASE(FarnebackOpticalFlowTest, cv::gpu::DeviceInfo, double, int, int, bool)
+{
+    Mat frame0, frame1;
+
+    double pyrScale;
+    int polyN;
+    double polySigma;
+    int flags;
+    bool useInitFlow;
+
+    virtual void SetUp()
+    {
+        frame0 = readImage("opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
+        frame1 = readImage("opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
+        ASSERT_FALSE(frame0.empty()); ASSERT_FALSE(frame1.empty());
+
+        cv::gpu::setDevice(GET_PARAM(0).deviceID());
+
+        pyrScale = GET_PARAM(1);
+        polyN = GET_PARAM(2);
+        polySigma = polyN <= 5 ? 1.1 : 1.5;
+        flags = GET_PARAM(3);
+        useInitFlow = GET_PARAM(4);
+    }
+};
+
+TEST_P(FarnebackOpticalFlowTest, Accuracy)
+{
+    using namespace cv;
+
+    gpu::FarnebackOpticalFlow calc;
+    calc.pyrScale = pyrScale;
+    calc.polyN = polyN;
+    calc.polySigma = polySigma;
+    calc.flags = flags;
+
+    gpu::GpuMat d_flowx, d_flowy;
+    calc(gpu::GpuMat(frame0), gpu::GpuMat(frame1), d_flowx, d_flowy);
+
+    Mat flow;
+    if (useInitFlow)
+    {
+        Mat flowxy[] = {(Mat)d_flowx, (Mat)d_flowy};
+        merge(flowxy, 2, flow);
+    }
+
+    if (useInitFlow)
+    {
+        calc.flags |= OPTFLOW_USE_INITIAL_FLOW;
+        calc(gpu::GpuMat(frame0), gpu::GpuMat(frame1), d_flowx, d_flowy);
+    }
+
+    calcOpticalFlowFarneback(
+                frame0, frame1, flow, calc.pyrScale, calc.numLevels, calc.winSize,
+                calc.numIters,  calc.polyN, calc.polySigma, calc.flags);
+
+    std::vector<Mat> flowxy; split(flow, flowxy);
+    /*std::cout << checkSimilarity(flowxy[0], (Mat)d_flowx) << " "
+              << checkSimilarity(flowxy[1], (Mat)d_flowy) << std::endl;*/
+    EXPECT_LT(checkSimilarity(flowxy[0], (Mat)d_flowx), 0.1);
+    EXPECT_LT(checkSimilarity(flowxy[1], (Mat)d_flowy), 0.1);
+}
+
+INSTANTIATE_TEST_CASE_P(Video, FarnebackOpticalFlowTest,
+                        Combine(ALL_DEVICES,
+                                Values(0.3, 0.5, 0.8),
+                                Values(5, 7),
+                                Values(0, (int)cv::OPTFLOW_FARNEBACK_GAUSSIAN),
+                                Values(false, true)));
diff --git a/samples/gpu/basketball1.png b/samples/gpu/basketball1.png
new file mode 100644
index 0000000000..53b2dbaad1
Binary files /dev/null and b/samples/gpu/basketball1.png differ
diff --git a/samples/gpu/basketball2.png b/samples/gpu/basketball2.png
new file mode 100644
index 0000000000..1d069b965c
Binary files /dev/null and b/samples/gpu/basketball2.png differ
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index 57583c4052..1a99fa8b31 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -1183,3 +1183,36 @@ TEST(PyrLKOpticalFlow)
         GPU_OFF;
     }
 }
+
+
+TEST(FarnebackOpticalFlow)
+{
+    const string names[] = {"rubberwhale", "basketball"};
+    for (size_t i = 0; i < sizeof(names)/sizeof(*names); ++i) {
+    for (int fastPyramids = 0; fastPyramids < 2; ++fastPyramids) {
+    for (int useGaussianBlur = 0; useGaussianBlur < 2; ++useGaussianBlur) {
+
+    SUBTEST << "dataset=" << names[i] << ", fastPyramids=" << fastPyramids << ", useGaussianBlur=" << useGaussianBlur;
+    Mat frame0 = imread(abspath(names[i] + "1.png"), IMREAD_GRAYSCALE);
+    Mat frame1 = imread(abspath(names[i] + "2.png"), IMREAD_GRAYSCALE);
+    if (frame0.empty()) throw runtime_error("can't open " + names[i] + "1.png");
+    if (frame1.empty()) throw runtime_error("can't open " + names[i] + "2.png");
+
+    gpu::FarnebackOpticalFlow calc;
+    calc.fastPyramids = fastPyramids;
+    calc.flags |= useGaussianBlur ? OPTFLOW_FARNEBACK_GAUSSIAN : 0;
+
+    gpu::GpuMat d_frame0(frame0), d_frame1(frame1), d_flowx, d_flowy;
+    calc(d_frame0, d_frame1, d_flowx, d_flowy);
+    GPU_ON;
+    calc(d_frame0, d_frame1, d_flowx, d_flowy);
+    GPU_OFF;
+
+    Mat flow;
+    calcOpticalFlowFarneback(frame0, frame1, flow, calc.pyrScale, calc.numLevels, calc.winSize, calc.numIters, calc.polyN, calc.polySigma, calc.flags);
+    CPU_ON;
+    calcOpticalFlowFarneback(frame0, frame1, flow, calc.pyrScale, calc.numLevels, calc.winSize, calc.numIters, calc.polyN, calc.polySigma, calc.flags);
+    CPU_OFF;
+
+    }}}
+}