diff --git a/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp b/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
index 7882a8e622..6ea75594d2 100644
--- a/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
+++ b/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
@@ -61,49 +61,94 @@ namespace cv { namespace cuda {
 //! @addtogroup cudaoptflow
 //! @{
 
-/** @brief Class computing the optical flow for two images using Brox et al Optical Flow algorithm
-(@cite Brox2004). :
+//
+// Interface
+//
+
+/** @brief Base interface for dense optical flow algorithms.
  */
-class CV_EXPORTS BroxOpticalFlow
+class CV_EXPORTS DenseOpticalFlow : public Algorithm
 {
 public:
-    BroxOpticalFlow(float alpha_, float gamma_, float scale_factor_, int inner_iterations_, int outer_iterations_, int solver_iterations_) :
-        alpha(alpha_), gamma(gamma_), scale_factor(scale_factor_),
-        inner_iterations(inner_iterations_), outer_iterations(outer_iterations_), solver_iterations(solver_iterations_)
-    {
-    }
+    /** @brief Calculates a dense optical flow.
 
-    //! Compute optical flow
-    //! frame0 - source frame (supports only CV_32FC1 type)
-    //! frame1 - frame to track (with the same size and type as frame0)
-    //! u - flow horizontal component (along x axis)
-    //! v - flow vertical component (along y axis)
-    void operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& stream = Stream::Null());
-
-    //! flow smoothness
-    float alpha;
-
-    //! gradient constancy importance
-    float gamma;
-
-    //! pyramid scale factor
-    float scale_factor;
-
-    //! number of lagged non-linearity iterations (inner loop)
-    int inner_iterations;
-
-    //! number of warping iterations (number of pyramid levels)
-    int outer_iterations;
-
-    //! number of linear system solver iterations
-    int solver_iterations;
-
-    GpuMat buf;
+    @param I0 first input image.
+    @param I1 second input image of the same size and the same type as I0.
+    @param flow computed flow image that has the same size as I0 and type CV_32FC2.
+    @param stream Stream for the asynchronous version.
+     */
+    virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow, Stream& stream = Stream::Null()) = 0;
 };
 
-/** @brief Class used for calculating an optical flow.
+/** @brief Base interface for sparse optical flow algorithms.
+ */
+class CV_EXPORTS SparseOpticalFlow : public Algorithm
+{
+public:
+    /** @brief Calculates a sparse optical flow.
 
-The class can calculate an optical flow for a sparse feature set or dense optical flow using the
+    @param prevImg First input image.
+    @param nextImg Second input image of the same size and the same type as prevImg.
+    @param prevPts Vector of 2D points for which the flow needs to be found.
+    @param nextPts Output vector of 2D points containing the calculated new positions of input features in the second image.
+    @param status Output status vector. Each element of the vector is set to 1 if the
+                  flow for the corresponding features has been found. Otherwise, it is set to 0.
+    @param err Optional output vector that contains error response for each point (inverse confidence).
+    @param stream Stream for the asynchronous version.
+     */
+    virtual void calc(InputArray prevImg, InputArray nextImg,
+                      InputArray prevPts, InputOutputArray nextPts,
+                      OutputArray status,
+                      OutputArray err = cv::noArray(),
+                      Stream& stream = Stream::Null()) = 0;
+};
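[Reviewer note, not part of the patch] A minimal usage sketch of the new SparseOpticalFlow interface, assuming 8-bit grayscale host frames and caller-supplied feature points; SparsePyrLKOpticalFlow is the concrete implementation declared further down in this header:

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/cudaoptflow.hpp>

    void trackPoints(const cv::Mat& frame0, const cv::Mat& frame1,
                     const std::vector<cv::Point2f>& pts)
    {
        using namespace cv;

        cuda::GpuMat d_frame0(frame0), d_frame1(frame1);

        // prevPts is expected as a single-row CV_32FC2 matrix on the device.
        Mat ptsMat(1, static_cast<int>(pts.size()), CV_32FC2,
                   const_cast<Point2f*>(pts.data()));
        cuda::GpuMat d_prevPts(ptsMat);

        Ptr<cuda::SparsePyrLKOpticalFlow> lk = cuda::SparsePyrLKOpticalFlow::create();

        cuda::GpuMat d_nextPts, d_status;
        lk->calc(d_frame0, d_frame1, d_prevPts, d_nextPts, d_status);

        // nextPts and status come back as single-row matrices as well.
        Mat nextPts, status;
        d_nextPts.download(nextPts);
        d_status.download(status);
    }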
+
+//
+// BroxOpticalFlow
+//
+
+/** @brief Class computing the optical flow for two images using the Brox et al. optical flow
+algorithm (@cite Brox2004).
+ */
+class CV_EXPORTS BroxOpticalFlow : public DenseOpticalFlow
+{
+public:
+    virtual double getFlowSmoothness() const = 0;
+    virtual void setFlowSmoothness(double alpha) = 0;
+
+    virtual double getGradientConstancyImportance() const = 0;
+    virtual void setGradientConstancyImportance(double gamma) = 0;
+
+    virtual double getPyramidScaleFactor() const = 0;
+    virtual void setPyramidScaleFactor(double scale_factor) = 0;
+
+    //! number of lagged non-linearity iterations (inner loop)
+    virtual int getInnerIterations() const = 0;
+    virtual void setInnerIterations(int inner_iterations) = 0;
+
+    //! number of warping iterations (number of pyramid levels)
+    virtual int getOuterIterations() const = 0;
+    virtual void setOuterIterations(int outer_iterations) = 0;
+
+    //! number of linear system solver iterations
+    virtual int getSolverIterations() const = 0;
+    virtual void setSolverIterations(int solver_iterations) = 0;
+
+    static Ptr<BroxOpticalFlow> create(
+            double alpha = 0.197,
+            double gamma = 50.0,
+            double scale_factor = 0.8,
+            int inner_iterations = 5,
+            int outer_iterations = 150,
+            int solver_iterations = 10);
+};
+
+//
+// PyrLKOpticalFlow
+//
+
+/** @brief Class used for calculating a sparse optical flow.
+
+The class can calculate an optical flow for a sparse feature set using the
 iterative Lucas-Kanade method with pyramids.
 
 @sa calcOpticalFlowPyrLK
@@ -112,158 +157,116 @@ iterative Lucas-Kanade method with pyramids.
 
 -   An example of the Lucas Kanade optical flow algorithm can be found at
     opencv_source_code/samples/gpu/pyrlk_optical_flow.cpp
  */
-class CV_EXPORTS PyrLKOpticalFlow
+class CV_EXPORTS SparsePyrLKOpticalFlow : public SparseOpticalFlow
 {
 public:
-    PyrLKOpticalFlow();
+    virtual Size getWinSize() const = 0;
+    virtual void setWinSize(Size winSize) = 0;
 
-    /** @brief Calculate an optical flow for a sparse feature set.
+    virtual int getMaxLevel() const = 0;
+    virtual void setMaxLevel(int maxLevel) = 0;
 
-    @param prevImg First 8-bit input image (supports both grayscale and color images).
-    @param nextImg Second input image of the same size and the same type as prevImg .
-    @param prevPts Vector of 2D points for which the flow needs to be found. It must be one row matrix
-    with CV_32FC2 type.
-    @param nextPts Output vector of 2D points (with single-precision floating-point coordinates)
-    containing the calculated new positions of input features in the second image. When useInitialFlow
-    is true, the vector must have the same size as in the input.
-    @param status Output status vector (CV_8UC1 type). Each element of the vector is set to 1 if the
-    flow for the corresponding features has been found. Otherwise, it is set to 0.
-    @param err Output vector (CV_32FC1 type) that contains the difference between patches around the
-    original and moved points or min eigen value if getMinEigenVals is checked. It can be NULL, if not
-    needed.
+    virtual int getNumIters() const = 0;
+    virtual void setNumIters(int iters) = 0;
 
-    @sa calcOpticalFlowPyrLK
-     */
-    void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
-        GpuMat& status, GpuMat* err = 0);
+    virtual bool getUseInitialFlow() const = 0;
+    virtual void setUseInitialFlow(bool useInitialFlow) = 0;
 
-    /** @brief Calculate dense optical flow.
-
-    @param prevImg First 8-bit grayscale input image.
-    @param nextImg Second input image of the same size and the same type as prevImg .
-    @param u Horizontal component of the optical flow of the same size as input images, 32-bit
-    floating-point, single-channel
-    @param v Vertical component of the optical flow of the same size as input images, 32-bit
-    floating-point, single-channel
-    @param err Output vector (CV_32FC1 type) that contains the difference between patches around the
-    original and moved points or min eigen value if getMinEigenVals is checked. It can be NULL, if not
-    needed.
-     */
-    void dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err = 0);
-
-    /** @brief Releases inner buffers memory.
-     */
-    void releaseMemory();
-
-    Size winSize;
-    int maxLevel;
-    int iters;
-    bool useInitialFlow;
-
-private:
-    std::vector<GpuMat> prevPyr_;
-    std::vector<GpuMat> nextPyr_;
-
-    GpuMat buf_;
-
-    GpuMat uPyr_[2];
-    GpuMat vPyr_[2];
+    static Ptr<SparsePyrLKOpticalFlow> create(
+            Size winSize = Size(21, 21),
+            int maxLevel = 3,
+            int iters = 30,
+            bool useInitialFlow = false);
 };
 
+/** @brief Class used for calculating a dense optical flow.
+
+The class can calculate a dense optical flow using the
+iterative Lucas-Kanade method with pyramids.
+ */
+class CV_EXPORTS DensePyrLKOpticalFlow : public DenseOpticalFlow
+{
+public:
+    virtual Size getWinSize() const = 0;
+    virtual void setWinSize(Size winSize) = 0;
+
+    virtual int getMaxLevel() const = 0;
+    virtual void setMaxLevel(int maxLevel) = 0;
+
+    virtual int getNumIters() const = 0;
+    virtual void setNumIters(int iters) = 0;
+
+    virtual bool getUseInitialFlow() const = 0;
+    virtual void setUseInitialFlow(bool useInitialFlow) = 0;
+
+    static Ptr<DensePyrLKOpticalFlow> create(
+            Size winSize = Size(13, 13),
+            int maxLevel = 3,
+            int iters = 30,
+            bool useInitialFlow = false);
+};
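[Reviewer note, not part of the patch] The dense variants now return a single CV_32FC2 flow field instead of the separate u/v images of the old API. A sketch, assuming a CV_8UC1 device frame pair (the dense PyrLK path requires 8-bit single-channel input):

    #include <opencv2/cudaarithm.hpp>
    #include <opencv2/cudaoptflow.hpp>

    void denseFlow(const cv::cuda::GpuMat& d_frame0, const cv::cuda::GpuMat& d_frame1)
    {
        using namespace cv;

        Ptr<cuda::DensePyrLKOpticalFlow> lk = cuda::DensePyrLKOpticalFlow::create();

        cuda::GpuMat d_flow;                 // CV_32FC2, same size as the input
        lk->calc(d_frame0, d_frame1, d_flow);

        cuda::GpuMat d_planes[2];            // d_planes[0] = u, d_planes[1] = v
        cuda::split(d_flow, d_planes);
    }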
 
-/** @brief Class computing a dense optical flow using the Gunnar Farneback's algorithm. :
- */
-class CV_EXPORTS FarnebackOpticalFlow
-{
-public:
-    FarnebackOpticalFlow()
-    {
-        numLevels = 5;
-        pyrScale = 0.5;
-        fastPyramids = false;
-        winSize = 13;
-        numIters = 10;
-        polyN = 5;
-        polySigma = 1.1;
-        flags = 0;
-    }
-
-    int numLevels;
-    double pyrScale;
-    bool fastPyramids;
-    int winSize;
-    int numIters;
-    int polyN;
-    double polySigma;
-    int flags;
-
-    /** @brief Computes a dense optical flow using the Gunnar Farneback algorithm.
-
-    @param frame0 First 8-bit gray-scale input image
-    @param frame1 Second 8-bit gray-scale input image
-    @param flowx Flow horizontal component
-    @param flowy Flow vertical component
-    @param s Stream
-
-    @sa calcOpticalFlowFarneback
-     */
-    void operator ()(const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s = Stream::Null());
-
-    /** @brief Releases unused auxiliary memory buffers.
-     */
-    void releaseMemory()
-    {
-        frames_[0].release();
-        frames_[1].release();
-        pyrLevel_[0].release();
-        pyrLevel_[1].release();
-        M_.release();
-        bufM_.release();
-        R_[0].release();
-        R_[1].release();
-        blurredFrame_[0].release();
-        blurredFrame_[1].release();
-        pyramid0_.clear();
-        pyramid1_.clear();
-    }
-
-private:
-    void prepareGaussian(
-        int n, double sigma, float *g, float *xg, float *xxg,
-        double &ig11, double &ig03, double &ig33, double &ig55);
-
-    void setPolynomialExpansionConsts(int n, double sigma);
-
-    void updateFlow_boxFilter(
-        const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
-        GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
-
-    void updateFlow_gaussianBlur(
-        const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
-        GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
-
-    GpuMat frames_[2];
-    GpuMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2];
-    std::vector<GpuMat> pyramid0_, pyramid1_;
-};
-
-// Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method
+//
+// FarnebackOpticalFlow
 //
-// see reference:
-//   [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
-//   [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
-class CV_EXPORTS OpticalFlowDual_TVL1_CUDA
+
+/** @brief Class computing a dense optical flow using the Gunnar Farneback algorithm.
+ */
+class CV_EXPORTS FarnebackOpticalFlow : public DenseOpticalFlow
 {
 public:
-    OpticalFlowDual_TVL1_CUDA();
+    virtual int getNumLevels() const = 0;
+    virtual void setNumLevels(int numLevels) = 0;
 
-    void operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy);
+    virtual double getPyrScale() const = 0;
+    virtual void setPyrScale(double pyrScale) = 0;
 
-    void collectGarbage();
+    virtual bool getFastPyramids() const = 0;
+    virtual void setFastPyramids(bool fastPyramids) = 0;
 
+    virtual int getWinSize() const = 0;
+    virtual void setWinSize(int winSize) = 0;
+
+    virtual int getNumIters() const = 0;
+    virtual void setNumIters(int numIters) = 0;
+
+    virtual int getPolyN() const = 0;
+    virtual void setPolyN(int polyN) = 0;
+
+    virtual double getPolySigma() const = 0;
+    virtual void setPolySigma(double polySigma) = 0;
+
+    virtual int getFlags() const = 0;
+    virtual void setFlags(int flags) = 0;
+
+    static Ptr<FarnebackOpticalFlow> create(
+            int numLevels = 5,
+            double pyrScale = 0.5,
+            bool fastPyramids = false,
+            int winSize = 13,
+            int numIters = 10,
+            int polyN = 5,
+            double polySigma = 1.1,
+            int flags = 0);
+};
+
+//
+// OpticalFlowDual_TVL1
+//
+
+/** @brief Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method.
+ *
+ * @sa C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
+ * @sa Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
+ */
+class CV_EXPORTS OpticalFlowDual_TVL1 : public DenseOpticalFlow
+{
+public:
     /**
      * Time step of the numerical scheme.
      */
-    double tau;
+    virtual double getTau() const = 0;
+    virtual void setTau(double tau) = 0;
 
     /**
      * Weight parameter for the data term, attachment parameter.
@@ -271,7 +274,8 @@ public:
      * The smaller this parameter is, the smoother the solutions we obtain.
      * It depends on the range of motions of the images, so its value should be adapted to each image sequence.
      */
-    double lambda;
+    virtual double getLambda() const = 0;
+    virtual void setLambda(double lambda) = 0;
 
     /**
      * Weight parameter for (u - v)^2, tightness parameter.
@@ -279,20 +283,23 @@ public:
      * In theory, it should have a small value in order to maintain both parts in correspondence.
      * The method is stable for a large range of values of this parameter.
      */
+    virtual double getGamma() const = 0;
+    virtual void setGamma(double gamma) = 0;
-    double gamma;
 
     /**
-     * parameter used for motion estimation. It adds a variable allowing for illumination variations
-     * Set this parameter to 1. if you have varying illumination.
-     * See: Chambolle et al, A First-Order Primal-Dual Algorithm for Convex Problems with Applications to Imaging
-     * Journal of Mathematical imaging and vision, may 2011 Vol 40 issue 1, pp 120-145
-     */
-    double theta;
+     * Parameter used for motion estimation. It adds a variable allowing for illumination variations.
+     * Set this parameter to 1. if you have varying illumination.
+     * See: Chambolle et al, A First-Order Primal-Dual Algorithm for Convex Problems with Applications to Imaging,
+     * Journal of Mathematical Imaging and Vision, May 2011, Vol 40, issue 1, pp 120-145.
+     */
+    virtual double getTheta() const = 0;
+    virtual void setTheta(double theta) = 0;
 
     /**
      * Number of scales used to create the pyramid of images.
     */
-    int nscales;
+    virtual int getNumScales() const = 0;
+    virtual void setNumScales(int nscales) = 0;
 
     /**
      * Number of warpings per scale.
@@ -300,51 +307,39 @@ public:
     * This is a parameter that assures the stability of the method.
     * It also affects the running time, so it is a compromise between speed and accuracy.
      */
-    int warps;
+    virtual int getNumWarps() const = 0;
+    virtual void setNumWarps(int warps) = 0;
 
     /**
      * Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time.
      * A small value will yield more accurate solutions at the expense of a slower convergence.
      */
-    double epsilon;
+    virtual double getEpsilon() const = 0;
+    virtual void setEpsilon(double epsilon) = 0;
 
     /**
      * Stopping criterion iterations number used in the numerical scheme.
      */
-    int iterations;
+    virtual int getNumIterations() const = 0;
+    virtual void setNumIterations(int iterations) = 0;
 
-    double scaleStep;
+    virtual double getScaleStep() const = 0;
+    virtual void setScaleStep(double scaleStep) = 0;
 
-    bool useInitialFlow;
+    virtual bool getUseInitialFlow() const = 0;
+    virtual void setUseInitialFlow(bool useInitialFlow) = 0;
 
-private:
-    void procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2, GpuMat& u3);
-
-    std::vector<GpuMat> I0s;
-    std::vector<GpuMat> I1s;
-    std::vector<GpuMat> u1s;
-    std::vector<GpuMat> u2s;
-    std::vector<GpuMat> u3s;
-
-    GpuMat I1x_buf;
-    GpuMat I1y_buf;
-
-    GpuMat I1w_buf;
-    GpuMat I1wx_buf;
-    GpuMat I1wy_buf;
-
-    GpuMat grad_buf;
-    GpuMat rho_c_buf;
-
-    GpuMat p11_buf;
-    GpuMat p12_buf;
-    GpuMat p21_buf;
-    GpuMat p22_buf;
-    GpuMat p31_buf;
-    GpuMat p32_buf;
-
-    GpuMat diff_buf;
-    GpuMat norm_buf;
+    static Ptr<OpticalFlowDual_TVL1> create(
+            double tau = 0.25,
+            double lambda = 0.15,
+            double theta = 0.3,
+            int nscales = 5,
+            int warps = 5,
+            double epsilon = 0.01,
+            int iterations = 300,
+            double scaleStep = 0.8,
+            double gamma = 0.0,
+            bool useInitialFlow = false);
 };
 
 //! @}
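[Reviewer note, not part of the patch] The Brox implementation below asserts CV_32FC1 input, so callers typically convert 8-bit frames first. A sketch; the 1/255 scaling is this example's assumption, any consistent float range works:

    #include <opencv2/cudaoptflow.hpp>

    void broxFlow(const cv::cuda::GpuMat& d_gray0, const cv::cuda::GpuMat& d_gray1)
    {
        using namespace cv;

        cuda::GpuMat d_f0, d_f1;
        d_gray0.convertTo(d_f0, CV_32F, 1.0 / 255.0);  // 8U -> 32F
        d_gray1.convertTo(d_f1, CV_32F, 1.0 / 255.0);

        Ptr<cuda::BroxOpticalFlow> brox = cuda::BroxOpticalFlow::create();

        cuda::GpuMat d_flow;                           // CV_32FC2
        brox->calc(d_f0, d_f1, d_flow);
    }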
diff --git a/modules/cudaoptflow/src/brox.cpp b/modules/cudaoptflow/src/brox.cpp
index 39eae9a8ab..11c541906b 100644
--- a/modules/cudaoptflow/src/brox.cpp
+++ b/modules/cudaoptflow/src/brox.cpp
@@ -47,84 +47,148 @@ using namespace cv::cuda;
 
 #if !defined (HAVE_CUDA) || !defined (HAVE_OPENCV_CUDALEGACY) || defined (CUDA_DISABLER)
 
-void cv::cuda::BroxOpticalFlow::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+Ptr<BroxOpticalFlow> cv::cuda::BroxOpticalFlow::create(double, double, double, int, int, int) { throw_no_cuda(); return Ptr<BroxOpticalFlow>(); }
 
 #else
 
-namespace
-{
-    size_t getBufSize(const NCVBroxOpticalFlowDescriptor& desc, const NCVMatrix<Ncv32f>& frame0, const NCVMatrix<Ncv32f>& frame1,
-                      NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v, const cudaDeviceProp& devProp)
-    {
-        NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
-
-        ncvSafeCall( NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0) );
-
-        return gpuCounter.maxSize();
-    }
-}
-
-namespace
-{
-    static void outputHandler(const String &msg) { CV_Error(cv::Error::GpuApiCallError, msg.c_str()); }
-}
-
-void cv::cuda::BroxOpticalFlow::operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& s)
-{
-    ncvSetDebugOutputHandler(outputHandler);
-
-    CV_Assert(frame0.type() == CV_32FC1);
-    CV_Assert(frame1.size() == frame0.size() && frame1.type() == frame0.type());
-
-    u.create(frame0.size(), CV_32FC1);
-    v.create(frame0.size(), CV_32FC1);
-
-    cudaDeviceProp devProp;
-    cudaSafeCall( cudaGetDeviceProperties(&devProp, getDevice()) );
-
-    NCVBroxOpticalFlowDescriptor desc;
-
-    desc.alpha = alpha;
-    desc.gamma = gamma;
-    desc.scale_factor = scale_factor;
-    desc.number_of_inner_iterations = inner_iterations;
-    desc.number_of_outer_iterations = outer_iterations;
-    desc.number_of_solver_iterations = solver_iterations;
-
-    NCVMemSegment frame0MemSeg;
-    frame0MemSeg.begin.memtype = NCVMemoryTypeDevice;
-    frame0MemSeg.begin.ptr = const_cast<uchar*>(frame0.data);
-    frame0MemSeg.size = frame0.step * frame0.rows;
-
-    NCVMemSegment frame1MemSeg;
-    frame1MemSeg.begin.memtype = NCVMemoryTypeDevice;
-    frame1MemSeg.begin.ptr = const_cast<uchar*>(frame1.data);
-    frame1MemSeg.size = frame1.step * frame1.rows;
-
-    NCVMemSegment uMemSeg;
-    uMemSeg.begin.memtype = NCVMemoryTypeDevice;
-    uMemSeg.begin.ptr = u.ptr();
-    uMemSeg.size = u.step * u.rows;
-
-    NCVMemSegment vMemSeg;
-    vMemSeg.begin.memtype = NCVMemoryTypeDevice;
-    vMemSeg.begin.ptr = v.ptr();
-    vMemSeg.size = v.step * v.rows;
-
-    NCVMatrixReuse<Ncv32f> frame0Mat(frame0MemSeg, static_cast<Ncv32u>(devProp.textureAlignment), frame0.cols, frame0.rows, static_cast<Ncv32u>(frame0.step));
-    NCVMatrixReuse<Ncv32f> frame1Mat(frame1MemSeg, static_cast<Ncv32u>(devProp.textureAlignment), frame1.cols, frame1.rows, static_cast<Ncv32u>(frame1.step));
-    NCVMatrixReuse<Ncv32f> uMat(uMemSeg, static_cast<Ncv32u>(devProp.textureAlignment), u.cols, u.rows, static_cast<Ncv32u>(u.step));
-    NCVMatrixReuse<Ncv32f> vMat(vMemSeg, static_cast<Ncv32u>(devProp.textureAlignment), v.cols, v.rows, static_cast<Ncv32u>(v.step));
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    size_t bufSize = getBufSize(desc, frame0Mat, frame1Mat, uMat, vMat, devProp);
-
-    ensureSizeIsEnough(1, static_cast<int>(bufSize), CV_8UC1, buf);
-
-    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), buf.ptr());
-
-    ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream) );
+namespace {
+
+    class BroxOpticalFlowImpl : public BroxOpticalFlow
+    {
+    public:
+        BroxOpticalFlowImpl(double alpha, double gamma, double scale_factor,
+                            int inner_iterations, int outer_iterations, int solver_iterations) :
+            alpha_(alpha), gamma_(gamma), scale_factor_(scale_factor),
+            inner_iterations_(inner_iterations), outer_iterations_(outer_iterations),
+            solver_iterations_(solver_iterations)
+        {
+        }
+
+        virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow, Stream& stream);
+
+        virtual double getFlowSmoothness() const { return alpha_; }
+        virtual void setFlowSmoothness(double alpha) { alpha_ = static_cast<float>(alpha); }
+
+        virtual double getGradientConstancyImportance() const { return gamma_; }
+        virtual void setGradientConstancyImportance(double gamma) { gamma_ = static_cast<float>(gamma); }
+
+        virtual double getPyramidScaleFactor() const { return scale_factor_; }
+        virtual void setPyramidScaleFactor(double scale_factor) { scale_factor_ = static_cast<float>(scale_factor); }
+
+        //! number of lagged non-linearity iterations (inner loop)
+        virtual int getInnerIterations() const { return inner_iterations_; }
+        virtual void setInnerIterations(int inner_iterations) { inner_iterations_ = inner_iterations; }
+
+        //! number of warping iterations (number of pyramid levels)
+        virtual int getOuterIterations() const { return outer_iterations_; }
+        virtual void setOuterIterations(int outer_iterations) { outer_iterations_ = outer_iterations; }
+
+        //! number of linear system solver iterations
+        virtual int getSolverIterations() const { return solver_iterations_; }
+        virtual void setSolverIterations(int solver_iterations) { solver_iterations_ = solver_iterations; }
+
+    private:
+        //! flow smoothness
+        float alpha_;
+
+        //! gradient constancy importance
+        float gamma_;
+
+        //! pyramid scale factor
+        float scale_factor_;
+
+        //! number of lagged non-linearity iterations (inner loop)
+        int inner_iterations_;
+
+        //! number of warping iterations (number of pyramid levels)
+        int outer_iterations_;
+
+        //! number of linear system solver iterations
+        int solver_iterations_;
+    };
+
+    static size_t getBufSize(const NCVBroxOpticalFlowDescriptor& desc,
+                             const NCVMatrix<Ncv32f>& frame0, const NCVMatrix<Ncv32f>& frame1,
+                             NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v,
+                             size_t textureAlignment)
+    {
+        NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(textureAlignment));
+
+        ncvSafeCall( NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0) );
+
+        return gpuCounter.maxSize();
+    }
+
+    static void outputHandler(const String &msg)
+    {
+        CV_Error(cv::Error::GpuApiCallError, msg.c_str());
+    }
+
+    void BroxOpticalFlowImpl::calc(InputArray _I0, InputArray _I1, InputOutputArray _flow, Stream& stream)
+    {
+        const GpuMat frame0 = _I0.getGpuMat();
+        const GpuMat frame1 = _I1.getGpuMat();
+
+        CV_Assert( frame0.type() == CV_32FC1 );
+        CV_Assert( frame1.size() == frame0.size() && frame1.type() == frame0.type() );
+
+        ncvSetDebugOutputHandler(outputHandler);
+
+        BufferPool pool(stream);
+        GpuMat u = pool.getBuffer(frame0.size(), CV_32FC1);
+        GpuMat v = pool.getBuffer(frame0.size(), CV_32FC1);
+
+        NCVBroxOpticalFlowDescriptor desc;
+        desc.alpha = alpha_;
+        desc.gamma = gamma_;
+        desc.scale_factor = scale_factor_;
+        desc.number_of_inner_iterations = inner_iterations_;
+        desc.number_of_outer_iterations = outer_iterations_;
+        desc.number_of_solver_iterations = solver_iterations_;
+
+        NCVMemSegment frame0MemSeg;
+        frame0MemSeg.begin.memtype = NCVMemoryTypeDevice;
+        frame0MemSeg.begin.ptr = const_cast<uchar*>(frame0.data);
+        frame0MemSeg.size = frame0.step * frame0.rows;
+
+        NCVMemSegment frame1MemSeg;
+        frame1MemSeg.begin.memtype = NCVMemoryTypeDevice;
+        frame1MemSeg.begin.ptr = const_cast<uchar*>(frame1.data);
+        frame1MemSeg.size = frame1.step * frame1.rows;
+
+        NCVMemSegment uMemSeg;
+        uMemSeg.begin.memtype = NCVMemoryTypeDevice;
+        uMemSeg.begin.ptr = u.ptr();
+        uMemSeg.size = u.step * u.rows;
+
+        NCVMemSegment vMemSeg;
+        vMemSeg.begin.memtype = NCVMemoryTypeDevice;
+        vMemSeg.begin.ptr = v.ptr();
+        vMemSeg.size = v.step * v.rows;
+
+        DeviceInfo devInfo;
+        size_t textureAlignment = devInfo.textureAlignment();
+
+        NCVMatrixReuse<Ncv32f> frame0Mat(frame0MemSeg, static_cast<Ncv32u>(textureAlignment), frame0.cols, frame0.rows, static_cast<Ncv32u>(frame0.step));
+        NCVMatrixReuse<Ncv32f> frame1Mat(frame1MemSeg, static_cast<Ncv32u>(textureAlignment), frame1.cols, frame1.rows, static_cast<Ncv32u>(frame1.step));
+        NCVMatrixReuse<Ncv32f> uMat(uMemSeg, static_cast<Ncv32u>(textureAlignment), u.cols, u.rows, static_cast<Ncv32u>(u.step));
+        NCVMatrixReuse<Ncv32f> vMat(vMemSeg, static_cast<Ncv32u>(textureAlignment), v.cols, v.rows, static_cast<Ncv32u>(v.step));
+
+        size_t bufSize = getBufSize(desc, frame0Mat, frame1Mat, uMat, vMat, textureAlignment);
+        GpuMat buf = pool.getBuffer(1, static_cast<int>(bufSize), CV_8UC1);
+
+        NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(textureAlignment), buf.ptr());
+
+        ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, StreamAccessor::getStream(stream)) );
+
+        GpuMat flows[] = {u, v};
+        cuda::merge(flows, 2, _flow, stream);
+    }
+}
+
+Ptr<BroxOpticalFlow> cv::cuda::BroxOpticalFlow::create(double alpha, double gamma, double scale_factor, int inner_iterations, int outer_iterations, int solver_iterations)
+{
+    return makePtr<BroxOpticalFlowImpl>(alpha, gamma, scale_factor, inner_iterations, outer_iterations, solver_iterations);
 }
 
 #endif /* HAVE_CUDA */
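[Reviewer note, not part of the patch] The refactored implementations now take scratch memory from a per-stream BufferPool instead of keeping persistent member buffers (the old `buf` field above). Callers who care about allocation cost can enable and size the pool once at startup; a sketch with an illustrative 64 MB stack size:

    #include <opencv2/core/cuda.hpp>

    void configurePool()
    {
        using namespace cv::cuda;

        setBufferPoolUsage(true);                               // must precede Stream creation
        setBufferPoolConfig(getDevice(), 64 * 1024 * 1024, 2);  // 64 MB stack per stream, 2 stacks

        Stream stream;            // calc() calls on this stream now allocate from the pool
        BufferPool pool(stream);
        GpuMat tmp = pool.getBuffer(480, 640, CV_32FC1);
    }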
diff --git a/modules/cudaoptflow/src/cuda/pyrlk.cu b/modules/cudaoptflow/src/cuda/pyrlk.cu
index d4606f2281..7693551fca 100644
--- a/modules/cudaoptflow/src/cuda/pyrlk.cu
+++ b/modules/cudaoptflow/src/cuda/pyrlk.cu
@@ -472,16 +472,16 @@ namespace pyrlk
         }
     }
 
-    void loadConstants(int2 winSize, int iters)
+    void loadConstants(int2 winSize, int iters, cudaStream_t stream)
     {
-        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) );
-        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_y, &winSize.y, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_x, &winSize.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_y, &winSize.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
 
         int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
-        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_x, &halfWin.x, sizeof(int)) );
-        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_y, &halfWin.y, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_x, &halfWin.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_y, &halfWin.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
 
-        cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_iters, &iters, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
     }
 
     void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
diff --git a/modules/cudaoptflow/src/cuda/tvl1flow.cu b/modules/cudaoptflow/src/cuda/tvl1flow.cu
index 2b66c972bc..66f0d664a0 100644
--- a/modules/cudaoptflow/src/cuda/tvl1flow.cu
+++ b/modules/cudaoptflow/src/cuda/tvl1flow.cu
@@ -66,15 +66,16 @@ namespace tvl1flow
         dy(y, x) = 0.5f * (src(::min(y + 1, src.rows - 1), x) - src(::max(y - 1, 0), x));
     }
 
-    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy)
+    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
 
-        centeredGradientKernel<<<grid, block>>>(src, dx, dy);
+        centeredGradientKernel<<<grid, block, 0, stream>>>(src, dx, dy);
         cudaSafeCall( cudaGetLastError() );
 
-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 }
 
@@ -164,7 +165,10 @@ namespace tvl1flow
         rho(y, x) = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
     }
 
-    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho)
+    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y,
+                      PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx,
+                      PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho,
+                      cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
 
         bindTexture(&tex_I1, I1);
         bindTexture(&tex_I1x, I1x);
         bindTexture(&tex_I1y, I1y);
 
-        warpBackwardKernel<<<grid, block>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
+        warpBackwardKernel<<<grid, block, 0, stream>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
         cudaSafeCall( cudaGetLastError() );
 
-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 }
 
@@ -292,15 +297,17 @@ namespace tvl1flow
                    PtrStepSzf grad, PtrStepSzf rho_c,
                    PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
                    PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf error,
-                   float l_t, float theta, float gamma, bool calcError)
+                   float l_t, float theta, float gamma, bool calcError,
+                   cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(I1wx.cols, block.x), divUp(I1wx.rows, block.y));
 
-        estimateUKernel<<<grid, block>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, error, l_t, theta, gamma, calcError);
+        estimateUKernel<<<grid, block, 0, stream>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, error, l_t, theta, gamma, calcError);
         cudaSafeCall( cudaGetLastError() );
 
-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 }
 
@@ -346,15 +353,19 @@ namespace tvl1flow
         }
     }
 
-    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32, float taut, float gamma)
+    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3,
+                               PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
+                               float taut, float gamma,
+                               cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(u1.cols, block.x), divUp(u1.rows, block.y));
 
-        estimateDualVariablesKernel<<<grid, block>>>(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma);
+        estimateDualVariablesKernel<<<grid, block, 0, stream>>>(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma);
         cudaSafeCall( cudaGetLastError() );
 
-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 }
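[Reviewer note, not part of the patch] All of these wrappers follow the same convention: enqueue the work on the caller's stream and block only when the default (NULL) stream was passed, so legacy synchronous callers keep their semantics while stream users get fully asynchronous execution. A self-contained sketch of the pattern, using cudaMemsetAsync as a stand-in for the kernel launches above:

    #include <cuda_runtime.h>

    // Enqueue asynchronously; block only for the default (NULL) stream.
    void clearBufferOnStream(void* devPtr, size_t bytes, cudaStream_t stream)
    {
        cudaMemsetAsync(devPtr, 0, bytes, stream);  // async on non-NULL streams

        if (!stream)                                // NULL stream -> keep blocking semantics
            cudaDeviceSynchronize();
    }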
diff --git a/modules/cudaoptflow/src/farneback.cpp b/modules/cudaoptflow/src/farneback.cpp
index 6b74432632..b7fefeb191 100644
--- a/modules/cudaoptflow/src/farneback.cpp
+++ b/modules/cudaoptflow/src/farneback.cpp
@@ -42,23 +42,21 @@
 
 #include "precomp.hpp"
 
-#define MIN_SIZE 32
-
-#define S(x) StreamAccessor::getStream(x)
-
-// CUDA resize() is fast, but it differs from the CPU analog. Disabling this flag
-// leads to inefficient code. It's for debug purposes only.
-#define ENABLE_CUDA_RESIZE 1
-
 using namespace cv;
 using namespace cv::cuda;
 
 #if !defined HAVE_CUDA || defined(CUDA_DISABLER)
 
-void cv::cuda::FarnebackOpticalFlow::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+Ptr<FarnebackOpticalFlow> cv::cuda::FarnebackOpticalFlow::create(int, double, bool, int, int, int, double, int) { throw_no_cuda(); return Ptr<FarnebackOpticalFlow>(); }
 
 #else
 
+#define MIN_SIZE 32
+
+// CUDA resize() is fast, but it differs from the CPU analog. Disabling this flag
+// leads to inefficient code. It's for debug purposes only.
+#define ENABLE_CUDA_RESIZE 1
+
 namespace cv { namespace cuda { namespace device { namespace optflow_farneback
 {
     void setPolynomialExpansionConsts(
@@ -76,8 +74,6 @@ namespace cv { namespace cuda { namespace device { namespace optflow_farneback
     void updateFlowGpu(
         const PtrStepSzf M, PtrStepSzf flowx, PtrStepSzf flowy, cudaStream_t stream);
 
-    /*void boxFilterGpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);*/
-
     void boxFilter5Gpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);
 
     void boxFilter5Gpu_CC11(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);
@@ -93,10 +89,93 @@ namespace cv { namespace cuda { namespace device { namespace optflow_farneback
     void gaussianBlur5Gpu_CC11(
         const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderType, cudaStream_t stream);
 
-}}}} // namespace cv { namespace cuda { namespace cudev { namespace optflow_farneback
+}}}}
 
 namespace
 {
+    class FarnebackOpticalFlowImpl : public FarnebackOpticalFlow
+    {
+    public:
+        FarnebackOpticalFlowImpl(int numLevels, double pyrScale, bool fastPyramids, int winSize,
+                                 int numIters, int polyN, double polySigma, int flags) :
+            numLevels_(numLevels), pyrScale_(pyrScale), fastPyramids_(fastPyramids), winSize_(winSize),
+            numIters_(numIters), polyN_(polyN), polySigma_(polySigma), flags_(flags)
+        {
+        }
+
+        virtual int getNumLevels() const { return numLevels_; }
+        virtual void setNumLevels(int numLevels) { numLevels_ = numLevels; }
+
+        virtual double getPyrScale() const { return pyrScale_; }
+        virtual void setPyrScale(double pyrScale) { pyrScale_ = pyrScale; }
+
+        virtual bool getFastPyramids() const { return fastPyramids_; }
+        virtual void setFastPyramids(bool fastPyramids) { fastPyramids_ = fastPyramids; }
+
+        virtual int getWinSize() const { return winSize_; }
+        virtual void setWinSize(int winSize) { winSize_ = winSize; }
+
+        virtual int getNumIters() const { return numIters_; }
+        virtual void setNumIters(int numIters) { numIters_ = numIters; }
+
+        virtual int getPolyN() const { return polyN_; }
+        virtual void setPolyN(int polyN) { polyN_ = polyN; }
+
+        virtual double getPolySigma() const { return polySigma_; }
+        virtual void setPolySigma(double polySigma) { polySigma_ = polySigma; }
+
+        virtual int getFlags() const { return flags_; }
+        virtual void setFlags(int flags) { flags_ = flags; }
+
+        virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow, Stream& stream);
+
+    private:
+        int numLevels_;
+        double pyrScale_;
+        bool fastPyramids_;
+        int winSize_;
+        int numIters_;
+        int polyN_;
+        double polySigma_;
+        int flags_;
+
+    private:
+        void prepareGaussian(
+                int n, double sigma, float *g, float *xg, float *xxg,
+                double &ig11, double &ig03, double &ig33, double &ig55);
+
+        void setPolynomialExpansionConsts(int n, double sigma);
+
+        void updateFlow_boxFilter(
+                const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
+                GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
+
+        void updateFlow_gaussianBlur(
+                const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
+                GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
+
+        void calcImpl(const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &stream);
+
+        GpuMat frames_[2];
+        GpuMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2];
+        std::vector<GpuMat> pyramid0_, pyramid1_;
+    };
+
+    void FarnebackOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOutputArray _flow, Stream& stream)
+    {
+        const GpuMat frame0 = _frame0.getGpuMat();
+        const GpuMat frame1 = _frame1.getGpuMat();
+
+        BufferPool pool(stream);
+        GpuMat flowx = pool.getBuffer(frame0.size(), CV_32FC1);
+        GpuMat flowy = pool.getBuffer(frame0.size(), CV_32FC1);
+
+        calcImpl(frame0, frame1, flowx, flowy, stream);
+
+        GpuMat flows[] = {flowx, flowy};
+        cuda::merge(flows, 2, _flow, stream);
+    }
+
     GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat& mat)
     {
         if (!mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols)
@@ -104,285 +183,287 @@ namespace
         return mat = GpuMat(rows, cols, type);
     }
-}
-
-void cv::cuda::FarnebackOpticalFlow::prepareGaussian(
-    int n, double sigma, float *g, float *xg, float *xxg,
-    double &ig11, double &ig03, double &ig33, double &ig55)
-{
-    double s = 0.;
-    for (int x = -n; x <= n; x++)
-    {
-        g[x] = (float)std::exp(-x*x/(2*sigma*sigma));
-        s += g[x];
-    }
-
-    s = 1./s;
-    for (int x = -n; x <= n; x++)
-    {
-        g[x] = (float)(g[x]*s);
-        xg[x] = (float)(x*g[x]);
-        xxg[x] = (float)(x*x*g[x]);
-    }
-
-    Mat_<double> G(6, 6);
-    G.setTo(0);
-
-    for (int y = -n; y <= n; y++)
-    {
-        for (int x = -n; x <= n; x++)
-        {
-            G(0,0) += g[y]*g[x];
-            G(1,1) += g[y]*g[x]*x*x;
-            G(3,3) += g[y]*g[x]*x*x*x*x;
-            G(5,5) += g[y]*g[x]*x*x*y*y;
-        }
-    }
-
-    //G[0][0] = 1.;
-    G(2,2) = G(0,3) = G(0,4) = G(3,0) = G(4,0) = G(1,1);
-    G(4,4) = G(3,3);
-    G(3,4) = G(4,3) = G(5,5);
-
-    // invG:
-    // [ x        e  e    ]
-    // [    y             ]
-    // [       y          ]
-    // [ e        z       ]
-    // [ e           z    ]
-    // [                u ]
-    Mat_<double> invG = G.inv(DECOMP_CHOLESKY);
-
-    ig11 = invG(1,1);
-    ig03 = invG(0,3);
-    ig33 = invG(3,3);
-    ig55 = invG(5,5);
-}
-
-
-void cv::cuda::FarnebackOpticalFlow::setPolynomialExpansionConsts(int n, double sigma)
-{
-    std::vector<float> buf(n*6 + 3);
-    float* g = &buf[0] + n;
-    float* xg = g + n*2 + 1;
-    float* xxg = xg + n*2 + 1;
-
-    if (sigma < FLT_EPSILON)
-        sigma = n*0.3;
-
-    double ig11, ig03, ig33, ig55;
-    prepareGaussian(n, sigma, g, xg, xxg, ig11, ig03, ig33, ig55);
-
-    device::optflow_farneback::setPolynomialExpansionConsts(n, g, xg, xxg, static_cast<float>(ig11), static_cast<float>(ig03), static_cast<float>(ig33), static_cast<float>(ig55));
-}
-
-
-void cv::cuda::FarnebackOpticalFlow::updateFlow_boxFilter(
-    const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
-    GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
-{
-    if (deviceSupports(FEATURE_SET_COMPUTE_12))
-        device::optflow_farneback::boxFilter5Gpu(M, blockSize/2, bufM, S(streams[0]));
-    else
-        device::optflow_farneback::boxFilter5Gpu_CC11(M, blockSize/2, bufM, S(streams[0]));
-    swap(M, bufM);
-
-    for (int i = 1; i < 5; ++i)
-        streams[i].waitForCompletion();
-    device::optflow_farneback::updateFlowGpu(M, flowx, flowy, S(streams[0]));
-
-    if (updateMatrices)
-        device::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, S(streams[0]));
-}
-
-
-void cv::cuda::FarnebackOpticalFlow::updateFlow_gaussianBlur(
-    const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
-    GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
-{
-    if (deviceSupports(FEATURE_SET_COMPUTE_12))
-        device::optflow_farneback::gaussianBlur5Gpu(
-            M, blockSize/2, bufM, BORDER_REPLICATE, S(streams[0]));
-    else
-        device::optflow_farneback::gaussianBlur5Gpu_CC11(
-            M, blockSize/2, bufM, BORDER_REPLICATE, S(streams[0]));
-    swap(M, bufM);
-
-    device::optflow_farneback::updateFlowGpu(M, flowx, flowy, S(streams[0]));
-
-    if (updateMatrices)
-        device::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, S(streams[0]));
-}
-
-
-void cv::cuda::FarnebackOpticalFlow::operator ()(
-    const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s)
-{
-    CV_Assert(frame0.channels() == 1 && frame1.channels() == 1);
-    CV_Assert(frame0.size() == frame1.size());
-    CV_Assert(polyN == 5 || polyN == 7);
-    CV_Assert(!fastPyramids || std::abs(pyrScale - 0.5) < 1e-6);
-
-    Stream streams[5];
-    if (S(s))
-        streams[0] = s;
-
-    Size size = frame0.size();
-    GpuMat prevFlowX, prevFlowY, curFlowX, curFlowY;
-
-    flowx.create(size, CV_32F);
-    flowy.create(size, CV_32F);
-    GpuMat flowx0 = flowx;
-    GpuMat flowy0 = flowy;
-
-    // Crop unnecessary levels
-    double scale = 1;
-    int numLevelsCropped = 0;
-    for (; numLevelsCropped < numLevels; numLevelsCropped++)
-    {
-        scale *= pyrScale;
-        if (size.width*scale < MIN_SIZE || size.height*scale < MIN_SIZE)
-            break;
-    }
-
-    frame0.convertTo(frames_[0], CV_32F, streams[0]);
-    frame1.convertTo(frames_[1], CV_32F, streams[1]);
-
-    if (fastPyramids)
-    {
-        // Build Gaussian pyramids using pyrDown()
-        pyramid0_.resize(numLevelsCropped + 1);
-        pyramid1_.resize(numLevelsCropped + 1);
-        pyramid0_[0] = frames_[0];
-        pyramid1_[0] = frames_[1];
-        for (int i = 1; i <= numLevelsCropped; ++i)
-        {
-            cuda::pyrDown(pyramid0_[i - 1], pyramid0_[i], streams[0]);
-            cuda::pyrDown(pyramid1_[i - 1], pyramid1_[i], streams[1]);
-        }
-    }
-
-    setPolynomialExpansionConsts(polyN, polySigma);
-    device::optflow_farneback::setUpdateMatricesConsts();
-
-    for (int k = numLevelsCropped; k >= 0; k--)
-    {
-        streams[0].waitForCompletion();
-
-        scale = 1;
-        for (int i = 0; i < k; i++)
-            scale *= pyrScale;
-
-        double sigma = (1./scale - 1) * 0.5;
-        int smoothSize = cvRound(sigma*5) | 1;
-        smoothSize = std::max(smoothSize, 3);
-
-        int width = cvRound(size.width*scale);
-        int height = cvRound(size.height*scale);
-
-        if (fastPyramids)
-        {
-            width = pyramid0_[k].cols;
-            height = pyramid0_[k].rows;
-        }
-
-        if (k > 0)
-        {
-            curFlowX.create(height, width, CV_32F);
-            curFlowY.create(height, width, CV_32F);
-        }
-        else
-        {
-            curFlowX = flowx0;
-            curFlowY = flowy0;
-        }
-
-        if (!prevFlowX.data)
-        {
-            if (flags & OPTFLOW_USE_INITIAL_FLOW)
-            {
-                cuda::resize(flowx0, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
-                cuda::resize(flowy0, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
-                curFlowX.convertTo(curFlowX, curFlowX.depth(), scale, streams[0]);
-                curFlowY.convertTo(curFlowY, curFlowY.depth(), scale, streams[1]);
-            }
-            else
-            {
-                curFlowX.setTo(0, streams[0]);
-                curFlowY.setTo(0, streams[1]);
-            }
-        }
-        else
-        {
-            cuda::resize(prevFlowX, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
-            cuda::resize(prevFlowY, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
-            curFlowX.convertTo(curFlowX, curFlowX.depth(), 1./pyrScale, streams[0]);
-            curFlowY.convertTo(curFlowY, curFlowY.depth(), 1./pyrScale, streams[1]);
-        }
-
-        GpuMat M = allocMatFromBuf(5*height, width, CV_32F, M_);
-        GpuMat bufM = allocMatFromBuf(5*height, width, CV_32F, bufM_);
-        GpuMat R[2] =
-        {
-            allocMatFromBuf(5*height, width, CV_32F, R_[0]),
-            allocMatFromBuf(5*height, width, CV_32F, R_[1])
-        };
-
-        if (fastPyramids)
-        {
-            device::optflow_farneback::polynomialExpansionGpu(pyramid0_[k], polyN, R[0], S(streams[0]));
-            device::optflow_farneback::polynomialExpansionGpu(pyramid1_[k], polyN, R[1], S(streams[1]));
-        }
-        else
-        {
-            GpuMat blurredFrame[2] =
-            {
-                allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[0]),
-                allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[1])
-            };
-            GpuMat pyrLevel[2] =
-            {
-                allocMatFromBuf(height, width, CV_32F, pyrLevel_[0]),
-                allocMatFromBuf(height, width, CV_32F, pyrLevel_[1])
-            };
-
-            Mat g = getGaussianKernel(smoothSize, sigma, CV_32F);
-            device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(smoothSize/2), smoothSize/2);
-
-            for (int i = 0; i < 2; i++)
-            {
-                device::optflow_farneback::gaussianBlurGpu(
-                    frames_[i], smoothSize/2, blurredFrame[i], BORDER_REFLECT101, S(streams[i]));
-                cuda::resize(blurredFrame[i], pyrLevel[i], Size(width, height), 0.0, 0.0, INTER_LINEAR, streams[i]);
-                device::optflow_farneback::polynomialExpansionGpu(pyrLevel[i], polyN, R[i], S(streams[i]));
-            }
-        }
-
-        streams[1].waitForCompletion();
-        device::optflow_farneback::updateMatricesGpu(curFlowX, curFlowY, R[0], R[1], M, S(streams[0]));
-
-        if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
-        {
-            Mat g = getGaussianKernel(winSize, winSize/2*0.3f, CV_32F);
-            device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(winSize/2), winSize/2);
-        }
-        for (int i = 0; i < numIters; i++)
-        {
-            if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
-                updateFlow_gaussianBlur(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams);
-            else
-                updateFlow_boxFilter(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams);
-        }
-
-        prevFlowX = curFlowX;
-        prevFlowY = curFlowY;
-    }
-
-    flowx = curFlowX;
-    flowy = curFlowY;
-
-    if (!S(s))
-        streams[0].waitForCompletion();
+    void FarnebackOpticalFlowImpl::prepareGaussian(
+            int n, double sigma, float *g, float *xg, float *xxg,
+            double &ig11, double &ig03, double &ig33, double &ig55)
+    {
+        double s = 0.;
+        for (int x = -n; x <= n; x++)
+        {
+            g[x] = (float)std::exp(-x*x/(2*sigma*sigma));
+            s += g[x];
+        }
+
+        s = 1./s;
+        for (int x = -n; x <= n; x++)
+        {
+            g[x] = (float)(g[x]*s);
+            xg[x] = (float)(x*g[x]);
+            xxg[x] = (float)(x*x*g[x]);
+        }
+
+        Mat_<double> G(6, 6);
+        G.setTo(0);
+
+        for (int y = -n; y <= n; y++)
+        {
+            for (int x = -n; x <= n; x++)
+            {
+                G(0,0) += g[y]*g[x];
+                G(1,1) += g[y]*g[x]*x*x;
+                G(3,3) += g[y]*g[x]*x*x*x*x;
+                G(5,5) += g[y]*g[x]*x*x*y*y;
+            }
+        }
+
+        //G[0][0] = 1.;
+        G(2,2) = G(0,3) = G(0,4) = G(3,0) = G(4,0) = G(1,1);
+        G(4,4) = G(3,3);
+        G(3,4) = G(4,3) = G(5,5);
+
+        // invG:
+        // [ x        e  e    ]
+        // [    y             ]
+        // [       y          ]
+        // [ e        z       ]
+        // [ e           z    ]
+        // [                u ]
+        Mat_<double> invG = G.inv(DECOMP_CHOLESKY);
+
+        ig11 = invG(1,1);
+        ig03 = invG(0,3);
+        ig33 = invG(3,3);
+        ig55 = invG(5,5);
+    }
+
+    void FarnebackOpticalFlowImpl::setPolynomialExpansionConsts(int n, double sigma)
+    {
+        std::vector<float> buf(n*6 + 3);
+        float* g = &buf[0] + n;
+        float* xg = g + n*2 + 1;
+        float* xxg = xg + n*2 + 1;
+
+        if (sigma < FLT_EPSILON)
+            sigma = n*0.3;
+
+        double ig11, ig03, ig33, ig55;
+        prepareGaussian(n, sigma, g, xg, xxg, ig11, ig03, ig33, ig55);
+
+        device::optflow_farneback::setPolynomialExpansionConsts(n, g, xg, xxg, static_cast<float>(ig11), static_cast<float>(ig03), static_cast<float>(ig33), static_cast<float>(ig55));
+    }
+
+    void FarnebackOpticalFlowImpl::updateFlow_boxFilter(
+            const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
+            GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
+    {
+        if (deviceSupports(FEATURE_SET_COMPUTE_12))
+            device::optflow_farneback::boxFilter5Gpu(M, blockSize/2, bufM, StreamAccessor::getStream(streams[0]));
+        else
+            device::optflow_farneback::boxFilter5Gpu_CC11(M, blockSize/2, bufM, StreamAccessor::getStream(streams[0]));
+        swap(M, bufM);
+
+        for (int i = 1; i < 5; ++i)
+            streams[i].waitForCompletion();
+        device::optflow_farneback::updateFlowGpu(M, flowx, flowy, StreamAccessor::getStream(streams[0]));
+
+        if (updateMatrices)
+            device::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, StreamAccessor::getStream(streams[0]));
+    }
+
+    void FarnebackOpticalFlowImpl::updateFlow_gaussianBlur(
+            const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
+            GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
+    {
+        if (deviceSupports(FEATURE_SET_COMPUTE_12))
+            device::optflow_farneback::gaussianBlur5Gpu(
+                M, blockSize/2, bufM, BORDER_REPLICATE, StreamAccessor::getStream(streams[0]));
+        else
+            device::optflow_farneback::gaussianBlur5Gpu_CC11(
+                M, blockSize/2, bufM, BORDER_REPLICATE, StreamAccessor::getStream(streams[0]));
+        swap(M, bufM);
+
+        device::optflow_farneback::updateFlowGpu(M, flowx, flowy, StreamAccessor::getStream(streams[0]));
+
+        if (updateMatrices)
+            device::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, StreamAccessor::getStream(streams[0]));
+    }
+
+    void FarnebackOpticalFlowImpl::calcImpl(const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &stream)
+    {
+        CV_Assert(frame0.channels() == 1 && frame1.channels() == 1);
+        CV_Assert(frame0.size() == frame1.size());
+        CV_Assert(polyN_ == 5 || polyN_ == 7);
+        CV_Assert(!fastPyramids_ || std::abs(pyrScale_ - 0.5) < 1e-6);
+
+        Stream streams[5];
+        if (stream)
+            streams[0] = stream;
+
+        Size size = frame0.size();
+        GpuMat prevFlowX, prevFlowY, curFlowX, curFlowY;
+
+        flowx.create(size, CV_32F);
+        flowy.create(size, CV_32F);
+        GpuMat flowx0 = flowx;
+        GpuMat flowy0 = flowy;
+
+        // Crop unnecessary levels
+        double scale = 1;
+        int numLevelsCropped = 0;
+        for (; numLevelsCropped < numLevels_; numLevelsCropped++)
+        {
+            scale *= pyrScale_;
+            if (size.width*scale < MIN_SIZE || size.height*scale < MIN_SIZE)
+                break;
+        }
+
+        frame0.convertTo(frames_[0], CV_32F, streams[0]);
+        frame1.convertTo(frames_[1], CV_32F, streams[1]);
+
+        if (fastPyramids_)
+        {
+            // Build Gaussian pyramids using pyrDown()
+            pyramid0_.resize(numLevelsCropped + 1);
+            pyramid1_.resize(numLevelsCropped + 1);
+            pyramid0_[0] = frames_[0];
+            pyramid1_[0] = frames_[1];
+            for (int i = 1; i <= numLevelsCropped; ++i)
+            {
+                cuda::pyrDown(pyramid0_[i - 1], pyramid0_[i], streams[0]);
+                cuda::pyrDown(pyramid1_[i - 1], pyramid1_[i], streams[1]);
+            }
+        }
+
+        setPolynomialExpansionConsts(polyN_, polySigma_);
+        device::optflow_farneback::setUpdateMatricesConsts();
+
+        for (int k = numLevelsCropped; k >= 0; k--)
+        {
+            streams[0].waitForCompletion();
+
+            scale = 1;
+            for (int i = 0; i < k; i++)
+                scale *= pyrScale_;
+
+            double sigma = (1./scale - 1) * 0.5;
+            int smoothSize = cvRound(sigma*5) | 1;
+            smoothSize = std::max(smoothSize, 3);
+
+            int width = cvRound(size.width*scale);
+            int height = cvRound(size.height*scale);
+
+            if (fastPyramids_)
+            {
+                width = pyramid0_[k].cols;
+                height = pyramid0_[k].rows;
+            }
+
+            if (k > 0)
+            {
+                curFlowX.create(height, width, CV_32F);
+                curFlowY.create(height, width, CV_32F);
+            }
+            else
+            {
+                curFlowX = flowx0;
+                curFlowY = flowy0;
+            }
+
+            if (!prevFlowX.data)
+            {
+                if (flags_ & OPTFLOW_USE_INITIAL_FLOW)
+                {
+                    cuda::resize(flowx0, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
+                    cuda::resize(flowy0, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
+                    curFlowX.convertTo(curFlowX, curFlowX.depth(), scale, streams[0]);
+                    curFlowY.convertTo(curFlowY, curFlowY.depth(), scale, streams[1]);
+                }
+                else
+                {
+                    curFlowX.setTo(0, streams[0]);
+                    curFlowY.setTo(0, streams[1]);
+                }
+            }
+            else
+            {
+                cuda::resize(prevFlowX, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
+                cuda::resize(prevFlowY, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
+                curFlowX.convertTo(curFlowX, curFlowX.depth(), 1./pyrScale_, streams[0]);
+                curFlowY.convertTo(curFlowY, curFlowY.depth(), 1./pyrScale_, streams[1]);
+            }
+
+            GpuMat M = allocMatFromBuf(5*height, width, CV_32F, M_);
+            GpuMat bufM = allocMatFromBuf(5*height, width, CV_32F, bufM_);
+            GpuMat R[2] =
+            {
+                allocMatFromBuf(5*height, width, CV_32F, R_[0]),
+                allocMatFromBuf(5*height, width, CV_32F, R_[1])
+            };
+
+            if (fastPyramids_)
+            {
+                device::optflow_farneback::polynomialExpansionGpu(pyramid0_[k], polyN_, R[0], StreamAccessor::getStream(streams[0]));
+                device::optflow_farneback::polynomialExpansionGpu(pyramid1_[k], polyN_, R[1], StreamAccessor::getStream(streams[1]));
+            }
+            else
+            {
+                GpuMat blurredFrame[2] =
+                {
+                    allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[0]),
+                    allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[1])
+                };
+                GpuMat pyrLevel[2] =
+                {
+                    allocMatFromBuf(height, width, CV_32F, pyrLevel_[0]),
+                    allocMatFromBuf(height, width, CV_32F, pyrLevel_[1])
+                };
+
+                Mat g = getGaussianKernel(smoothSize, sigma, CV_32F);
+                device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(smoothSize/2), smoothSize/2);
+
+                for (int i = 0; i < 2; i++)
+                {
+                    device::optflow_farneback::gaussianBlurGpu(
+                        frames_[i], smoothSize/2, blurredFrame[i], BORDER_REFLECT101, StreamAccessor::getStream(streams[i]));
+                    cuda::resize(blurredFrame[i], pyrLevel[i], Size(width, height), 0.0, 0.0, INTER_LINEAR, streams[i]);
+                    device::optflow_farneback::polynomialExpansionGpu(pyrLevel[i], polyN_, R[i], StreamAccessor::getStream(streams[i]));
+                }
+            }
+
+            streams[1].waitForCompletion();
+            device::optflow_farneback::updateMatricesGpu(curFlowX, curFlowY, R[0], R[1], M, StreamAccessor::getStream(streams[0]));
+
+            if (flags_ & OPTFLOW_FARNEBACK_GAUSSIAN)
+            {
+                Mat g = getGaussianKernel(winSize_, winSize_/2*0.3f, CV_32F);
+                device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(winSize_/2), winSize_/2);
+            }
+            for (int i = 0; i < numIters_; i++)
+            {
+                if (flags_ & OPTFLOW_FARNEBACK_GAUSSIAN)
+                    updateFlow_gaussianBlur(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize_, i < numIters_-1, streams);
+                else
+                    updateFlow_boxFilter(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize_, i < numIters_-1, streams);
+            }
+
+            prevFlowX = curFlowX;
+            prevFlowY = curFlowY;
+        }
+
+        flowx = curFlowX;
+        flowy = curFlowY;
+
+        if (!stream)
+            streams[0].waitForCompletion();
+    }
+}
+
+Ptr<FarnebackOpticalFlow> cv::cuda::FarnebackOpticalFlow::create(int numLevels, double pyrScale, bool fastPyramids, int winSize,
+                                                                 int numIters, int polyN, double polySigma, int flags)
+{
+    return makePtr<FarnebackOpticalFlowImpl>(numLevels, pyrScale, fastPyramids, winSize,
+                                             numIters, polyN, polySigma, flags);
 }
 
 #endif
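[Reviewer note, not part of the patch] With the stream parameter plumbed through, calc() can now be enqueued and overlapped with host work. A sketch, assuming CV_8UC1 device frames:

    #include <opencv2/cudaoptflow.hpp>

    void asyncFarneback(const cv::cuda::GpuMat& d_prev, const cv::cuda::GpuMat& d_next)
    {
        using namespace cv;

        Ptr<cuda::FarnebackOpticalFlow> fb = cuda::FarnebackOpticalFlow::create();

        cuda::Stream stream;
        cuda::GpuMat d_flow;
        fb->calc(d_prev, d_next, d_flow, stream);   // enqueued, returns without blocking

        // ... unrelated CPU work can run here ...

        stream.waitForCompletion();                 // d_flow is valid after this
    }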
== 1 || prevImg.channels() == 3 || prevImg.channels() == 4); - CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type()); - CV_Assert(maxLevel >= 0); - CV_Assert(winSize.width > 2 && winSize.height > 2); - CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6); - CV_Assert(prevPts.rows == 1 && prevPts.type() == CV_32FC2); - - if (useInitialFlow) - CV_Assert(nextPts.size() == prevPts.size() && nextPts.type() == CV_32FC2); - else - ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts); - - GpuMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1); - GpuMat temp2 = nextPts.reshape(1); - cuda::multiply(temp1, Scalar::all(1.0 / (1 << maxLevel) / 2.0), temp2); - - ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status); - status.setTo(Scalar::all(1)); - - if (err) - ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err); - - // build the image pyramids. - - prevPyr_.resize(maxLevel + 1); - nextPyr_.resize(maxLevel + 1); - - int cn = prevImg.channels(); - - if (cn == 1 || cn == 4) - { - prevImg.convertTo(prevPyr_[0], CV_32F); - nextImg.convertTo(nextPyr_[0], CV_32F); - } - else - { - cuda::cvtColor(prevImg, buf_, COLOR_BGR2BGRA); - buf_.convertTo(prevPyr_[0], CV_32F); - - cuda::cvtColor(nextImg, buf_, COLOR_BGR2BGRA); - buf_.convertTo(nextPyr_[0], CV_32F); - } - - for (int level = 1; level <= maxLevel; ++level) - { - cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level]); - cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level]); - } - - pyrlk::loadConstants(make_int2(winSize.width, winSize.height), iters); - - for (int level = maxLevel; level >= 0; level--) - { - if (cn == 1) + if (prevPts.empty()) { - pyrlk::sparse1(prevPyr_[level], nextPyr_[level], - prevPts.ptr(), nextPts.ptr(), status.ptr(), level == 0 && err ? err->ptr() : 0, prevPts.cols, - level, block, patch); + nextPts.release(); + status.release(); + if (err) err->release(); + return; + } + + dim3 block, patch; + calcPatchSize(winSize_, block, patch); + + CV_Assert( prevImg.channels() == 1 || prevImg.channels() == 3 || prevImg.channels() == 4 ); + CV_Assert( prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type() ); + CV_Assert( maxLevel_ >= 0 ); + CV_Assert( winSize_.width > 2 && winSize_.height > 2 ); + CV_Assert( patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6 ); + CV_Assert( prevPts.rows == 1 && prevPts.type() == CV_32FC2 ); + + if (useInitialFlow_) + CV_Assert( nextPts.size() == prevPts.size() && nextPts.type() == prevPts.type() ); + else + ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts); + + GpuMat temp1 = (useInitialFlow_ ? nextPts : prevPts).reshape(1); + GpuMat temp2 = nextPts.reshape(1); + cuda::multiply(temp1, Scalar::all(1.0 / (1 << maxLevel_) / 2.0), temp2, 1, -1, stream); + + ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status); + status.setTo(Scalar::all(1), stream); + + if (err) + ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err); + + // build the image pyramids. + + BufferPool pool(stream); + + prevPyr_.resize(maxLevel_ + 1); + nextPyr_.resize(maxLevel_ + 1); + + int cn = prevImg.channels(); + + if (cn == 1 || cn == 4) + { + prevImg.convertTo(prevPyr_[0], CV_32F, stream); + nextImg.convertTo(nextPyr_[0], CV_32F, stream); } else { - pyrlk::sparse4(prevPyr_[level], nextPyr_[level], - prevPts.ptr(), nextPts.ptr(), status.ptr(), level == 0 && err ? 
+            GpuMat buf = pool.getBuffer(prevImg.size(), CV_MAKE_TYPE(prevImg.depth(), 4));
+
+            cuda::cvtColor(prevImg, buf, COLOR_BGR2BGRA, 0, stream);
+            buf.convertTo(prevPyr_[0], CV_32F, stream);
+
+            cuda::cvtColor(nextImg, buf, COLOR_BGR2BGRA, 0, stream);
+            buf.convertTo(nextPyr_[0], CV_32F, stream);
+        }
+
+        for (int level = 1; level <= maxLevel_; ++level)
+        {
+            cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level], stream);
+            cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level], stream);
+        }
+
+        pyrlk::loadConstants(make_int2(winSize_.width, winSize_.height), iters_, StreamAccessor::getStream(stream));
+
+        for (int level = maxLevel_; level >= 0; level--)
+        {
+            if (cn == 1)
+            {
+                pyrlk::sparse1(prevPyr_[level], nextPyr_[level],
+                    prevPts.ptr<const float2>(), nextPts.ptr<float2>(),
+                    status.ptr(),
+                    level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
+                    level, block, patch,
+                    StreamAccessor::getStream(stream));
+            }
+            else
+            {
+                pyrlk::sparse4(prevPyr_[level], nextPyr_[level],
+                    prevPts.ptr<const float2>(), nextPts.ptr<float2>(),
+                    status.ptr(),
+                    level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
+                    level, block, patch,
+                    StreamAccessor::getStream(stream));
+            }
+            }
         }
     }
-}
 
-void cv::cuda::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err)
-{
-    CV_Assert(prevImg.type() == CV_8UC1);
-    CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
-    CV_Assert(maxLevel >= 0);
-    CV_Assert(winSize.width > 2 && winSize.height > 2);
-
-    if (err)
-        err->create(prevImg.size(), CV_32FC1);
-
-    // build the image pyramids.
-
-    prevPyr_.resize(maxLevel + 1);
-    nextPyr_.resize(maxLevel + 1);
-
-    prevPyr_[0] = prevImg;
-    nextImg.convertTo(nextPyr_[0], CV_32F);
-
-    for (int level = 1; level <= maxLevel; ++level)
+    void PyrLKOpticalFlowBase::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, Stream& stream)
     {
-        cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level]);
-        cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level]);
+        CV_Assert( prevImg.type() == CV_8UC1 );
+        CV_Assert( prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type() );
+        CV_Assert( maxLevel_ >= 0 );
+        CV_Assert( winSize_.width > 2 && winSize_.height > 2 );
+
+        // build the image pyramids.
+
+        prevPyr_.resize(maxLevel_ + 1);
+        nextPyr_.resize(maxLevel_ + 1);
+
+        prevPyr_[0] = prevImg;
+        nextImg.convertTo(nextPyr_[0], CV_32F, stream);
+
+        for (int level = 1; level <= maxLevel_; ++level)
+        {
+            cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level], stream);
+            cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level], stream);
+        }
+
+        BufferPool pool(stream);
+
+        GpuMat uPyr[] = {
+            pool.getBuffer(prevImg.size(), CV_32FC1),
+            pool.getBuffer(prevImg.size(), CV_32FC1),
+        };
+        GpuMat vPyr[] = {
+            pool.getBuffer(prevImg.size(), CV_32FC1),
+            pool.getBuffer(prevImg.size(), CV_32FC1),
+        };
+
+        uPyr[0].setTo(Scalar::all(0), stream);
+        vPyr[0].setTo(Scalar::all(0), stream);
+        uPyr[1].setTo(Scalar::all(0), stream);
+        vPyr[1].setTo(Scalar::all(0), stream);
+
+        int2 winSize2i = make_int2(winSize_.width, winSize_.height);
+        pyrlk::loadConstants(winSize2i, iters_, StreamAccessor::getStream(stream));
+
+        int idx = 0;
+
+        for (int level = maxLevel_; level >= 0; level--)
+        {
+            int idx2 = (idx + 1) & 1;
+
+            pyrlk::dense(prevPyr_[level], nextPyr_[level],
+                         uPyr[idx], vPyr[idx], uPyr[idx2], vPyr[idx2],
+                         PtrStepSzf(), winSize2i,
+                         StreamAccessor::getStream(stream));
+
+            if (level > 0)
+                idx = idx2;
+        }
+
+        uPyr[idx].copyTo(u, stream);
+        vPyr[idx].copyTo(v, stream);
     }
 
-    ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]);
-    ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]);
-    ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]);
-    ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]);
-    uPyr_[0].setTo(Scalar::all(0));
-    vPyr_[0].setTo(Scalar::all(0));
-    uPyr_[1].setTo(Scalar::all(0));
-    vPyr_[1].setTo(Scalar::all(0));
-
-    int2 winSize2i = make_int2(winSize.width, winSize.height);
-    pyrlk::loadConstants(winSize2i, iters);
-
-    PtrStepSzf derr = err ? *err : PtrStepSzf();
-
-    int idx = 0;
-
-    for (int level = maxLevel; level >= 0; level--)
+    class SparsePyrLKOpticalFlowImpl : public SparsePyrLKOpticalFlow, private PyrLKOpticalFlowBase
     {
-        int idx2 = (idx + 1) & 1;
+    public:
+        SparsePyrLKOpticalFlowImpl(Size winSize, int maxLevel, int iters, bool useInitialFlow) :
+            PyrLKOpticalFlowBase(winSize, maxLevel, iters, useInitialFlow)
+        {
+        }
 
-        pyrlk::dense(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],
-            level == 0 ? derr : PtrStepSzf(), winSize2i);
+        virtual Size getWinSize() const { return winSize_; }
+        virtual void setWinSize(Size winSize) { winSize_ = winSize; }
 
-        if (level > 0)
-            idx = idx2;
-    }
+        virtual int getMaxLevel() const { return maxLevel_; }
+        virtual void setMaxLevel(int maxLevel) { maxLevel_ = maxLevel; }
 
-    uPyr_[idx].copyTo(u);
-    vPyr_[idx].copyTo(v);
+        virtual int getNumIters() const { return iters_; }
+        virtual void setNumIters(int iters) { iters_ = iters; }
+
+        virtual bool getUseInitialFlow() const { return useInitialFlow_; }
+        virtual void setUseInitialFlow(bool useInitialFlow) { useInitialFlow_ = useInitialFlow; }
+
+        virtual void calc(InputArray _prevImg, InputArray _nextImg,
+                          InputArray _prevPts, InputOutputArray _nextPts,
+                          OutputArray _status,
+                          OutputArray _err,
+                          Stream& stream)
+        {
+            const GpuMat prevImg = _prevImg.getGpuMat();
+            const GpuMat nextImg = _nextImg.getGpuMat();
+            const GpuMat prevPts = _prevPts.getGpuMat();
+            GpuMat& nextPts = _nextPts.getGpuMatRef();
+            GpuMat& status = _status.getGpuMatRef();
+            GpuMat* err = _err.needed() ? &(_err.getGpuMatRef()) : NULL;
+
+            sparse(prevImg, nextImg, prevPts, nextPts, status, err, stream);
+        }
+    };
+
+    class DensePyrLKOpticalFlowImpl : public DensePyrLKOpticalFlow, private PyrLKOpticalFlowBase
+    {
+    public:
+        DensePyrLKOpticalFlowImpl(Size winSize, int maxLevel, int iters, bool useInitialFlow) :
+            PyrLKOpticalFlowBase(winSize, maxLevel, iters, useInitialFlow)
+        {
+        }
+
+        virtual Size getWinSize() const { return winSize_; }
+        virtual void setWinSize(Size winSize) { winSize_ = winSize; }
+
+        virtual int getMaxLevel() const { return maxLevel_; }
+        virtual void setMaxLevel(int maxLevel) { maxLevel_ = maxLevel; }
+
+        virtual int getNumIters() const { return iters_; }
+        virtual void setNumIters(int iters) { iters_ = iters; }
+
+        virtual bool getUseInitialFlow() const { return useInitialFlow_; }
+        virtual void setUseInitialFlow(bool useInitialFlow) { useInitialFlow_ = useInitialFlow; }
+
+        virtual void calc(InputArray _prevImg, InputArray _nextImg, InputOutputArray _flow, Stream& stream)
+        {
+            const GpuMat prevImg = _prevImg.getGpuMat();
+            const GpuMat nextImg = _nextImg.getGpuMat();
+
+            BufferPool pool(stream);
+            GpuMat u = pool.getBuffer(prevImg.size(), CV_32FC1);
+            GpuMat v = pool.getBuffer(prevImg.size(), CV_32FC1);
+
+            dense(prevImg, nextImg, u, v, stream);
+
+            GpuMat flows[] = {u, v};
+            cuda::merge(flows, 2, _flow, stream);
+        }
+    };
 }
 
-void cv::cuda::PyrLKOpticalFlow::releaseMemory()
+Ptr<cv::cuda::SparsePyrLKOpticalFlow> cv::cuda::SparsePyrLKOpticalFlow::create(Size winSize, int maxLevel, int iters, bool useInitialFlow)
 {
-    prevPyr_.clear();
-    nextPyr_.clear();
+    return makePtr<SparsePyrLKOpticalFlowImpl>(winSize, maxLevel, iters, useInitialFlow);
+}
 
-    buf_.release();
-
-    uPyr_[0].release();
-    vPyr_[0].release();
-
-    uPyr_[1].release();
-    vPyr_[1].release();
+Ptr<cv::cuda::DensePyrLKOpticalFlow> cv::cuda::DensePyrLKOpticalFlow::create(Size winSize, int maxLevel, int iters, bool useInitialFlow)
+{
+    return makePtr<DensePyrLKOpticalFlowImpl>(winSize, maxLevel, iters, useInitialFlow);
 }
 
 #endif /* !defined (HAVE_CUDA) */
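
The Lucas-Kanade rework above keeps one shared implementation (PyrLKOpticalFlowBase) behind
two public factories. A usage sketch for the sparse path, not part of the patch (prevImg,
nextImg and the 1 x N CV_32FC2 prevPts matrix are hypothetical inputs, e.g. uploaded from
cv::goodFeaturesToTrack results):

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/cudaoptflow.hpp>

    void pyrlkExample(const cv::cuda::GpuMat& prevImg, const cv::cuda::GpuMat& nextImg,
                      const cv::cuda::GpuMat& prevPts)
    {
        // These arguments mirror the defaults the old PyrLKOpticalFlow constructor set:
        // 21x21 window, 3 pyramid levels, 30 iterations, no initial flow.
        cv::Ptr<cv::cuda::SparsePyrLKOpticalFlow> lk =
            cv::cuda::SparsePyrLKOpticalFlow::create(cv::Size(21, 21), 3, 30, false);

        cv::cuda::GpuMat nextPts, status, err;
        cv::cuda::Stream stream;
        lk->calc(prevImg, nextImg, prevPts, nextPts, status, err, stream);
        stream.waitForCompletion();
    }

Temporaries (the color-conversion buffer, the dense u/v pyramids) now come from the stream's
BufferPool instead of long-lived member GpuMats, which is why the old releaseMemory() needs
no replacement.
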
diff --git a/modules/cudaoptflow/src/tvl1flow.cpp b/modules/cudaoptflow/src/tvl1flow.cpp
index b8dfea56f1..e2ef07b0d1 100644
--- a/modules/cudaoptflow/src/tvl1flow.cpp
+++ b/modules/cudaoptflow/src/tvl1flow.cpp
@@ -44,256 +44,338 @@
 
 #if !defined HAVE_CUDA || defined(CUDA_DISABLER)
 
-cv::cuda::OpticalFlowDual_TVL1_CUDA::OpticalFlowDual_TVL1_CUDA() { throw_no_cuda(); }
-void cv::cuda::OpticalFlowDual_TVL1_CUDA::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
-void cv::cuda::OpticalFlowDual_TVL1_CUDA::collectGarbage() {}
-void cv::cuda::OpticalFlowDual_TVL1_CUDA::procOneScale(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
+Ptr<cv::cuda::OpticalFlowDual_TVL1> cv::cuda::OpticalFlowDual_TVL1::create(double, double, double, int, int, double, int, double, double, bool) { throw_no_cuda(); return Ptr<cv::cuda::OpticalFlowDual_TVL1>(); }
 
 #else
 
 using namespace cv;
 using namespace cv::cuda;
 
-cv::cuda::OpticalFlowDual_TVL1_CUDA::OpticalFlowDual_TVL1_CUDA()
-{
-    tau = 0.25;
-    lambda = 0.15;
-    theta = 0.3;
-    nscales = 5;
-    warps = 5;
-    epsilon = 0.01;
-    iterations = 300;
-    scaleStep = 0.8;
-    gamma = 0.0;
-    useInitialFlow = false;
-}
-
-void cv::cuda::OpticalFlowDual_TVL1_CUDA::operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy)
-{
-    CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 );
-    CV_Assert( I0.size() == I1.size() );
-    CV_Assert( I0.type() == I1.type() );
-    CV_Assert( !useInitialFlow || (flowx.size() == I0.size() && flowx.type() == CV_32FC1 && flowy.size() == flowx.size() && flowy.type() == flowx.type()) );
-    CV_Assert( nscales > 0 );
-
-    // allocate memory for the pyramid structure
-    I0s.resize(nscales);
-    I1s.resize(nscales);
-    u1s.resize(nscales);
-    u2s.resize(nscales);
-    u3s.resize(nscales);
-
-    I0.convertTo(I0s[0], CV_32F, I0.depth() == CV_8U ? 1.0 : 255.0);
-    I1.convertTo(I1s[0], CV_32F, I1.depth() == CV_8U ? 1.0 : 255.0);
-
-    if (!useInitialFlow)
-    {
-        flowx.create(I0.size(), CV_32FC1);
-        flowy.create(I0.size(), CV_32FC1);
-    }
-
-    u1s[0] = flowx;
-    u2s[0] = flowy;
-    if (gamma)
-        u3s[0].create(I0.size(), CV_32FC1);
-
-    I1x_buf.create(I0.size(), CV_32FC1);
-    I1y_buf.create(I0.size(), CV_32FC1);
-
-    I1w_buf.create(I0.size(), CV_32FC1);
-    I1wx_buf.create(I0.size(), CV_32FC1);
-    I1wy_buf.create(I0.size(), CV_32FC1);
-
-    grad_buf.create(I0.size(), CV_32FC1);
-    rho_c_buf.create(I0.size(), CV_32FC1);
-
-    p11_buf.create(I0.size(), CV_32FC1);
-    p12_buf.create(I0.size(), CV_32FC1);
-    p21_buf.create(I0.size(), CV_32FC1);
-    p22_buf.create(I0.size(), CV_32FC1);
-    if (gamma)
-    {
-        p31_buf.create(I0.size(), CV_32FC1);
-        p32_buf.create(I0.size(), CV_32FC1);
-    }
-    diff_buf.create(I0.size(), CV_32FC1);
-
-    // create the scales
-    for (int s = 1; s < nscales; ++s)
-    {
-        cuda::resize(I0s[s-1], I0s[s], Size(), scaleStep, scaleStep);
-        cuda::resize(I1s[s-1], I1s[s], Size(), scaleStep, scaleStep);
-
-        if (I0s[s].cols < 16 || I0s[s].rows < 16)
-        {
-            nscales = s;
-            break;
-        }
-
-        if (useInitialFlow)
-        {
-            cuda::resize(u1s[s-1], u1s[s], Size(), scaleStep, scaleStep);
-            cuda::resize(u2s[s-1], u2s[s], Size(), scaleStep, scaleStep);
-
-            cuda::multiply(u1s[s], Scalar::all(scaleStep), u1s[s]);
-            cuda::multiply(u2s[s], Scalar::all(scaleStep), u2s[s]);
-        }
-        else
-        {
-            u1s[s].create(I0s[s].size(), CV_32FC1);
-            u2s[s].create(I0s[s].size(), CV_32FC1);
-        }
-        if (gamma)
-            u3s[s].create(I0s[s].size(), CV_32FC1);
-    }
-
-    if (!useInitialFlow)
-    {
-        u1s[nscales-1].setTo(Scalar::all(0));
-        u2s[nscales-1].setTo(Scalar::all(0));
-    }
-    if (gamma)
-        u3s[nscales - 1].setTo(Scalar::all(0));
-
-    // pyramidal structure for computing the optical flow
-    for (int s = nscales - 1; s >= 0; --s)
-    {
-        // compute the optical flow at the current scale
-        procOneScale(I0s[s], I1s[s], u1s[s], u2s[s], u3s[s]);
-
-        // if this was the last scale, finish now
-        if (s == 0)
-            break;
-
-        // otherwise, upsample the optical flow
-
-        // zoom the optical flow for the next finer scale
-        cuda::resize(u1s[s], u1s[s - 1], I0s[s - 1].size());
-        cuda::resize(u2s[s], u2s[s - 1], I0s[s - 1].size());
-        if (gamma)
-            cuda::resize(u3s[s], u3s[s - 1], I0s[s - 1].size());
-
-        // scale the optical flow with the appropriate zoom factor
-        cuda::multiply(u1s[s - 1], Scalar::all(1/scaleStep), u1s[s - 1]);
-        cuda::multiply(u2s[s - 1], Scalar::all(1/scaleStep), u2s[s - 1]);
-    }
-}
-
 namespace tvl1flow
 {
-    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy);
-    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho);
+    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy, cudaStream_t stream);
+    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y,
+                      PtrStepSzf u1, PtrStepSzf u2,
+                      PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy,
+                      PtrStepSzf grad, PtrStepSzf rho,
+                      cudaStream_t stream);
     void estimateU(PtrStepSzf I1wx, PtrStepSzf I1wy,
                    PtrStepSzf grad, PtrStepSzf rho_c,
                    PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
                    PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf error,
-                   float l_t, float theta, float gamma, bool calcError);
-    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32, float taut, const float gamma);
+                   float l_t, float theta, float gamma, bool calcError,
+                   cudaStream_t stream);
+    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3,
+                               PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
+                               float taut, float gamma,
+                               cudaStream_t stream);
 }
 
-void cv::cuda::OpticalFlowDual_TVL1_CUDA::procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2, GpuMat& u3)
+namespace
 {
-    using namespace tvl1flow;
-
-    const double scaledEpsilon = epsilon * epsilon * I0.size().area();
-
-    CV_DbgAssert( I1.size() == I0.size() );
-    CV_DbgAssert( I1.type() == I0.type() );
-    CV_DbgAssert( u1.size() == I0.size() );
-    CV_DbgAssert( u2.size() == u1.size() );
-
-    GpuMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows));
-    centeredGradient(I1, I1x, I1y);
-
-    GpuMat I1w = I1w_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat I1wx = I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat I1wy = I1wy_buf(Rect(0, 0, I0.cols, I0.rows));
-
-    GpuMat grad = grad_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat rho_c = rho_c_buf(Rect(0, 0, I0.cols, I0.rows));
-
-    GpuMat p11 = p11_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat p22 = p22_buf(Rect(0, 0, I0.cols, I0.rows));
-    GpuMat p31, p32;
-    if (gamma)
+    class OpticalFlowDual_TVL1_Impl : public OpticalFlowDual_TVL1
     {
-        p31 = p31_buf(Rect(0, 0, I0.cols, I0.rows));
-        p32 = p32_buf(Rect(0, 0, I0.cols, I0.rows));
-    }
-    p11.setTo(Scalar::all(0));
-    p12.setTo(Scalar::all(0));
-    p21.setTo(Scalar::all(0));
-    p22.setTo(Scalar::all(0));
-    if (gamma)
-    {
-        p31.setTo(Scalar::all(0));
-        p32.setTo(Scalar::all(0));
-    }
-
-    GpuMat diff = diff_buf(Rect(0, 0, I0.cols, I0.rows));
-
-    const float l_t = static_cast<float>(lambda * theta);
-    const float taut = static_cast<float>(tau / theta);
-
-    for (int warpings = 0; warpings < warps; ++warpings)
-    {
-        warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c);
-
-        double error = std::numeric_limits<double>::max();
-        double prevError = 0.0;
-        for (int n = 0; error > scaledEpsilon && n < iterations; ++n)
+    public:
+        OpticalFlowDual_TVL1_Impl(double tau, double lambda, double theta, int nscales, int warps, double epsilon,
+                                  int iterations, double scaleStep, double gamma, bool useInitialFlow) :
+            tau_(tau), lambda_(lambda), gamma_(gamma), theta_(theta), nscales_(nscales), warps_(warps),
+            epsilon_(epsilon), iterations_(iterations), scaleStep_(scaleStep), useInitialFlow_(useInitialFlow)
         {
-            // some tweaks to make sum operation less frequently
-            bool calcError = (epsilon > 0) && (n & 0x1) && (prevError < scaledEpsilon);
-            estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, diff, l_t, static_cast<float>(theta), gamma, calcError);
-            if (calcError)
+        }
+
+        virtual double getTau() const { return tau_; }
+        virtual void setTau(double tau) { tau_ = tau; }
+
+        virtual double getLambda() const { return lambda_; }
+        virtual void setLambda(double lambda) { lambda_ = lambda; }
+
+        virtual double getGamma() const { return gamma_; }
+        virtual void setGamma(double gamma) { gamma_ = gamma; }
+
+        virtual double getTheta() const { return theta_; }
+        virtual void setTheta(double theta) { theta_ = theta; }
+
+        virtual int getNumScales() const { return nscales_; }
+        virtual void setNumScales(int nscales) { nscales_ = nscales; }
+
+        virtual int getNumWarps() const { return warps_; }
+        virtual void setNumWarps(int warps) { warps_ = warps; }
+
+        virtual double getEpsilon() const { return epsilon_; }
+        virtual void setEpsilon(double epsilon) { epsilon_ = epsilon; }
+
+        virtual int getNumIterations() const { return iterations_; }
+        virtual void setNumIterations(int iterations) { iterations_ = iterations; }
+
+        virtual double getScaleStep() const { return scaleStep_; }
+        virtual void setScaleStep(double scaleStep) { scaleStep_ = scaleStep; }
+
+        virtual bool getUseInitialFlow() const { return useInitialFlow_; }
+        virtual void setUseInitialFlow(bool useInitialFlow) { useInitialFlow_ = useInitialFlow; }
+
+        virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow, Stream& stream);
+
+    private:
+        double tau_;
+        double lambda_;
+        double gamma_;
+        double theta_;
+        int nscales_;
+        int warps_;
+        double epsilon_;
+        int iterations_;
+        double scaleStep_;
+        bool useInitialFlow_;
+
+    private:
+        void calcImpl(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, Stream& stream);
+        void procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2, GpuMat& u3, Stream& stream);
+
+        std::vector<GpuMat> I0s;
+        std::vector<GpuMat> I1s;
+        std::vector<GpuMat> u1s;
+        std::vector<GpuMat> u2s;
+        std::vector<GpuMat> u3s;
+
+        GpuMat I1x_buf;
+        GpuMat I1y_buf;
+
+        GpuMat I1w_buf;
+        GpuMat I1wx_buf;
+        GpuMat I1wy_buf;
+
+        GpuMat grad_buf;
+        GpuMat rho_c_buf;
+
+        GpuMat p11_buf;
+        GpuMat p12_buf;
+        GpuMat p21_buf;
+        GpuMat p22_buf;
+        GpuMat p31_buf;
+        GpuMat p32_buf;
+
+        GpuMat diff_buf;
+        GpuMat norm_buf;
+    };
+
+    void OpticalFlowDual_TVL1_Impl::calc(InputArray _frame0, InputArray _frame1, InputOutputArray _flow, Stream& stream)
+    {
+        const GpuMat frame0 = _frame0.getGpuMat();
+        const GpuMat frame1 = _frame1.getGpuMat();
+
+        BufferPool pool(stream);
+        GpuMat flowx = pool.getBuffer(frame0.size(), CV_32FC1);
+        GpuMat flowy = pool.getBuffer(frame0.size(), CV_32FC1);
+
+        calcImpl(frame0, frame1, flowx, flowy, stream);
+
+        GpuMat flows[] = {flowx, flowy};
+        cuda::merge(flows, 2, _flow, stream);
+    }
+
+    void OpticalFlowDual_TVL1_Impl::calcImpl(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, Stream& stream)
+    {
+        CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 );
+        CV_Assert( I0.size() == I1.size() );
+        CV_Assert( I0.type() == I1.type() );
+        CV_Assert( !useInitialFlow_ || (flowx.size() == I0.size() && flowx.type() == CV_32FC1 && flowy.size() == flowx.size() && flowy.type() == flowx.type()) );
+        CV_Assert( nscales_ > 0 );
+
+        // allocate memory for the pyramid structure
+        I0s.resize(nscales_);
+        I1s.resize(nscales_);
+        u1s.resize(nscales_);
+        u2s.resize(nscales_);
+        u3s.resize(nscales_);
+
+        I0.convertTo(I0s[0], CV_32F, I0.depth() == CV_8U ? 1.0 : 255.0, stream);
+        I1.convertTo(I1s[0], CV_32F, I1.depth() == CV_8U ? 1.0 : 255.0, stream);
+
+        if (!useInitialFlow_)
+        {
+            flowx.create(I0.size(), CV_32FC1);
+            flowy.create(I0.size(), CV_32FC1);
+        }
+
+        u1s[0] = flowx;
+        u2s[0] = flowy;
+        if (gamma_)
+        {
+            u3s[0].create(I0.size(), CV_32FC1);
+        }
+
+        I1x_buf.create(I0.size(), CV_32FC1);
+        I1y_buf.create(I0.size(), CV_32FC1);
+
+        I1w_buf.create(I0.size(), CV_32FC1);
+        I1wx_buf.create(I0.size(), CV_32FC1);
+        I1wy_buf.create(I0.size(), CV_32FC1);
+
+        grad_buf.create(I0.size(), CV_32FC1);
+        rho_c_buf.create(I0.size(), CV_32FC1);
+
+        p11_buf.create(I0.size(), CV_32FC1);
+        p12_buf.create(I0.size(), CV_32FC1);
+        p21_buf.create(I0.size(), CV_32FC1);
+        p22_buf.create(I0.size(), CV_32FC1);
+        if (gamma_)
+        {
+            p31_buf.create(I0.size(), CV_32FC1);
+            p32_buf.create(I0.size(), CV_32FC1);
+        }
+        diff_buf.create(I0.size(), CV_32FC1);
+
+        // create the scales
+        for (int s = 1; s < nscales_; ++s)
+        {
+            cuda::resize(I0s[s-1], I0s[s], Size(), scaleStep_, scaleStep_, INTER_LINEAR, stream);
+            cuda::resize(I1s[s-1], I1s[s], Size(), scaleStep_, scaleStep_, INTER_LINEAR, stream);
+
+            if (I0s[s].cols < 16 || I0s[s].rows < 16)
             {
-                error = cuda::sum(diff, norm_buf)[0];
-                prevError = error;
+                nscales_ = s;
+                break;
+            }
+
+            if (useInitialFlow_)
+            {
+                cuda::resize(u1s[s-1], u1s[s], Size(), scaleStep_, scaleStep_, INTER_LINEAR, stream);
+                cuda::resize(u2s[s-1], u2s[s], Size(), scaleStep_, scaleStep_, INTER_LINEAR, stream);
+
+                cuda::multiply(u1s[s], Scalar::all(scaleStep_), u1s[s], 1, -1, stream);
+                cuda::multiply(u2s[s], Scalar::all(scaleStep_), u2s[s], 1, -1, stream);
             }
             else
             {
-                error = std::numeric_limits<double>::max();
-                prevError -= scaledEpsilon;
+                u1s[s].create(I0s[s].size(), CV_32FC1);
+                u2s[s].create(I0s[s].size(), CV_32FC1);
+            }
+            if (gamma_)
+            {
+                u3s[s].create(I0s[s].size(), CV_32FC1);
+            }
+        }
+
+        if (!useInitialFlow_)
+        {
+            u1s[nscales_-1].setTo(Scalar::all(0), stream);
+            u2s[nscales_-1].setTo(Scalar::all(0), stream);
+        }
+        if (gamma_)
+        {
+            u3s[nscales_ - 1].setTo(Scalar::all(0), stream);
+        }
+
+        // pyramidal structure for computing the optical flow
+        for (int s = nscales_ - 1; s >= 0; --s)
+        {
+            // compute the optical flow at the current scale
+            procOneScale(I0s[s], I1s[s], u1s[s], u2s[s], u3s[s], stream);
+
+            // if this was the last scale, finish now
+            if (s == 0)
+                break;
+
+            // otherwise, upsample the optical flow
+
+            // zoom the optical flow for the next finer scale
+            cuda::resize(u1s[s], u1s[s - 1], I0s[s - 1].size(), 0, 0, INTER_LINEAR, stream);
+            cuda::resize(u2s[s], u2s[s - 1], I0s[s - 1].size(), 0, 0, INTER_LINEAR, stream);
+            if (gamma_)
+            {
+                cuda::resize(u3s[s], u3s[s - 1], I0s[s - 1].size(), 0, 0, INTER_LINEAR, stream);
             }
-            estimateDualVariables(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma);
+            // scale the optical flow with the appropriate zoom factor
+            cuda::multiply(u1s[s - 1], Scalar::all(1/scaleStep_), u1s[s - 1], 1, -1, stream);
+            cuda::multiply(u2s[s - 1], Scalar::all(1/scaleStep_), u2s[s - 1], 1, -1, stream);
+        }
+    }
+
+    void OpticalFlowDual_TVL1_Impl::procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2, GpuMat& u3, Stream& _stream)
+    {
+        using namespace tvl1flow;
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        const double scaledEpsilon = epsilon_ * epsilon_ * I0.size().area();
+
+        CV_DbgAssert( I1.size() == I0.size() );
+        CV_DbgAssert( I1.type() == I0.type() );
+        CV_DbgAssert( u1.size() == I0.size() );
+        CV_DbgAssert( u2.size() == u1.size() );
+
+        GpuMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows));
+        centeredGradient(I1, I1x, I1y, stream);
+
+        GpuMat I1w = I1w_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat I1wx = I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat I1wy = I1wy_buf(Rect(0, 0, I0.cols, I0.rows));
+
+        GpuMat grad = grad_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat rho_c = rho_c_buf(Rect(0, 0, I0.cols, I0.rows));
+
+        GpuMat p11 = p11_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat p22 = p22_buf(Rect(0, 0, I0.cols, I0.rows));
+        GpuMat p31, p32;
+        if (gamma_)
+        {
+            p31 = p31_buf(Rect(0, 0, I0.cols, I0.rows));
+            p32 = p32_buf(Rect(0, 0, I0.cols, I0.rows));
+        }
+        p11.setTo(Scalar::all(0), _stream);
+        p12.setTo(Scalar::all(0), _stream);
+        p21.setTo(Scalar::all(0), _stream);
+        p22.setTo(Scalar::all(0), _stream);
+        if (gamma_)
+        {
+            p31.setTo(Scalar::all(0), _stream);
+            p32.setTo(Scalar::all(0), _stream);
+        }
+
+        GpuMat diff = diff_buf(Rect(0, 0, I0.cols, I0.rows));
+
+        const float l_t = static_cast<float>(lambda_ * theta_);
+        const float taut = static_cast<float>(tau_ / theta_);
+
+        for (int warpings = 0; warpings < warps_; ++warpings)
+        {
+            warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c, stream);
+
+            double error = std::numeric_limits<double>::max();
+            double prevError = 0.0;
+            for (int n = 0; error > scaledEpsilon && n < iterations_; ++n)
+            {
+                // some tweaks to make the sum operation run less frequently
+                bool calcError = (epsilon_ > 0) && (n & 0x1) && (prevError < scaledEpsilon);
+                estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, diff, l_t, static_cast<float>(theta_), gamma_, calcError, stream);
+                if (calcError)
+                {
+                    _stream.waitForCompletion();
+                    error = cuda::sum(diff, norm_buf)[0];
+                    prevError = error;
+                }
+                else
+                {
+                    error = std::numeric_limits<double>::max();
+                    prevError -= scaledEpsilon;
+                }
+
+                estimateDualVariables(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma_, stream);
+            }
         }
     }
 }
 
-void cv::cuda::OpticalFlowDual_TVL1_CUDA::collectGarbage()
+Ptr<cv::cuda::OpticalFlowDual_TVL1> cv::cuda::OpticalFlowDual_TVL1::create(
+    double tau, double lambda, double theta, int nscales, int warps,
+    double epsilon, int iterations, double scaleStep, double gamma, bool useInitialFlow)
 {
-    I0s.clear();
-    I1s.clear();
-    u1s.clear();
-    u2s.clear();
-    u3s.clear();
-
-    I1x_buf.release();
-    I1y_buf.release();
-
-    I1w_buf.release();
-    I1wx_buf.release();
-    I1wy_buf.release();
-
-    grad_buf.release();
-    rho_c_buf.release();
-
-    p11_buf.release();
-    p12_buf.release();
-    p21_buf.release();
-    p22_buf.release();
-    if (gamma)
-    {
-        p31_buf.release();
-        p32_buf.release();
-    }
-    diff_buf.release();
-    norm_buf.release();
+    return makePtr<OpticalFlowDual_TVL1_Impl>(tau, lambda, theta, nscales, warps,
+                                              epsilon, iterations, scaleStep, gamma, useInitialFlow);
 }
 
 #endif // !defined HAVE_CUDA || defined(CUDA_DISABLER)
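
The TV-L1 rework follows the same shape. A usage sketch, not part of the patch (I0 and I1
are hypothetical CV_8UC1 or CV_32FC1 frames of equal size):

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/cudaoptflow.hpp>

    void tvl1Example(const cv::cuda::GpuMat& I0, const cv::cuda::GpuMat& I1)
    {
        // Defaults match the old constructor (tau 0.25, lambda 0.15, theta 0.3, 5 scales,
        // 5 warps, epsilon 0.01, 300 iterations, scaleStep 0.8, gamma 0, no initial flow).
        cv::Ptr<cv::cuda::OpticalFlowDual_TVL1> tvl1 = cv::cuda::OpticalFlowDual_TVL1::create();
        tvl1->setNumScales(5);      // setters replace the old public fields

        cv::cuda::GpuMat flow;      // CV_32FC2, merged from the internal flowx/flowy buffers
        tvl1->calc(I0, I1, flow);   // Stream::Null() by default; pass a Stream to overlap work
    }

Note that with a positive epsilon the solver periodically calls waitForCompletion() inside
procOneScale() to download the error sum, so a user-supplied stream does not make the whole
computation fully asynchronous unless the epsilon-based early exit is disabled.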