refactor cudaoptflow public API:

* use opaque algorithm interfaces
* add stream support
Vladislav Vinogradov 2014-12-31 15:36:15 +03:00
parent 19c6bbe7d9
commit 381216aa54
7 changed files with 1258 additions and 925 deletions


@@ -61,49 +61,94 @@ namespace cv { namespace cuda {
//! @addtogroup cudaoptflow
//! @{
/** @brief Class computing the optical flow for two images using the Brox et al. Optical Flow algorithm
(@cite Brox2004).
//
// Interface
//
/** @brief Base interface for dense optical flow algorithms.
*/
class CV_EXPORTS BroxOpticalFlow
class CV_EXPORTS DenseOpticalFlow : public Algorithm
{
public:
BroxOpticalFlow(float alpha_, float gamma_, float scale_factor_, int inner_iterations_, int outer_iterations_, int solver_iterations_) :
alpha(alpha_), gamma(gamma_), scale_factor(scale_factor_),
inner_iterations(inner_iterations_), outer_iterations(outer_iterations_), solver_iterations(solver_iterations_)
{
}
/** @brief Calculates a dense optical flow.
//! Compute optical flow
//! frame0 - source frame (supports only CV_32FC1 type)
//! frame1 - frame to track (with the same size and type as frame0)
//! u - flow horizontal component (along x axis)
//! v - flow vertical component (along y axis)
void operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& stream = Stream::Null());
//! flow smoothness
float alpha;
//! gradient constancy importance
float gamma;
//! pyramid scale factor
float scale_factor;
//! number of lagged non-linearity iterations (inner loop)
int inner_iterations;
//! number of warping iterations (number of pyramid levels)
int outer_iterations;
//! number of linear system solver iterations
int solver_iterations;
GpuMat buf;
@param I0 first input image.
@param I1 second input image of the same size and the same type as I0.
@param flow computed flow image that has the same size as I0 and type CV_32FC2.
@param stream Stream for the asynchronous version.
*/
virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow, Stream& stream = Stream::Null()) = 0;
};
/** @brief Class used for calculating an optical flow.
/** @brief Base interface for sparse optical flow algorithms.
*/
class CV_EXPORTS SparseOpticalFlow : public Algorithm
{
public:
/** @brief Calculates a sparse optical flow.
The class can calculate an optical flow for a sparse feature set or dense optical flow using the
@param prevImg First input image.
@param nextImg Second input image of the same size and the same type as prevImg.
@param prevPts Vector of 2D points for which the flow needs to be found.
@param nextPts Output vector of 2D points containing the calculated new positions of input features in the second image.
@param status Output status vector. Each element of the vector is set to 1 if the
flow for the corresponding features has been found. Otherwise, it is set to 0.
@param err Optional output vector that contains error response for each point (inverse confidence).
@param stream Stream for the asynchronous version.
*/
virtual void calc(InputArray prevImg, InputArray nextImg,
InputArray prevPts, InputOutputArray nextPts,
OutputArray status,
OutputArray err = cv::noArray(),
Stream& stream = Stream::Null()) = 0;
};
//
// BroxOpticalFlow
//
/** @brief Class computing the optical flow for two images using the Brox et al. Optical Flow algorithm (@cite Brox2004).
*/
class CV_EXPORTS BroxOpticalFlow : public DenseOpticalFlow
{
public:
virtual double getFlowSmoothness() const = 0;
virtual void setFlowSmoothness(double alpha) = 0;
virtual double getGradientConstancyImportance() const = 0;
virtual void setGradientConstancyImportance(double gamma) = 0;
virtual double getPyramidScaleFactor() const = 0;
virtual void setPyramidScaleFactor(double scale_factor) = 0;
//! number of lagged non-linearity iterations (inner loop)
virtual int getInnerIterations() const = 0;
virtual void setInnerIterations(int inner_iterations) = 0;
//! number of warping iterations (number of pyramid levels)
virtual int getOuterIterations() const = 0;
virtual void setOuterIterations(int outer_iterations) = 0;
//! number of linear system solver iterations
virtual int getSolverIterations() const = 0;
virtual void setSolverIterations(int solver_iterations) = 0;
static Ptr<BroxOpticalFlow> create(
double alpha = 0.197,
double gamma = 50.0,
double scale_factor = 0.8,
int inner_iterations = 5,
int outer_iterations = 150,
int solver_iterations = 10);
};
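With the opaque interface, algorithm state lives behind a cv::Ptr returned by create(), and calc() writes both flow components packed into a single CV_32FC2 image. A minimal usage sketch (frame0 and frame1 are placeholder names for CV_32FC1 GpuMats already on the device):
cv::cuda::Stream stream;
cv::Ptr<cv::cuda::BroxOpticalFlow> brox =
    cv::cuda::BroxOpticalFlow::create(0.197, 50.0, 0.8, 5, 150, 10);
cv::cuda::GpuMat flow;                    // allocated as CV_32FC2 by calc()
brox->calc(frame0, frame1, flow, stream); // asynchronous on the given stream
stream.waitForCompletion();               // flow is now ready on the device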
//
// PyrLKOpticalFlow
//
/** @brief Class used for calculating a sparse optical flow.
The class can calculate an optical flow for a sparse feature set using the
iterative Lucas-Kanade method with pyramids.
@sa calcOpticalFlowPyrLK
@@ -112,158 +157,116 @@ iterative Lucas-Kanade method with pyramids.
- An example of the Lucas Kanade optical flow algorithm can be found at
opencv_source_code/samples/gpu/pyrlk_optical_flow.cpp
*/
class CV_EXPORTS PyrLKOpticalFlow
class CV_EXPORTS SparsePyrLKOpticalFlow : public SparseOpticalFlow
{
public:
PyrLKOpticalFlow();
virtual Size getWinSize() const = 0;
virtual void setWinSize(Size winSize) = 0;
/** @brief Calculate an optical flow for a sparse feature set.
virtual int getMaxLevel() const = 0;
virtual void setMaxLevel(int maxLevel) = 0;
@param prevImg First 8-bit input image (supports both grayscale and color images).
@param nextImg Second input image of the same size and the same type as prevImg.
@param prevPts Vector of 2D points for which the flow needs to be found. It must be a one-row
matrix of CV_32FC2 type.
@param nextPts Output vector of 2D points (with single-precision floating-point coordinates)
containing the calculated new positions of input features in the second image. When useInitialFlow
is true, the vector must have the same size as in the input.
@param status Output status vector (CV_8UC1 type). Each element of the vector is set to 1 if the
flow for the corresponding features has been found. Otherwise, it is set to 0.
@param err Output vector (CV_32FC1 type) that contains the difference between patches around the
original and moved points, or the minimum eigenvalue if getMinEigenVals is set. It can be NULL
if not needed.
virtual int getNumIters() const = 0;
virtual void setNumIters(int iters) = 0;
@sa calcOpticalFlowPyrLK
*/
void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
GpuMat& status, GpuMat* err = 0);
virtual bool getUseInitialFlow() const = 0;
virtual void setUseInitialFlow(bool useInitialFlow) = 0;
/** @brief Calculate dense optical flow.
@param prevImg First 8-bit grayscale input image.
@param nextImg Second input image of the same size and the same type as prevImg.
@param u Horizontal component of the optical flow of the same size as input images, 32-bit
floating-point, single-channel
@param v Vertical component of the optical flow of the same size as input images, 32-bit
floating-point, single-channel
@param err Output vector (CV_32FC1 type) that contains the difference between patches around the
original and moved points, or the minimum eigenvalue if getMinEigenVals is set. It can be NULL
if not needed.
*/
void dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err = 0);
/** @brief Releases inner buffers memory.
*/
void releaseMemory();
Size winSize;
int maxLevel;
int iters;
bool useInitialFlow;
private:
std::vector<GpuMat> prevPyr_;
std::vector<GpuMat> nextPyr_;
GpuMat buf_;
GpuMat uPyr_[2];
GpuMat vPyr_[2];
static Ptr<SparsePyrLKOpticalFlow> create(
Size winSize = Size(21, 21),
int maxLevel = 3,
int iters = 30,
bool useInitialFlow = false);
};
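A minimal sketch of the sparse interface (prevImg, nextImg and prevPts are placeholders; prevPts must be a one-row CV_32FC2 GpuMat of input point coordinates):
cv::cuda::Stream stream;
cv::Ptr<cv::cuda::SparsePyrLKOpticalFlow> lk =
    cv::cuda::SparsePyrLKOpticalFlow::create(cv::Size(21, 21), 3, 30, false);
cv::cuda::GpuMat nextPts, status, err;
lk->calc(prevImg, nextImg, prevPts, nextPts, status, err, stream);
stream.waitForCompletion();
// status is CV_8UC1: 1 where tracking succeeded, 0 otherwise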
/** @brief Class computing a dense optical flow using the Gunnar Farneback algorithm.
/** @brief Class used for calculating a dense optical flow.
The class can calculate a dense optical flow using the
iterative Lucas-Kanade method with pyramids.
*/
class CV_EXPORTS FarnebackOpticalFlow
class CV_EXPORTS DensePyrLKOpticalFlow : public DenseOpticalFlow
{
public:
FarnebackOpticalFlow()
{
numLevels = 5;
pyrScale = 0.5;
fastPyramids = false;
winSize = 13;
numIters = 10;
polyN = 5;
polySigma = 1.1;
flags = 0;
}
virtual Size getWinSize() const = 0;
virtual void setWinSize(Size winSize) = 0;
int numLevels;
double pyrScale;
bool fastPyramids;
int winSize;
int numIters;
int polyN;
double polySigma;
int flags;
virtual int getMaxLevel() const = 0;
virtual void setMaxLevel(int maxLevel) = 0;
/** @brief Computes a dense optical flow using the Gunnar Farneback algorithm.
virtual int getNumIters() const = 0;
virtual void setNumIters(int iters) = 0;
@param frame0 First 8-bit gray-scale input image
@param frame1 Second 8-bit gray-scale input image
@param flowx Flow horizontal component
@param flowy Flow vertical component
@param s Stream
virtual bool getUseInitialFlow() const = 0;
virtual void setUseInitialFlow(bool useInitialFlow) = 0;
@sa calcOpticalFlowFarneback
*/
void operator ()(const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s = Stream::Null());
/** @brief Releases unused auxiliary memory buffers.
*/
void releaseMemory()
{
frames_[0].release();
frames_[1].release();
pyrLevel_[0].release();
pyrLevel_[1].release();
M_.release();
bufM_.release();
R_[0].release();
R_[1].release();
blurredFrame_[0].release();
blurredFrame_[1].release();
pyramid0_.clear();
pyramid1_.clear();
}
private:
void prepareGaussian(
int n, double sigma, float *g, float *xg, float *xxg,
double &ig11, double &ig03, double &ig33, double &ig55);
void setPolynomialExpansionConsts(int n, double sigma);
void updateFlow_boxFilter(
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
void updateFlow_gaussianBlur(
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
GpuMat frames_[2];
GpuMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2];
std::vector<GpuMat> pyramid0_, pyramid1_;
static Ptr<DensePyrLKOpticalFlow> create(
Size winSize = Size(13, 13),
int maxLevel = 3,
int iters = 30,
bool useInitialFlow = false);
};
// Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method
//
// see reference:
// [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
// [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
class CV_EXPORTS OpticalFlowDual_TVL1_CUDA
// FarnebackOpticalFlow
//
/** @brief Class computing a dense optical flow using the Gunnar Farneback algorithm.
*/
class CV_EXPORTS FarnebackOpticalFlow : public DenseOpticalFlow
{
public:
OpticalFlowDual_TVL1_CUDA();
virtual int getNumLevels() const = 0;
virtual void setNumLevels(int numLevels) = 0;
void operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy);
virtual double getPyrScale() const = 0;
virtual void setPyrScale(double pyrScale) = 0;
void collectGarbage();
virtual bool getFastPyramids() const = 0;
virtual void setFastPyramids(bool fastPyramids) = 0;
virtual int getWinSize() const = 0;
virtual void setWinSize(int winSize) = 0;
virtual int getNumIters() const = 0;
virtual void setNumIters(int numIters) = 0;
virtual int getPolyN() const = 0;
virtual void setPolyN(int polyN) = 0;
virtual double getPolySigma() const = 0;
virtual void setPolySigma(double polySigma) = 0;
virtual int getFlags() const = 0;
virtual void setFlags(int flags) = 0;
static Ptr<FarnebackOpticalFlow> create(
int numLevels = 5,
double pyrScale = 0.5,
bool fastPyramids = false,
int winSize = 13,
int numIters = 10,
int polyN = 5,
double polySigma = 1.1,
int flags = 0);
};
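Unlike the removed operator(), which produced separate flowx/flowy images, calc() packs both components into one CV_32FC2 image; cuda::split recovers the old layout when needed. A sketch (gray0 and gray1 are placeholder CV_8UC1 frames):
cv::Ptr<cv::cuda::FarnebackOpticalFlow> farn =
    cv::cuda::FarnebackOpticalFlow::create();   // defaults as listed above
cv::cuda::GpuMat flow, planes[2];
farn->calc(gray0, gray1, flow);
cv::cuda::split(flow, planes);                  // planes[0] = x, planes[1] = y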
//
// OpticalFlowDual_TVL1
//
/** @brief Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method.
*
* @sa C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
* @sa Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
*/
class CV_EXPORTS OpticalFlowDual_TVL1 : public DenseOpticalFlow
{
public:
/**
* Time step of the numerical scheme.
*/
double tau;
virtual double getTau() const = 0;
virtual void setTau(double tau) = 0;
/**
* Weight parameter for the data term, attachment parameter.
@@ -271,7 +274,8 @@ public:
* The smaller this parameter is, the smoother the solutions we obtain.
* It depends on the range of motions of the images, so its value should be adapted to each image sequence.
*/
double lambda;
virtual double getLambda() const = 0;
virtual void setLambda(double lambda) = 0;
/**
* Weight parameter for (u - v)^2, tightness parameter.
@@ -279,20 +283,23 @@ public:
* In theory, it should have a small value in order to maintain both parts in correspondence.
* The method is stable for a large range of values of this parameter.
*/
virtual double getGamma() const = 0;
virtual void setGamma(double gamma) = 0;
double gamma;
/**
* Parameter used for motion estimation. It adds a variable allowing for illumination variations.
* Set this parameter to 1 if you have varying illumination.
* See: Chambolle et al, A First-Order Primal-Dual Algorithm for Convex Problems with Applications to Imaging,
* Journal of Mathematical Imaging and Vision, May 2011, Vol. 40, Issue 1, pp. 120-145.
*/
double theta;
virtual double getTheta() const = 0;
virtual void setTheta(double theta) = 0;
/**
* Number of scales used to create the pyramid of images.
*/
int nscales;
virtual int getNumScales() const = 0;
virtual void setNumScales(int nscales) = 0;
/**
* Number of warpings per scale.
@@ -300,51 +307,39 @@ public:
* This is a parameter that assures the stability of the method.
* It also affects the running time, so it is a compromise between speed and accuracy.
*/
int warps;
virtual int getNumWarps() const = 0;
virtual void setNumWarps(int warps) = 0;
/**
* Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time.
* A small value will yield more accurate solutions at the expense of a slower convergence.
*/
double epsilon;
virtual double getEpsilon() const = 0;
virtual void setEpsilon(double epsilon) = 0;
/**
* Stopping criterion iterations number used in the numerical scheme.
*/
int iterations;
virtual int getNumIterations() const = 0;
virtual void setNumIterations(int iterations) = 0;
double scaleStep;
virtual double getScaleStep() const = 0;
virtual void setScaleStep(double scaleStep) = 0;
bool useInitialFlow;
virtual bool getUseInitialFlow() const = 0;
virtual void setUseInitialFlow(bool useInitialFlow) = 0;
private:
void procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2, GpuMat& u3);
std::vector<GpuMat> I0s;
std::vector<GpuMat> I1s;
std::vector<GpuMat> u1s;
std::vector<GpuMat> u2s;
std::vector<GpuMat> u3s;
GpuMat I1x_buf;
GpuMat I1y_buf;
GpuMat I1w_buf;
GpuMat I1wx_buf;
GpuMat I1wy_buf;
GpuMat grad_buf;
GpuMat rho_c_buf;
GpuMat p11_buf;
GpuMat p12_buf;
GpuMat p21_buf;
GpuMat p22_buf;
GpuMat p31_buf;
GpuMat p32_buf;
GpuMat diff_buf;
GpuMat norm_buf;
static Ptr<OpticalFlowDual_TVL1> create(
double tau = 0.25,
double lambda = 0.15,
double theta = 0.3,
int nscales = 5,
int warps = 5,
double epsilon = 0.01,
int iterations = 300,
double scaleStep = 0.8,
double gamma = 0.0,
bool useInitialFlow = false);
};
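Parameters that used to be public data members are now reached through virtual getters and setters, so they can be tuned between calc() calls without exposing the implementation. A sketch (gray0 and gray1 are placeholder grayscale frames):
cv::Ptr<cv::cuda::OpticalFlowDual_TVL1> tvl1 =
    cv::cuda::OpticalFlowDual_TVL1::create();
tvl1->setNumScales(3);    // fewer pyramid scales, faster but less accurate
tvl1->setEpsilon(0.05);   // looser stopping criterion
cv::cuda::GpuMat flow;
tvl1->calc(gray0, gray1, flow);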
//! @}


@@ -47,84 +47,148 @@ using namespace cv::cuda;
#if !defined (HAVE_CUDA) || !defined (HAVE_OPENCV_CUDALEGACY) || defined (CUDA_DISABLER)
void cv::cuda::BroxOpticalFlow::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
Ptr<BroxOpticalFlow> cv::cuda::BroxOpticalFlow::create(double, double, double, int, int, int) { throw_no_cuda(); return Ptr<BroxOpticalFlow>(); }
#else
namespace
{
size_t getBufSize(const NCVBroxOpticalFlowDescriptor& desc, const NCVMatrix<Ncv32f>& frame0, const NCVMatrix<Ncv32f>& frame1,
NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v, const cudaDeviceProp& devProp)
namespace {
class BroxOpticalFlowImpl : public BroxOpticalFlow
{
NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
public:
BroxOpticalFlowImpl(double alpha, double gamma, double scale_factor,
int inner_iterations, int outer_iterations, int solver_iterations) :
alpha_(alpha), gamma_(gamma), scale_factor_(scale_factor),
inner_iterations_(inner_iterations), outer_iterations_(outer_iterations),
solver_iterations_(solver_iterations)
{
}
virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow, Stream& stream);
virtual double getFlowSmoothness() const { return alpha_; }
virtual void setFlowSmoothness(double alpha) { alpha_ = static_cast<float>(alpha); }
virtual double getGradientConstancyImportance() const { return gamma_; }
virtual void setGradientConstancyImportance(double gamma) { gamma_ = static_cast<float>(gamma); }
virtual double getPyramidScaleFactor() const { return scale_factor_; }
virtual void setPyramidScaleFactor(double scale_factor) { scale_factor_ = static_cast<float>(scale_factor); }
//! number of lagged non-linearity iterations (inner loop)
virtual int getInnerIterations() const { return inner_iterations_; }
virtual void setInnerIterations(int inner_iterations) { inner_iterations_ = inner_iterations; }
//! number of warping iterations (number of pyramid levels)
virtual int getOuterIterations() const { return outer_iterations_; }
virtual void setOuterIterations(int outer_iterations) { outer_iterations_ = outer_iterations; }
//! number of linear system solver iterations
virtual int getSolverIterations() const { return solver_iterations_; }
virtual void setSolverIterations(int solver_iterations) { solver_iterations_ = solver_iterations; }
private:
//! flow smoothness
float alpha_;
//! gradient constancy importance
float gamma_;
//! pyramid scale factor
float scale_factor_;
//! number of lagged non-linearity iterations (inner loop)
int inner_iterations_;
//! number of warping iterations (number of pyramid levels)
int outer_iterations_;
//! number of linear system solver iterations
int solver_iterations_;
};
static size_t getBufSize(const NCVBroxOpticalFlowDescriptor& desc,
const NCVMatrix<Ncv32f>& frame0, const NCVMatrix<Ncv32f>& frame1,
NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v,
size_t textureAlignment)
{
NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(textureAlignment));
ncvSafeCall( NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0) );
return gpuCounter.maxSize();
}
static void outputHandler(const String &msg)
{
CV_Error(cv::Error::GpuApiCallError, msg.c_str());
}
void BroxOpticalFlowImpl::calc(InputArray _I0, InputArray _I1, InputOutputArray _flow, Stream& stream)
{
const GpuMat frame0 = _I0.getGpuMat();
const GpuMat frame1 = _I1.getGpuMat();
CV_Assert( frame0.type() == CV_32FC1 );
CV_Assert( frame1.size() == frame0.size() && frame1.type() == frame0.type() );
ncvSetDebugOutputHandler(outputHandler);
BufferPool pool(stream);
GpuMat u = pool.getBuffer(frame0.size(), CV_32FC1);
GpuMat v = pool.getBuffer(frame0.size(), CV_32FC1);
NCVBroxOpticalFlowDescriptor desc;
desc.alpha = alpha_;
desc.gamma = gamma_;
desc.scale_factor = scale_factor_;
desc.number_of_inner_iterations = inner_iterations_;
desc.number_of_outer_iterations = outer_iterations_;
desc.number_of_solver_iterations = solver_iterations_;
NCVMemSegment frame0MemSeg;
frame0MemSeg.begin.memtype = NCVMemoryTypeDevice;
frame0MemSeg.begin.ptr = const_cast<uchar*>(frame0.data);
frame0MemSeg.size = frame0.step * frame0.rows;
NCVMemSegment frame1MemSeg;
frame1MemSeg.begin.memtype = NCVMemoryTypeDevice;
frame1MemSeg.begin.ptr = const_cast<uchar*>(frame1.data);
frame1MemSeg.size = frame1.step * frame1.rows;
NCVMemSegment uMemSeg;
uMemSeg.begin.memtype = NCVMemoryTypeDevice;
uMemSeg.begin.ptr = u.ptr();
uMemSeg.size = u.step * u.rows;
NCVMemSegment vMemSeg;
vMemSeg.begin.memtype = NCVMemoryTypeDevice;
vMemSeg.begin.ptr = v.ptr();
vMemSeg.size = v.step * v.rows;
DeviceInfo devInfo;
size_t textureAlignment = devInfo.textureAlignment();
NCVMatrixReuse<Ncv32f> frame0Mat(frame0MemSeg, static_cast<Ncv32u>(textureAlignment), frame0.cols, frame0.rows, static_cast<Ncv32u>(frame0.step));
NCVMatrixReuse<Ncv32f> frame1Mat(frame1MemSeg, static_cast<Ncv32u>(textureAlignment), frame1.cols, frame1.rows, static_cast<Ncv32u>(frame1.step));
NCVMatrixReuse<Ncv32f> uMat(uMemSeg, static_cast<Ncv32u>(textureAlignment), u.cols, u.rows, static_cast<Ncv32u>(u.step));
NCVMatrixReuse<Ncv32f> vMat(vMemSeg, static_cast<Ncv32u>(textureAlignment), v.cols, v.rows, static_cast<Ncv32u>(v.step));
size_t bufSize = getBufSize(desc, frame0Mat, frame1Mat, uMat, vMat, textureAlignment);
GpuMat buf = pool.getBuffer(1, static_cast<int>(bufSize), CV_8UC1);
NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(textureAlignment), buf.ptr());
ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, StreamAccessor::getStream(stream)) );
GpuMat flows[] = {u, v};
cuda::merge(flows, 2, _flow, stream);
}
}
namespace
Ptr<BroxOpticalFlow> cv::cuda::BroxOpticalFlow::create(double alpha, double gamma, double scale_factor, int inner_iterations, int outer_iterations, int solver_iterations)
{
static void outputHandler(const String &msg) { CV_Error(cv::Error::GpuApiCallError, msg.c_str()); }
}
void cv::cuda::BroxOpticalFlow::operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& s)
{
ncvSetDebugOutputHandler(outputHandler);
CV_Assert(frame0.type() == CV_32FC1);
CV_Assert(frame1.size() == frame0.size() && frame1.type() == frame0.type());
u.create(frame0.size(), CV_32FC1);
v.create(frame0.size(), CV_32FC1);
cudaDeviceProp devProp;
cudaSafeCall( cudaGetDeviceProperties(&devProp, getDevice()) );
NCVBroxOpticalFlowDescriptor desc;
desc.alpha = alpha;
desc.gamma = gamma;
desc.scale_factor = scale_factor;
desc.number_of_inner_iterations = inner_iterations;
desc.number_of_outer_iterations = outer_iterations;
desc.number_of_solver_iterations = solver_iterations;
NCVMemSegment frame0MemSeg;
frame0MemSeg.begin.memtype = NCVMemoryTypeDevice;
frame0MemSeg.begin.ptr = const_cast<uchar*>(frame0.data);
frame0MemSeg.size = frame0.step * frame0.rows;
NCVMemSegment frame1MemSeg;
frame1MemSeg.begin.memtype = NCVMemoryTypeDevice;
frame1MemSeg.begin.ptr = const_cast<uchar*>(frame1.data);
frame1MemSeg.size = frame1.step * frame1.rows;
NCVMemSegment uMemSeg;
uMemSeg.begin.memtype = NCVMemoryTypeDevice;
uMemSeg.begin.ptr = u.ptr();
uMemSeg.size = u.step * u.rows;
NCVMemSegment vMemSeg;
vMemSeg.begin.memtype = NCVMemoryTypeDevice;
vMemSeg.begin.ptr = v.ptr();
vMemSeg.size = v.step * v.rows;
NCVMatrixReuse<Ncv32f> frame0Mat(frame0MemSeg, static_cast<Ncv32u>(devProp.textureAlignment), frame0.cols, frame0.rows, static_cast<Ncv32u>(frame0.step));
NCVMatrixReuse<Ncv32f> frame1Mat(frame1MemSeg, static_cast<Ncv32u>(devProp.textureAlignment), frame1.cols, frame1.rows, static_cast<Ncv32u>(frame1.step));
NCVMatrixReuse<Ncv32f> uMat(uMemSeg, static_cast<Ncv32u>(devProp.textureAlignment), u.cols, u.rows, static_cast<Ncv32u>(u.step));
NCVMatrixReuse<Ncv32f> vMat(vMemSeg, static_cast<Ncv32u>(devProp.textureAlignment), v.cols, v.rows, static_cast<Ncv32u>(v.step));
cudaStream_t stream = StreamAccessor::getStream(s);
size_t bufSize = getBufSize(desc, frame0Mat, frame1Mat, uMat, vMat, devProp);
ensureSizeIsEnough(1, static_cast<int>(bufSize), CV_8UC1, buf);
NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), buf.ptr());
ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream) );
return makePtr<BroxOpticalFlowImpl>(alpha, gamma, scale_factor, inner_iterations, outer_iterations, solver_iterations);
}
#endif /* HAVE_CUDA */
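The implementation above replaces the persistent GpuMat buf member with temporaries drawn from the stream's BufferPool, so calls on different streams no longer share hidden state. The pattern in isolation (a sketch, not the exact code; process is a hypothetical name):
void process(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cv::cuda::Stream& stream)
{
    cv::cuda::BufferPool pool(stream);                       // allocator tied to this stream
    cv::cuda::GpuMat tmp = pool.getBuffer(src.size(), CV_32FC1);
    src.convertTo(tmp, CV_32F, stream);                      // enqueued on the stream
    tmp.copyTo(dst, stream);
}                                                            // tmp returns to the pool here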


@@ -472,16 +472,16 @@ namespace pyrlk
}
}
void loadConstants(int2 winSize, int iters)
void loadConstants(int2 winSize, int iters, cudaStream_t stream)
{
cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_winSize_y, &winSize.y, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_x, &winSize.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_y, &winSize.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_x, &halfWin.x, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_y, &halfWin.y, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_x, &halfWin.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_y, &halfWin.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbolAsync(c_iters, &iters, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
}
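The constants above are now uploaded with cudaMemcpyToSymbolAsync on the caller's stream, so the copies are ordered with the kernels that read them instead of forcing an implicit synchronization. The pattern in isolation (c_example and uploadExample are hypothetical names):
__constant__ int c_example;

void uploadExample(int value, cudaStream_t stream)
{
    // enqueued on 'stream'; stream may be 0 (the legacy default stream)
    cudaSafeCall( cudaMemcpyToSymbolAsync(c_example, &value, sizeof(int), 0,
                                          cudaMemcpyHostToDevice, stream) );
}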
void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,


@@ -66,15 +66,16 @@ namespace tvl1flow
dy(y, x) = 0.5f * (src(::min(y + 1, src.rows - 1), x) - src(::max(y - 1, 0), x));
}
void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy)
void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
centeredGradientKernel<<<grid, block>>>(src, dx, dy);
centeredGradientKernel<<<grid, block, 0, stream>>>(src, dx, dy);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
if (!stream)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
@@ -164,7 +165,10 @@ namespace tvl1flow
rho(y, x) = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
}
void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho)
void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y,
PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx,
PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho,
cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
@@ -173,10 +177,11 @@ namespace tvl1flow
bindTexture(&tex_I1x, I1x);
bindTexture(&tex_I1y, I1y);
warpBackwardKernel<<<grid, block>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
warpBackwardKernel<<<grid, block, 0, stream>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
if (!stream)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
@@ -292,15 +297,17 @@ namespace tvl1flow
PtrStepSzf grad, PtrStepSzf rho_c,
PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf error,
float l_t, float theta, float gamma, bool calcError)
float l_t, float theta, float gamma, bool calcError,
cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(I1wx.cols, block.x), divUp(I1wx.rows, block.y));
estimateUKernel<<<grid, block>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, error, l_t, theta, gamma, calcError);
estimateUKernel<<<grid, block, 0, stream>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, error, l_t, theta, gamma, calcError);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
if (!stream)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
@@ -346,15 +353,19 @@ namespace tvl1flow
}
}
void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32, float taut, float gamma)
void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3,
PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
float taut, float gamma,
cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(u1.cols, block.x), divUp(u1.rows, block.y));
estimateDualVariablesKernel<<<grid, block>>>(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma);
estimateDualVariablesKernel<<<grid, block, 0, stream>>>(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
if (!stream)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
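Every wrapper in this file follows the same convention after the change: launch on the supplied stream, check the launch, and block only when the caller passed the null stream, which preserves the old synchronous behaviour for legacy callers. Distilled into a self-contained sketch (scaleKernel and scale are hypothetical names):
__global__ void scaleKernel(float* data, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= 2.0f;
}

void scale(float* data, int n, cudaStream_t stream)
{
    const dim3 block(256);
    const dim3 grid((n + block.x - 1) / block.x);
    scaleKernel<<<grid, block, 0, stream>>>(data, n);
    cudaSafeCall( cudaGetLastError() );
    if (!stream)                  // null stream: keep the old blocking behaviour
        cudaSafeCall( cudaDeviceSynchronize() );
}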


@@ -42,23 +42,21 @@
#include "precomp.hpp"
#define MIN_SIZE 32
#define S(x) StreamAccessor::getStream(x)
// CUDA resize() is fast, but it differs from the CPU analog. Disabling this flag
// leads to inefficient code. It's for debug purposes only.
#define ENABLE_CUDA_RESIZE 1
using namespace cv;
using namespace cv::cuda;
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::cuda::FarnebackOpticalFlow::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
Ptr<FarnebackOpticalFlow> cv::cuda::FarnebackOpticalFlow::create(int, double, bool, int, int, int, double, int) { throw_no_cuda(); return Ptr<FarnebackOpticalFlow>(); }
#else
#define MIN_SIZE 32
// CUDA resize() is fast, but it differs from the CPU analog. Disabling this flag
// leads to inefficient code. It's for debug purposes only.
#define ENABLE_CUDA_RESIZE 1
namespace cv { namespace cuda { namespace device { namespace optflow_farneback
{
void setPolynomialExpansionConsts(
@@ -76,8 +74,6 @@ namespace cv { namespace cuda { namespace device { namespace optflow_farneback
void updateFlowGpu(
const PtrStepSzf M, PtrStepSzf flowx, PtrStepSzf flowy, cudaStream_t stream);
/*void boxFilterGpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);*/
void boxFilter5Gpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);
void boxFilter5Gpu_CC11(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);
@@ -93,10 +89,93 @@ namespace cv { namespace cuda { namespace device { namespace optflow_farneback
void gaussianBlur5Gpu_CC11(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderType, cudaStream_t stream);
}}}} // namespace cv { namespace cuda { namespace cudev { namespace optflow_farneback
}}}}
namespace
{
class FarnebackOpticalFlowImpl : public FarnebackOpticalFlow
{
public:
FarnebackOpticalFlowImpl(int numLevels, double pyrScale, bool fastPyramids, int winSize,
int numIters, int polyN, double polySigma, int flags) :
numLevels_(numLevels), pyrScale_(pyrScale), fastPyramids_(fastPyramids), winSize_(winSize),
numIters_(numIters), polyN_(polyN), polySigma_(polySigma), flags_(flags)
{
}
virtual int getNumLevels() const { return numLevels_; }
virtual void setNumLevels(int numLevels) { numLevels_ = numLevels; }
virtual double getPyrScale() const { return pyrScale_; }
virtual void setPyrScale(double pyrScale) { pyrScale_ = pyrScale; }
virtual bool getFastPyramids() const { return fastPyramids_; }
virtual void setFastPyramids(bool fastPyramids) { fastPyramids_ = fastPyramids; }
virtual int getWinSize() const { return winSize_; }
virtual void setWinSize(int winSize) { winSize_ = winSize; }
virtual int getNumIters() const { return numIters_; }
virtual void setNumIters(int numIters) { numIters_ = numIters; }
virtual int getPolyN() const { return polyN_; }
virtual void setPolyN(int polyN) { polyN_ = polyN; }
virtual double getPolySigma() const { return polySigma_; }
virtual void setPolySigma(double polySigma) { polySigma_ = polySigma; }
virtual int getFlags() const { return flags_; }
virtual void setFlags(int flags) { flags_ = flags; }
virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow, Stream& stream);
private:
int numLevels_;
double pyrScale_;
bool fastPyramids_;
int winSize_;
int numIters_;
int polyN_;
double polySigma_;
int flags_;
private:
void prepareGaussian(
int n, double sigma, float *g, float *xg, float *xxg,
double &ig11, double &ig03, double &ig33, double &ig55);
void setPolynomialExpansionConsts(int n, double sigma);
void updateFlow_boxFilter(
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
void updateFlow_gaussianBlur(
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
void calcImpl(const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &stream);
GpuMat frames_[2];
GpuMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2];
std::vector<GpuMat> pyramid0_, pyramid1_;
};
void FarnebackOpticalFlowImpl::calc(InputArray _frame0, InputArray _frame1, InputOutputArray _flow, Stream& stream)
{
const GpuMat frame0 = _frame0.getGpuMat();
const GpuMat frame1 = _frame1.getGpuMat();
BufferPool pool(stream);
GpuMat flowx = pool.getBuffer(frame0.size(), CV_32FC1);
GpuMat flowy = pool.getBuffer(frame0.size(), CV_32FC1);
calcImpl(frame0, frame1, flowx, flowy, stream);
GpuMat flows[] = {flowx, flowy};
cuda::merge(flows, 2, _flow, stream);
}
GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat& mat)
{
if (!mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols)
@@ -104,285 +183,287 @@ namespace
return mat = GpuMat(rows, cols, type);
}
}
void cv::cuda::FarnebackOpticalFlow::prepareGaussian(
int n, double sigma, float *g, float *xg, float *xxg,
double &ig11, double &ig03, double &ig33, double &ig55)
{
double s = 0.;
for (int x = -n; x <= n; x++)
{
g[x] = (float)std::exp(-x*x/(2*sigma*sigma));
s += g[x];
}
s = 1./s;
for (int x = -n; x <= n; x++)
{
g[x] = (float)(g[x]*s);
xg[x] = (float)(x*g[x]);
xxg[x] = (float)(x*x*g[x]);
}
Mat_<double> G(6, 6);
G.setTo(0);
for (int y = -n; y <= n; y++)
void FarnebackOpticalFlowImpl::prepareGaussian(
int n, double sigma, float *g, float *xg, float *xxg,
double &ig11, double &ig03, double &ig33, double &ig55)
{
double s = 0.;
for (int x = -n; x <= n; x++)
{
G(0,0) += g[y]*g[x];
G(1,1) += g[y]*g[x]*x*x;
G(3,3) += g[y]*g[x]*x*x*x*x;
G(5,5) += g[y]*g[x]*x*x*y*y;
}
}
//G[0][0] = 1.;
G(2,2) = G(0,3) = G(0,4) = G(3,0) = G(4,0) = G(1,1);
G(4,4) = G(3,3);
G(3,4) = G(4,3) = G(5,5);
// invG:
// [ x e e ]
// [ y ]
// [ y ]
// [ e z ]
// [ e z ]
// [ u ]
Mat_<double> invG = G.inv(DECOMP_CHOLESKY);
ig11 = invG(1,1);
ig03 = invG(0,3);
ig33 = invG(3,3);
ig55 = invG(5,5);
}
void cv::cuda::FarnebackOpticalFlow::setPolynomialExpansionConsts(int n, double sigma)
{
std::vector<float> buf(n*6 + 3);
float* g = &buf[0] + n;
float* xg = g + n*2 + 1;
float* xxg = xg + n*2 + 1;
if (sigma < FLT_EPSILON)
sigma = n*0.3;
double ig11, ig03, ig33, ig55;
prepareGaussian(n, sigma, g, xg, xxg, ig11, ig03, ig33, ig55);
device::optflow_farneback::setPolynomialExpansionConsts(n, g, xg, xxg, static_cast<float>(ig11), static_cast<float>(ig03), static_cast<float>(ig33), static_cast<float>(ig55));
}
void cv::cuda::FarnebackOpticalFlow::updateFlow_boxFilter(
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
{
if (deviceSupports(FEATURE_SET_COMPUTE_12))
device::optflow_farneback::boxFilter5Gpu(M, blockSize/2, bufM, S(streams[0]));
else
device::optflow_farneback::boxFilter5Gpu_CC11(M, blockSize/2, bufM, S(streams[0]));
swap(M, bufM);
for (int i = 1; i < 5; ++i)
streams[i].waitForCompletion();
device::optflow_farneback::updateFlowGpu(M, flowx, flowy, S(streams[0]));
if (updateMatrices)
device::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, S(streams[0]));
}
void cv::cuda::FarnebackOpticalFlow::updateFlow_gaussianBlur(
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
{
if (deviceSupports(FEATURE_SET_COMPUTE_12))
device::optflow_farneback::gaussianBlur5Gpu(
M, blockSize/2, bufM, BORDER_REPLICATE, S(streams[0]));
else
device::optflow_farneback::gaussianBlur5Gpu_CC11(
M, blockSize/2, bufM, BORDER_REPLICATE, S(streams[0]));
swap(M, bufM);
device::optflow_farneback::updateFlowGpu(M, flowx, flowy, S(streams[0]));
if (updateMatrices)
device::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, S(streams[0]));
}
void cv::cuda::FarnebackOpticalFlow::operator ()(
const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s)
{
CV_Assert(frame0.channels() == 1 && frame1.channels() == 1);
CV_Assert(frame0.size() == frame1.size());
CV_Assert(polyN == 5 || polyN == 7);
CV_Assert(!fastPyramids || std::abs(pyrScale - 0.5) < 1e-6);
Stream streams[5];
if (S(s))
streams[0] = s;
Size size = frame0.size();
GpuMat prevFlowX, prevFlowY, curFlowX, curFlowY;
flowx.create(size, CV_32F);
flowy.create(size, CV_32F);
GpuMat flowx0 = flowx;
GpuMat flowy0 = flowy;
// Crop unnecessary levels
double scale = 1;
int numLevelsCropped = 0;
for (; numLevelsCropped < numLevels; numLevelsCropped++)
{
scale *= pyrScale;
if (size.width*scale < MIN_SIZE || size.height*scale < MIN_SIZE)
break;
}
frame0.convertTo(frames_[0], CV_32F, streams[0]);
frame1.convertTo(frames_[1], CV_32F, streams[1]);
if (fastPyramids)
{
// Build Gaussian pyramids using pyrDown()
pyramid0_.resize(numLevelsCropped + 1);
pyramid1_.resize(numLevelsCropped + 1);
pyramid0_[0] = frames_[0];
pyramid1_[0] = frames_[1];
for (int i = 1; i <= numLevelsCropped; ++i)
{
cuda::pyrDown(pyramid0_[i - 1], pyramid0_[i], streams[0]);
cuda::pyrDown(pyramid1_[i - 1], pyramid1_[i], streams[1]);
}
}
setPolynomialExpansionConsts(polyN, polySigma);
device::optflow_farneback::setUpdateMatricesConsts();
for (int k = numLevelsCropped; k >= 0; k--)
{
streams[0].waitForCompletion();
scale = 1;
for (int i = 0; i < k; i++)
scale *= pyrScale;
double sigma = (1./scale - 1) * 0.5;
int smoothSize = cvRound(sigma*5) | 1;
smoothSize = std::max(smoothSize, 3);
int width = cvRound(size.width*scale);
int height = cvRound(size.height*scale);
if (fastPyramids)
{
width = pyramid0_[k].cols;
height = pyramid0_[k].rows;
g[x] = (float)std::exp(-x*x/(2*sigma*sigma));
s += g[x];
}
if (k > 0)
s = 1./s;
for (int x = -n; x <= n; x++)
{
curFlowX.create(height, width, CV_32F);
curFlowY.create(height, width, CV_32F);
}
else
{
curFlowX = flowx0;
curFlowY = flowy0;
g[x] = (float)(g[x]*s);
xg[x] = (float)(x*g[x]);
xxg[x] = (float)(x*x*g[x]);
}
if (!prevFlowX.data)
Mat_<double> G(6, 6);
G.setTo(0);
for (int y = -n; y <= n; y++)
{
if (flags & OPTFLOW_USE_INITIAL_FLOW)
for (int x = -n; x <= n; x++)
{
cuda::resize(flowx0, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
cuda::resize(flowy0, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
curFlowX.convertTo(curFlowX, curFlowX.depth(), scale, streams[0]);
curFlowY.convertTo(curFlowY, curFlowY.depth(), scale, streams[1]);
G(0,0) += g[y]*g[x];
G(1,1) += g[y]*g[x]*x*x;
G(3,3) += g[y]*g[x]*x*x*x*x;
G(5,5) += g[y]*g[x]*x*x*y*y;
}
}
//G[0][0] = 1.;
G(2,2) = G(0,3) = G(0,4) = G(3,0) = G(4,0) = G(1,1);
G(4,4) = G(3,3);
G(3,4) = G(4,3) = G(5,5);
// invG:
// [ x e e ]
// [ y ]
// [ y ]
// [ e z ]
// [ e z ]
// [ u ]
Mat_<double> invG = G.inv(DECOMP_CHOLESKY);
ig11 = invG(1,1);
ig03 = invG(0,3);
ig33 = invG(3,3);
ig55 = invG(5,5);
}
void FarnebackOpticalFlowImpl::setPolynomialExpansionConsts(int n, double sigma)
{
std::vector<float> buf(n*6 + 3);
float* g = &buf[0] + n;
float* xg = g + n*2 + 1;
float* xxg = xg + n*2 + 1;
if (sigma < FLT_EPSILON)
sigma = n*0.3;
double ig11, ig03, ig33, ig55;
prepareGaussian(n, sigma, g, xg, xxg, ig11, ig03, ig33, ig55);
device::optflow_farneback::setPolynomialExpansionConsts(n, g, xg, xxg, static_cast<float>(ig11), static_cast<float>(ig03), static_cast<float>(ig33), static_cast<float>(ig55));
}
void FarnebackOpticalFlowImpl::updateFlow_boxFilter(
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
{
if (deviceSupports(FEATURE_SET_COMPUTE_12))
device::optflow_farneback::boxFilter5Gpu(M, blockSize/2, bufM, StreamAccessor::getStream(streams[0]));
else
device::optflow_farneback::boxFilter5Gpu_CC11(M, blockSize/2, bufM, StreamAccessor::getStream(streams[0]));
swap(M, bufM);
for (int i = 1; i < 5; ++i)
streams[i].waitForCompletion();
device::optflow_farneback::updateFlowGpu(M, flowx, flowy, StreamAccessor::getStream(streams[0]));
if (updateMatrices)
device::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, StreamAccessor::getStream(streams[0]));
}
void FarnebackOpticalFlowImpl::updateFlow_gaussianBlur(
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
{
if (deviceSupports(FEATURE_SET_COMPUTE_12))
device::optflow_farneback::gaussianBlur5Gpu(
M, blockSize/2, bufM, BORDER_REPLICATE, StreamAccessor::getStream(streams[0]));
else
device::optflow_farneback::gaussianBlur5Gpu_CC11(
M, blockSize/2, bufM, BORDER_REPLICATE, StreamAccessor::getStream(streams[0]));
swap(M, bufM);
device::optflow_farneback::updateFlowGpu(M, flowx, flowy, StreamAccessor::getStream(streams[0]));
if (updateMatrices)
device::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, StreamAccessor::getStream(streams[0]));
}
void FarnebackOpticalFlowImpl::calcImpl(const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &stream)
{
CV_Assert(frame0.channels() == 1 && frame1.channels() == 1);
CV_Assert(frame0.size() == frame1.size());
CV_Assert(polyN_ == 5 || polyN_ == 7);
CV_Assert(!fastPyramids_ || std::abs(pyrScale_ - 0.5) < 1e-6);
Stream streams[5];
if (stream)
streams[0] = stream;
Size size = frame0.size();
GpuMat prevFlowX, prevFlowY, curFlowX, curFlowY;
flowx.create(size, CV_32F);
flowy.create(size, CV_32F);
GpuMat flowx0 = flowx;
GpuMat flowy0 = flowy;
// Crop unnecessary levels
double scale = 1;
int numLevelsCropped = 0;
for (; numLevelsCropped < numLevels_; numLevelsCropped++)
{
scale *= pyrScale_;
if (size.width*scale < MIN_SIZE || size.height*scale < MIN_SIZE)
break;
}
frame0.convertTo(frames_[0], CV_32F, streams[0]);
frame1.convertTo(frames_[1], CV_32F, streams[1]);
if (fastPyramids_)
{
// Build Gaussian pyramids using pyrDown()
pyramid0_.resize(numLevelsCropped + 1);
pyramid1_.resize(numLevelsCropped + 1);
pyramid0_[0] = frames_[0];
pyramid1_[0] = frames_[1];
for (int i = 1; i <= numLevelsCropped; ++i)
{
cuda::pyrDown(pyramid0_[i - 1], pyramid0_[i], streams[0]);
cuda::pyrDown(pyramid1_[i - 1], pyramid1_[i], streams[1]);
}
}
setPolynomialExpansionConsts(polyN_, polySigma_);
device::optflow_farneback::setUpdateMatricesConsts();
for (int k = numLevelsCropped; k >= 0; k--)
{
streams[0].waitForCompletion();
scale = 1;
for (int i = 0; i < k; i++)
scale *= pyrScale_;
double sigma = (1./scale - 1) * 0.5;
int smoothSize = cvRound(sigma*5) | 1;
smoothSize = std::max(smoothSize, 3);
int width = cvRound(size.width*scale);
int height = cvRound(size.height*scale);
if (fastPyramids_)
{
width = pyramid0_[k].cols;
height = pyramid0_[k].rows;
}
if (k > 0)
{
curFlowX.create(height, width, CV_32F);
curFlowY.create(height, width, CV_32F);
}
else
{
curFlowX.setTo(0, streams[0]);
curFlowY.setTo(0, streams[1]);
curFlowX = flowx0;
curFlowY = flowy0;
}
}
else
{
cuda::resize(prevFlowX, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
cuda::resize(prevFlowY, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
curFlowX.convertTo(curFlowX, curFlowX.depth(), 1./pyrScale, streams[0]);
curFlowY.convertTo(curFlowY, curFlowY.depth(), 1./pyrScale, streams[1]);
}
GpuMat M = allocMatFromBuf(5*height, width, CV_32F, M_);
GpuMat bufM = allocMatFromBuf(5*height, width, CV_32F, bufM_);
GpuMat R[2] =
{
allocMatFromBuf(5*height, width, CV_32F, R_[0]),
allocMatFromBuf(5*height, width, CV_32F, R_[1])
};
if (fastPyramids)
{
device::optflow_farneback::polynomialExpansionGpu(pyramid0_[k], polyN, R[0], S(streams[0]));
device::optflow_farneback::polynomialExpansionGpu(pyramid1_[k], polyN, R[1], S(streams[1]));
}
else
{
GpuMat blurredFrame[2] =
if (!prevFlowX.data)
{
allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[0]),
allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[1])
};
GpuMat pyrLevel[2] =
{
allocMatFromBuf(height, width, CV_32F, pyrLevel_[0]),
allocMatFromBuf(height, width, CV_32F, pyrLevel_[1])
};
Mat g = getGaussianKernel(smoothSize, sigma, CV_32F);
device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(smoothSize/2), smoothSize/2);
for (int i = 0; i < 2; i++)
{
device::optflow_farneback::gaussianBlurGpu(
frames_[i], smoothSize/2, blurredFrame[i], BORDER_REFLECT101, S(streams[i]));
cuda::resize(blurredFrame[i], pyrLevel[i], Size(width, height), 0.0, 0.0, INTER_LINEAR, streams[i]);
device::optflow_farneback::polynomialExpansionGpu(pyrLevel[i], polyN, R[i], S(streams[i]));
if (flags_ & OPTFLOW_USE_INITIAL_FLOW)
{
cuda::resize(flowx0, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
cuda::resize(flowy0, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
curFlowX.convertTo(curFlowX, curFlowX.depth(), scale, streams[0]);
curFlowY.convertTo(curFlowY, curFlowY.depth(), scale, streams[1]);
}
else
{
curFlowX.setTo(0, streams[0]);
curFlowY.setTo(0, streams[1]);
}
}
}
streams[1].waitForCompletion();
device::optflow_farneback::updateMatricesGpu(curFlowX, curFlowY, R[0], R[1], M, S(streams[0]));
if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
{
Mat g = getGaussianKernel(winSize, winSize/2*0.3f, CV_32F);
device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(winSize/2), winSize/2);
}
for (int i = 0; i < numIters; i++)
{
if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
updateFlow_gaussianBlur(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams);
else
updateFlow_boxFilter(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams);
{
cuda::resize(prevFlowX, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
cuda::resize(prevFlowY, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
curFlowX.convertTo(curFlowX, curFlowX.depth(), 1./pyrScale_, streams[0]);
curFlowY.convertTo(curFlowY, curFlowY.depth(), 1./pyrScale_, streams[1]);
}
GpuMat M = allocMatFromBuf(5*height, width, CV_32F, M_);
GpuMat bufM = allocMatFromBuf(5*height, width, CV_32F, bufM_);
GpuMat R[2] =
{
allocMatFromBuf(5*height, width, CV_32F, R_[0]),
allocMatFromBuf(5*height, width, CV_32F, R_[1])
};
if (fastPyramids_)
{
device::optflow_farneback::polynomialExpansionGpu(pyramid0_[k], polyN_, R[0], StreamAccessor::getStream(streams[0]));
device::optflow_farneback::polynomialExpansionGpu(pyramid1_[k], polyN_, R[1], StreamAccessor::getStream(streams[1]));
}
else
{
GpuMat blurredFrame[2] =
{
allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[0]),
allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[1])
};
GpuMat pyrLevel[2] =
{
allocMatFromBuf(height, width, CV_32F, pyrLevel_[0]),
allocMatFromBuf(height, width, CV_32F, pyrLevel_[1])
};
Mat g = getGaussianKernel(smoothSize, sigma, CV_32F);
device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(smoothSize/2), smoothSize/2);
for (int i = 0; i < 2; i++)
{
device::optflow_farneback::gaussianBlurGpu(
frames_[i], smoothSize/2, blurredFrame[i], BORDER_REFLECT101, StreamAccessor::getStream(streams[i]));
cuda::resize(blurredFrame[i], pyrLevel[i], Size(width, height), 0.0, 0.0, INTER_LINEAR, streams[i]);
device::optflow_farneback::polynomialExpansionGpu(pyrLevel[i], polyN_, R[i], StreamAccessor::getStream(streams[i]));
}
}
streams[1].waitForCompletion();
device::optflow_farneback::updateMatricesGpu(curFlowX, curFlowY, R[0], R[1], M, StreamAccessor::getStream(streams[0]));
if (flags_ & OPTFLOW_FARNEBACK_GAUSSIAN)
{
Mat g = getGaussianKernel(winSize_, winSize_/2*0.3f, CV_32F);
device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(winSize_/2), winSize_/2);
}
for (int i = 0; i < numIters_; i++)
{
if (flags_ & OPTFLOW_FARNEBACK_GAUSSIAN)
updateFlow_gaussianBlur(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize_, i < numIters_-1, streams);
else
updateFlow_boxFilter(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize_, i < numIters_-1, streams);
}
prevFlowX = curFlowX;
prevFlowY = curFlowY;
}
prevFlowX = curFlowX;
prevFlowY = curFlowY;
flowx = curFlowX;
flowy = curFlowY;
if (!stream)
streams[0].waitForCompletion();
}
}
flowx = curFlowX;
flowy = curFlowY;
if (!S(s))
streams[0].waitForCompletion();
Ptr<FarnebackOpticalFlow> cv::cuda::FarnebackOpticalFlow::create(int numLevels, double pyrScale, bool fastPyramids, int winSize,
int numIters, int polyN, double polySigma, int flags)
{
return makePtr<FarnebackOpticalFlowImpl>(numLevels, pyrScale, fastPyramids, winSize,
numIters, polyN, polySigma, flags);
}
#endif


@@ -47,37 +47,54 @@ using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
cv::cuda::PyrLKOpticalFlow::PyrLKOpticalFlow() { throw_no_cuda(); }
void cv::cuda::PyrLKOpticalFlow::sparse(const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat*) { throw_no_cuda(); }
void cv::cuda::PyrLKOpticalFlow::dense(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat*) { throw_no_cuda(); }
void cv::cuda::PyrLKOpticalFlow::releaseMemory() {}
Ptr<SparsePyrLKOpticalFlow> cv::cuda::SparsePyrLKOpticalFlow::create(Size, int, int, bool) { throw_no_cuda(); return Ptr<SparsePyrLKOpticalFlow>(); }
Ptr<DensePyrLKOpticalFlow> cv::cuda::DensePyrLKOpticalFlow::create(Size, int, int, bool) { throw_no_cuda(); return Ptr<DensePyrLKOpticalFlow>(); }
#else /* !defined (HAVE_CUDA) */
namespace pyrlk
{
void loadConstants(int2 winSize, int iters);
void loadConstants(int2 winSize, int iters, cudaStream_t stream);
void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
int level, dim3 block, dim3 patch, cudaStream_t stream);
void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
int level, dim3 block, dim3 patch, cudaStream_t stream);
void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
PtrStepSzf err, int2 winSize, cudaStream_t stream = 0);
}
cv::cuda::PyrLKOpticalFlow::PyrLKOpticalFlow()
{
winSize = Size(21, 21);
maxLevel = 3;
iters = 30;
useInitialFlow = false;
PtrStepSzf err, int2 winSize, cudaStream_t stream);
}
namespace
{
void calcPatchSize(cv::Size winSize, dim3& block, dim3& patch)
class PyrLKOpticalFlowBase
{
public:
PyrLKOpticalFlowBase(Size winSize, int maxLevel, int iters, bool useInitialFlow);
void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
GpuMat& status, GpuMat* err, Stream& stream);
void dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, Stream& stream);
protected:
Size winSize_;
int maxLevel_;
int iters_;
bool useInitialFlow_;
private:
std::vector<GpuMat> prevPyr_;
std::vector<GpuMat> nextPyr_;
};
PyrLKOpticalFlowBase::PyrLKOpticalFlowBase(Size winSize, int maxLevel, int iters, bool useInitialFlow) :
winSize_(winSize), maxLevel_(maxLevel), iters_(iters), useInitialFlow_(useInitialFlow)
{
}
void calcPatchSize(Size winSize, dim3& block, dim3& patch)
{
if (winSize.width > 32 && winSize.width > 2 * winSize.height)
{
@@ -95,156 +112,239 @@ namespace
block.z = patch.z = 1;
}
}
void cv::cuda::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err)
{
if (prevPts.empty())
void PyrLKOpticalFlowBase::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err, Stream& stream)
{
nextPts.release();
status.release();
if (err) err->release();
return;
}
dim3 block, patch;
calcPatchSize(winSize, block, patch);
CV_Assert(prevImg.channels() == 1 || prevImg.channels() == 3 || prevImg.channels() == 4);
CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
CV_Assert(maxLevel >= 0);
CV_Assert(winSize.width > 2 && winSize.height > 2);
CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6);
CV_Assert(prevPts.rows == 1 && prevPts.type() == CV_32FC2);
if (useInitialFlow)
CV_Assert(nextPts.size() == prevPts.size() && nextPts.type() == CV_32FC2);
else
ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts);
GpuMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
GpuMat temp2 = nextPts.reshape(1);
cuda::multiply(temp1, Scalar::all(1.0 / (1 << maxLevel) / 2.0), temp2);
ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
status.setTo(Scalar::all(1));
if (err)
ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);
// build the image pyramids.
prevPyr_.resize(maxLevel + 1);
nextPyr_.resize(maxLevel + 1);
int cn = prevImg.channels();
if (cn == 1 || cn == 4)
{
prevImg.convertTo(prevPyr_[0], CV_32F);
nextImg.convertTo(nextPyr_[0], CV_32F);
}
else
{
cuda::cvtColor(prevImg, buf_, COLOR_BGR2BGRA);
buf_.convertTo(prevPyr_[0], CV_32F);
cuda::cvtColor(nextImg, buf_, COLOR_BGR2BGRA);
buf_.convertTo(nextPyr_[0], CV_32F);
}
for (int level = 1; level <= maxLevel; ++level)
{
cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level]);
cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level]);
}
pyrlk::loadConstants(make_int2(winSize.width, winSize.height), iters);
for (int level = maxLevel; level >= 0; level--)
{
if (cn == 1)
if (prevPts.empty())
{
pyrlk::sparse1(prevPyr_[level], nextPyr_[level],
prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
level, block, patch);
nextPts.release();
status.release();
if (err) err->release();
return;
}
dim3 block, patch;
calcPatchSize(winSize_, block, patch);
CV_Assert( prevImg.channels() == 1 || prevImg.channels() == 3 || prevImg.channels() == 4 );
CV_Assert( prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type() );
CV_Assert( maxLevel_ >= 0 );
CV_Assert( winSize_.width > 2 && winSize_.height > 2 );
CV_Assert( patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6 );
CV_Assert( prevPts.rows == 1 && prevPts.type() == CV_32FC2 );
if (useInitialFlow_)
CV_Assert( nextPts.size() == prevPts.size() && nextPts.type() == prevPts.type() );
else
ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts);
GpuMat temp1 = (useInitialFlow_ ? nextPts : prevPts).reshape(1);
GpuMat temp2 = nextPts.reshape(1);
cuda::multiply(temp1, Scalar::all(1.0 / (1 << maxLevel_) / 2.0), temp2, 1, -1, stream);
ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
status.setTo(Scalar::all(1), stream);
if (err)
ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);
// build the image pyramids.
BufferPool pool(stream);
prevPyr_.resize(maxLevel_ + 1);
nextPyr_.resize(maxLevel_ + 1);
int cn = prevImg.channels();
if (cn == 1 || cn == 4)
{
prevImg.convertTo(prevPyr_[0], CV_32F, stream);
nextImg.convertTo(nextPyr_[0], CV_32F, stream);
}
else
{
pyrlk::sparse4(prevPyr_[level], nextPyr_[level],
prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
level, block, patch);
GpuMat buf = pool.getBuffer(prevImg.size(), CV_MAKE_TYPE(prevImg.depth(), 4));
cuda::cvtColor(prevImg, buf, COLOR_BGR2BGRA, 0, stream);
buf.convertTo(prevPyr_[0], CV_32F, stream);
cuda::cvtColor(nextImg, buf, COLOR_BGR2BGRA, 0, stream);
buf.convertTo(nextPyr_[0], CV_32F, stream);
}
for (int level = 1; level <= maxLevel_; ++level)
{
cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level], stream);
cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level], stream);
}
pyrlk::loadConstants(make_int2(winSize_.width, winSize_.height), iters_, StreamAccessor::getStream(stream));
for (int level = maxLevel_; level >= 0; level--)
{
if (cn == 1)
{
pyrlk::sparse1(prevPyr_[level], nextPyr_[level],
prevPts.ptr<float2>(), nextPts.ptr<float2>(),
status.ptr(),
level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
level, block, patch,
StreamAccessor::getStream(stream));
}
else
{
pyrlk::sparse4(prevPyr_[level], nextPyr_[level],
prevPts.ptr<float2>(), nextPts.ptr<float2>(),
status.ptr(),
level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
level, block, patch,
StreamAccessor::getStream(stream));
}
}
}
}
    void PyrLKOpticalFlowBase::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, Stream& stream)
    {
CV_Assert( prevImg.type() == CV_8UC1 );
CV_Assert( prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type() );
CV_Assert( maxLevel_ >= 0 );
CV_Assert( winSize_.width > 2 && winSize_.height > 2 );
// build the image pyramids.
prevPyr_.resize(maxLevel_ + 1);
nextPyr_.resize(maxLevel_ + 1);
prevPyr_[0] = prevImg;
nextImg.convertTo(nextPyr_[0], CV_32F, stream);
for (int level = 1; level <= maxLevel_; ++level)
{
cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level], stream);
cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level], stream);
}
BufferPool pool(stream);
GpuMat uPyr[] = {
pool.getBuffer(prevImg.size(), CV_32FC1),
pool.getBuffer(prevImg.size(), CV_32FC1),
};
GpuMat vPyr[] = {
pool.getBuffer(prevImg.size(), CV_32FC1),
pool.getBuffer(prevImg.size(), CV_32FC1),
};
uPyr[0].setTo(Scalar::all(0), stream);
vPyr[0].setTo(Scalar::all(0), stream);
uPyr[1].setTo(Scalar::all(0), stream);
vPyr[1].setTo(Scalar::all(0), stream);
int2 winSize2i = make_int2(winSize_.width, winSize_.height);
pyrlk::loadConstants(winSize2i, iters_, StreamAccessor::getStream(stream));
int idx = 0;
for (int level = maxLevel_; level >= 0; level--)
{
int idx2 = (idx + 1) & 1;
pyrlk::dense(prevPyr_[level], nextPyr_[level],
uPyr[idx], vPyr[idx], uPyr[idx2], vPyr[idx2],
PtrStepSzf(), winSize2i,
StreamAccessor::getStream(stream));
if (level > 0)
idx = idx2;
}
uPyr[idx].copyTo(u, stream);
vPyr[idx].copyTo(v, stream);
}
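The dense path refines the flow coarse to fine with two buffers per component, toggling an index so each level reads the previous estimate while writing the new one; the final toggle is skipped, so the result ends up in uPyr[idx]/vPyr[idx]. The same ping-pong pattern in isolation, with refine() as a hypothetical stand-in for pyrlk::dense:

// Illustrative sketch only: the double-buffer index toggle used above.
void refine(const float* src, float* dst); // hypothetical stand-in for pyrlk::dense

void pingPong(float* buf[2], int maxLevel)
{
    int idx = 0;
    for (int level = maxLevel; level >= 0; level--)
    {
        int idx2 = (idx + 1) & 1;    // the "other" buffer
        refine(buf[idx], buf[idx2]); // read current estimate, write refinement
        if (level > 0)
            idx = idx2;              // swap roles, except after the finest level
    }
    // buf[idx] now holds the final component (hence uPyr[idx].copyTo(u)).
}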
    class SparsePyrLKOpticalFlowImpl : public SparsePyrLKOpticalFlow, private PyrLKOpticalFlowBase
    {
    public:
        SparsePyrLKOpticalFlowImpl(Size winSize, int maxLevel, int iters, bool useInitialFlow) :
            PyrLKOpticalFlowBase(winSize, maxLevel, iters, useInitialFlow)
        {
        }
        virtual Size getWinSize() const { return winSize_; }
        virtual void setWinSize(Size winSize) { winSize_ = winSize; }
        virtual int getMaxLevel() const { return maxLevel_; }
        virtual void setMaxLevel(int maxLevel) { maxLevel_ = maxLevel; }
        virtual int getNumIters() const { return iters_; }
        virtual void setNumIters(int iters) { iters_ = iters; }
        virtual bool getUseInitialFlow() const { return useInitialFlow_; }
        virtual void setUseInitialFlow(bool useInitialFlow) { useInitialFlow_ = useInitialFlow; }
virtual void calc(InputArray _prevImg, InputArray _nextImg,
InputArray _prevPts, InputOutputArray _nextPts,
OutputArray _status,
OutputArray _err,
Stream& stream)
{
const GpuMat prevImg = _prevImg.getGpuMat();
const GpuMat nextImg = _nextImg.getGpuMat();
const GpuMat prevPts = _prevPts.getGpuMat();
GpuMat& nextPts = _nextPts.getGpuMatRef();
GpuMat& status = _status.getGpuMatRef();
GpuMat* err = _err.needed() ? &(_err.getGpuMatRef()) : NULL;
sparse(prevImg, nextImg, prevPts, nextPts, status, err, stream);
}
};
class DensePyrLKOpticalFlowImpl : public DensePyrLKOpticalFlow, private PyrLKOpticalFlowBase
{
public:
DensePyrLKOpticalFlowImpl(Size winSize, int maxLevel, int iters, bool useInitialFlow) :
PyrLKOpticalFlowBase(winSize, maxLevel, iters, useInitialFlow)
{
}
virtual Size getWinSize() const { return winSize_; }
virtual void setWinSize(Size winSize) { winSize_ = winSize; }
virtual int getMaxLevel() const { return maxLevel_; }
virtual void setMaxLevel(int maxLevel) { maxLevel_ = maxLevel; }
virtual int getNumIters() const { return iters_; }
virtual void setNumIters(int iters) { iters_ = iters; }
virtual bool getUseInitialFlow() const { return useInitialFlow_; }
virtual void setUseInitialFlow(bool useInitialFlow) { useInitialFlow_ = useInitialFlow; }
virtual void calc(InputArray _prevImg, InputArray _nextImg, InputOutputArray _flow, Stream& stream)
{
const GpuMat prevImg = _prevImg.getGpuMat();
const GpuMat nextImg = _nextImg.getGpuMat();
BufferPool pool(stream);
GpuMat u = pool.getBuffer(prevImg.size(), CV_32FC1);
GpuMat v = pool.getBuffer(prevImg.size(), CV_32FC1);
dense(prevImg, nextImg, u, v, stream);
GpuMat flows[] = {u, v};
cuda::merge(flows, 2, _flow, stream);
}
};
}
Ptr<SparsePyrLKOpticalFlow> cv::cuda::SparsePyrLKOpticalFlow::create(Size winSize, int maxLevel, int iters, bool useInitialFlow)
{
    return makePtr<SparsePyrLKOpticalFlowImpl>(winSize, maxLevel, iters, useInitialFlow);
}
Ptr<DensePyrLKOpticalFlow> cv::cuda::DensePyrLKOpticalFlow::create(Size winSize, int maxLevel, int iters, bool useInitialFlow)
{
    return makePtr<DensePyrLKOpticalFlowImpl>(winSize, maxLevel, iters, useInitialFlow);
}
#endif /* !defined (HAVE_CUDA) */
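For completeness, a hedged caller-side sketch of the dense counterpart (not part of the diff): calc() now writes a packed CV_32FC2 field, which can be split back into the separate u/v planes the old API returned. The frame names are placeholders.

// Usage sketch, assuming a CUDA-enabled build.
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaarithm.hpp>
#include <opencv2/cudaoptflow.hpp>

void denseFlow(const cv::cuda::GpuMat& prevGray, const cv::cuda::GpuMat& nextGray)
{
    cv::Ptr<cv::cuda::DensePyrLKOpticalFlow> lk = cv::cuda::DensePyrLKOpticalFlow::create();

    cv::cuda::GpuMat flow;              // CV_32FC2, same size as the inputs
    lk->calc(prevGray, nextGray, flow); // prevGray/nextGray must be CV_8UC1

    cv::cuda::GpuMat uv[2];
    cv::cuda::split(flow, uv);          // uv[0] = horizontal, uv[1] = vertical
}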

View File

@ -44,256 +44,338 @@
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
Ptr<OpticalFlowDual_TVL1> cv::cuda::OpticalFlowDual_TVL1::create(double, double, double, int, int, double, int, double, double, bool) { throw_no_cuda(); return Ptr<OpticalFlowDual_TVL1>(); }
#else
using namespace cv;
using namespace cv::cuda;
namespace tvl1flow
{
    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy, cudaStream_t stream);
    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y,
                      PtrStepSzf u1, PtrStepSzf u2,
                      PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy,
                      PtrStepSzf grad, PtrStepSzf rho,
                      cudaStream_t stream);
    void estimateU(PtrStepSzf I1wx, PtrStepSzf I1wy,
                   PtrStepSzf grad, PtrStepSzf rho_c,
                   PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
                   PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf error,
                   float l_t, float theta, float gamma, bool calcError,
                   cudaStream_t stream);
    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3,
                               PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
                               float taut, float gamma,
                               cudaStream_t stream);
}
namespace
{
    class OpticalFlowDual_TVL1_Impl : public OpticalFlowDual_TVL1
    {
    public:
        OpticalFlowDual_TVL1_Impl(double tau, double lambda, double theta, int nscales, int warps, double epsilon,
                                  int iterations, double scaleStep, double gamma, bool useInitialFlow) :
            tau_(tau), lambda_(lambda), gamma_(gamma), theta_(theta), nscales_(nscales), warps_(warps),
            epsilon_(epsilon), iterations_(iterations), scaleStep_(scaleStep), useInitialFlow_(useInitialFlow)
        {
        }
virtual double getTau() const { return tau_; }
virtual void setTau(double tau) { tau_ = tau; }
virtual double getLambda() const { return lambda_; }
virtual void setLambda(double lambda) { lambda_ = lambda; }
virtual double getGamma() const { return gamma_; }
virtual void setGamma(double gamma) { gamma_ = gamma; }
virtual double getTheta() const { return theta_; }
virtual void setTheta(double theta) { theta_ = theta; }
virtual int getNumScales() const { return nscales_; }
virtual void setNumScales(int nscales) { nscales_ = nscales; }
virtual int getNumWarps() const { return warps_; }
virtual void setNumWarps(int warps) { warps_ = warps; }
virtual double getEpsilon() const { return epsilon_; }
virtual void setEpsilon(double epsilon) { epsilon_ = epsilon; }
virtual int getNumIterations() const { return iterations_; }
virtual void setNumIterations(int iterations) { iterations_ = iterations; }
virtual double getScaleStep() const { return scaleStep_; }
virtual void setScaleStep(double scaleStep) { scaleStep_ = scaleStep; }
virtual bool getUseInitialFlow() const { return useInitialFlow_; }
virtual void setUseInitialFlow(bool useInitialFlow) { useInitialFlow_ = useInitialFlow; }
virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow, Stream& stream);
private:
double tau_;
double lambda_;
double gamma_;
double theta_;
int nscales_;
int warps_;
double epsilon_;
int iterations_;
double scaleStep_;
bool useInitialFlow_;
private:
void calcImpl(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, Stream& stream);
void procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2, GpuMat& u3, Stream& stream);
std::vector<GpuMat> I0s;
std::vector<GpuMat> I1s;
std::vector<GpuMat> u1s;
std::vector<GpuMat> u2s;
std::vector<GpuMat> u3s;
GpuMat I1x_buf;
GpuMat I1y_buf;
GpuMat I1w_buf;
GpuMat I1wx_buf;
GpuMat I1wy_buf;
GpuMat grad_buf;
GpuMat rho_c_buf;
GpuMat p11_buf;
GpuMat p12_buf;
GpuMat p21_buf;
GpuMat p22_buf;
GpuMat p31_buf;
GpuMat p32_buf;
GpuMat diff_buf;
GpuMat norm_buf;
};
void OpticalFlowDual_TVL1_Impl::calc(InputArray _frame0, InputArray _frame1, InputOutputArray _flow, Stream& stream)
{
const GpuMat frame0 = _frame0.getGpuMat();
const GpuMat frame1 = _frame1.getGpuMat();
BufferPool pool(stream);
GpuMat flowx = pool.getBuffer(frame0.size(), CV_32FC1);
GpuMat flowy = pool.getBuffer(frame0.size(), CV_32FC1);
calcImpl(frame0, frame1, flowx, flowy, stream);
GpuMat flows[] = {flowx, flowy};
cuda::merge(flows, 2, _flow, stream);
}
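Note that calc() leases its flowx/flowy scratch planes from the stream's BufferPool rather than keeping GpuMat members, so temporaries are recycled and tied to the stream's lifetime. A hedged sketch of how a client opts in to pooled allocations (the pool must be enabled before any stream is created; the 64 MB budget here is an arbitrary example, not a recommendation):

// Sketch: enabling the per-stream buffer pool on the current device.
cv::cuda::setBufferPoolUsage(true);                        // opt in before creating streams
cv::cuda::setBufferPoolConfig(cv::cuda::getDevice(),
                              64 * 1024 * 1024, 2);        // stack size, stack count
cv::cuda::Stream stream;
cv::cuda::BufferPool pool(stream);
cv::cuda::GpuMat tmp = pool.getBuffer(480, 640, CV_32FC1); // fast stack allocation tied to stream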
void OpticalFlowDual_TVL1_Impl::calcImpl(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, Stream& stream)
{
CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 );
CV_Assert( I0.size() == I1.size() );
CV_Assert( I0.type() == I1.type() );
CV_Assert( !useInitialFlow_ || (flowx.size() == I0.size() && flowx.type() == CV_32FC1 && flowy.size() == flowx.size() && flowy.type() == flowx.type()) );
CV_Assert( nscales_ > 0 );
// allocate memory for the pyramid structure
I0s.resize(nscales_);
I1s.resize(nscales_);
u1s.resize(nscales_);
u2s.resize(nscales_);
u3s.resize(nscales_);
I0.convertTo(I0s[0], CV_32F, I0.depth() == CV_8U ? 1.0 : 255.0, stream);
I1.convertTo(I1s[0], CV_32F, I1.depth() == CV_8U ? 1.0 : 255.0, stream);
if (!useInitialFlow_)
{
flowx.create(I0.size(), CV_32FC1);
flowy.create(I0.size(), CV_32FC1);
}
u1s[0] = flowx;
u2s[0] = flowy;
if (gamma_)
{
u3s[0].create(I0.size(), CV_32FC1);
}
I1x_buf.create(I0.size(), CV_32FC1);
I1y_buf.create(I0.size(), CV_32FC1);
I1w_buf.create(I0.size(), CV_32FC1);
I1wx_buf.create(I0.size(), CV_32FC1);
I1wy_buf.create(I0.size(), CV_32FC1);
grad_buf.create(I0.size(), CV_32FC1);
rho_c_buf.create(I0.size(), CV_32FC1);
p11_buf.create(I0.size(), CV_32FC1);
p12_buf.create(I0.size(), CV_32FC1);
p21_buf.create(I0.size(), CV_32FC1);
p22_buf.create(I0.size(), CV_32FC1);
if (gamma_)
{
p31_buf.create(I0.size(), CV_32FC1);
p32_buf.create(I0.size(), CV_32FC1);
}
diff_buf.create(I0.size(), CV_32FC1);
// create the scales
for (int s = 1; s < nscales_; ++s)
{
cuda::resize(I0s[s-1], I0s[s], Size(), scaleStep_, scaleStep_, INTER_LINEAR, stream);
cuda::resize(I1s[s-1], I1s[s], Size(), scaleStep_, scaleStep_, INTER_LINEAR, stream);
if (I0s[s].cols < 16 || I0s[s].rows < 16)
{
nscales_ = s;
break;
}
if (useInitialFlow_)
{
cuda::resize(u1s[s-1], u1s[s], Size(), scaleStep_, scaleStep_, INTER_LINEAR, stream);
cuda::resize(u2s[s-1], u2s[s], Size(), scaleStep_, scaleStep_, INTER_LINEAR, stream);
cuda::multiply(u1s[s], Scalar::all(scaleStep_), u1s[s], 1, -1, stream);
cuda::multiply(u2s[s], Scalar::all(scaleStep_), u2s[s], 1, -1, stream);
}
else
{
u1s[s].create(I0s[s].size(), CV_32FC1);
u2s[s].create(I0s[s].size(), CV_32FC1);
}
if (gamma_)
{
u3s[s].create(I0s[s].size(), CV_32FC1);
}
}
if (!useInitialFlow_)
{
u1s[nscales_-1].setTo(Scalar::all(0), stream);
u2s[nscales_-1].setTo(Scalar::all(0), stream);
}
if (gamma_)
{
u3s[nscales_ - 1].setTo(Scalar::all(0), stream);
}
// pyramidal structure for computing the optical flow
for (int s = nscales_ - 1; s >= 0; --s)
{
// compute the optical flow at the current scale
procOneScale(I0s[s], I1s[s], u1s[s], u2s[s], u3s[s], stream);
// if this was the last scale, finish now
if (s == 0)
break;
// otherwise, upsample the optical flow
// zoom the optical flow for the next finer scale
cuda::resize(u1s[s], u1s[s - 1], I0s[s - 1].size(), 0, 0, INTER_LINEAR, stream);
cuda::resize(u2s[s], u2s[s - 1], I0s[s - 1].size(), 0, 0, INTER_LINEAR, stream);
if (gamma_)
{
cuda::resize(u3s[s], u3s[s - 1], I0s[s - 1].size(), 0, 0, INTER_LINEAR, stream);
}
// scale the optical flow with the appropriate zoom factor
cuda::multiply(u1s[s - 1], Scalar::all(1/scaleStep_), u1s[s - 1], 1, -1, stream);
cuda::multiply(u2s[s - 1], Scalar::all(1/scaleStep_), u2s[s - 1], 1, -1, stream);
}
}
void OpticalFlowDual_TVL1_Impl::procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2, GpuMat& u3, Stream& _stream)
{
using namespace tvl1flow;
cudaStream_t stream = StreamAccessor::getStream(_stream);
const double scaledEpsilon = epsilon_ * epsilon_ * I0.size().area();
CV_DbgAssert( I1.size() == I0.size() );
CV_DbgAssert( I1.type() == I0.type() );
CV_DbgAssert( u1.size() == I0.size() );
CV_DbgAssert( u2.size() == u1.size() );
GpuMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows));
centeredGradient(I1, I1x, I1y, stream);
GpuMat I1w = I1w_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat I1wx = I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat I1wy = I1wy_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat grad = grad_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat rho_c = rho_c_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat p11 = p11_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat p22 = p22_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat p31, p32;
if (gamma_)
{
p31 = p31_buf(Rect(0, 0, I0.cols, I0.rows));
p32 = p32_buf(Rect(0, 0, I0.cols, I0.rows));
}
p11.setTo(Scalar::all(0), _stream);
p12.setTo(Scalar::all(0), _stream);
p21.setTo(Scalar::all(0), _stream);
p22.setTo(Scalar::all(0), _stream);
if (gamma_)
{
p31.setTo(Scalar::all(0), _stream);
p32.setTo(Scalar::all(0), _stream);
}
GpuMat diff = diff_buf(Rect(0, 0, I0.cols, I0.rows));
const float l_t = static_cast<float>(lambda_ * theta_);
const float taut = static_cast<float>(tau_ / theta_);
for (int warpings = 0; warpings < warps_; ++warpings)
{
warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c, stream);
double error = std::numeric_limits<double>::max();
double prevError = 0.0;
for (int n = 0; error > scaledEpsilon && n < iterations_; ++n)
{
// some tweaks to make sum operation less frequently
bool calcError = (epsilon_ > 0) && (n & 0x1) && (prevError < scaledEpsilon);
estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, diff, l_t, static_cast<float>(theta_), gamma_, calcError, stream);
if (calcError)
{
_stream.waitForCompletion();
error = cuda::sum(diff, norm_buf)[0];
prevError = error;
}
else
{
error = std::numeric_limits<double>::max();
prevError -= scaledEpsilon;
}
estimateDualVariables(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma_, stream);
}
}
}
}
Ptr<OpticalFlowDual_TVL1> cv::cuda::OpticalFlowDual_TVL1::create(
    double tau, double lambda, double theta, int nscales, int warps,
    double epsilon, int iterations, double scaleStep, double gamma, bool useInitialFlow)
{
    return makePtr<OpticalFlowDual_TVL1_Impl>(tau, lambda, theta, nscales, warps,
                                              epsilon, iterations, scaleStep, gamma, useInitialFlow);
}
#endif // !defined HAVE_CUDA || defined(CUDA_DISABLER)
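Finally, a hedged sketch of the refactored TV-L1 entry point (not part of the diff). One caveat grounded in procOneScale() above: when epsilon > 0 the solver periodically synchronizes the stream to evaluate the stopping criterion (the waitForCompletion() before cuda::sum), so a caller that needs fully asynchronous behaviour should set epsilon to 0 and rely on the iteration cap.

// Usage sketch, assuming a CUDA-enabled build;
// frame0/frame1 are placeholder CV_8UC1 (or CV_32FC1) inputs.
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaoptflow.hpp>

void tvl1Flow(const cv::cuda::GpuMat& frame0, const cv::cuda::GpuMat& frame1,
              cv::cuda::Stream& stream)
{
    cv::Ptr<cv::cuda::OpticalFlowDual_TVL1> alg =
        cv::cuda::OpticalFlowDual_TVL1::create(); // defaults mirror the old constructor

    cv::cuda::GpuMat flow;                        // packed CV_32FC2 result
    alg->calc(frame0, frame1, flow, stream);
    stream.waitForCompletion();
}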