diff --git a/modules/cudaoptflow/src/cuda/pyrlk.cu b/modules/cudaoptflow/src/cuda/pyrlk.cu index e3cca57f3a..901ab6d69b 100644 --- a/modules/cudaoptflow/src/cuda/pyrlk.cu +++ b/modules/cudaoptflow/src/cuda/pyrlk.cu @@ -1050,16 +1050,45 @@ namespace pyrlk } } - void loadConstants(int2 winSize, int iters, cudaStream_t stream) + void loadWinSize(int* winSize, int* halfWinSize, cudaStream_t stream) { - cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_x, &winSize.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) ); - cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_y, &winSize.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) ); + cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_x, winSize, sizeof(int), 0, cudaMemcpyHostToDevice, stream) ); + cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_y, winSize + 1, sizeof(int), 0, cudaMemcpyHostToDevice, stream) ); - int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2); - cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_x, &halfWin.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) ); - cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_y, &halfWin.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) ); + cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_x, halfWinSize, sizeof(int), 0, cudaMemcpyHostToDevice, stream) ); + cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_y, halfWinSize + 1, sizeof(int), 0, cudaMemcpyHostToDevice, stream) ); + } - cudaSafeCall( cudaMemcpyToSymbolAsync(c_iters, &iters, sizeof(int), 0, cudaMemcpyHostToDevice, stream) ); + void loadIters(int* iters, cudaStream_t stream) + { + cudaSafeCall( cudaMemcpyToSymbolAsync(c_iters, iters, sizeof(int), 0, cudaMemcpyHostToDevice, stream) ); + } + + void loadConstants(int2 winSize_, int iters_, cudaStream_t stream) + { + static int2 winSize = make_int2(0,0); + if(winSize.x != winSize_.x || winSize.y != winSize_.y) + { + winSize = winSize_; + cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_x, &winSize.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) ); + cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_y, &winSize.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) ); + } + + static int2 halfWin = make_int2(0,0); + int2 half = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2); + if(halfWin.x != half.x || halfWin.y != half.y) + { + halfWin = half; + cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_x, &halfWin.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) ); + cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_y, &halfWin.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) ); + } + + static int iters = 0; + if(iters != iters_) + { + iters = iters_; + cudaSafeCall( cudaMemcpyToSymbolAsync(c_iters, &iters, sizeof(int), 0, cudaMemcpyHostToDevice, stream) ); + } } template struct pyrLK_caller diff --git a/modules/cudaoptflow/src/pyrlk.cpp b/modules/cudaoptflow/src/pyrlk.cpp index d1704473c7..835d4d5c7b 100644 --- a/modules/cudaoptflow/src/pyrlk.cpp +++ b/modules/cudaoptflow/src/pyrlk.cpp @@ -55,7 +55,9 @@ Ptr cv::cuda::DensePyrLKOpticalFlow::create(Siz namespace pyrlk { - void loadConstants(int2 winSize, int iters, cudaStream_t stream); + void loadConstants(int* winSize, int iters, cudaStream_t stream); + void loadWinSize(int* winSize, int* halfWinSize, cudaStream_t stream); + void loadIters(int* iters, cudaStream_t stream); template struct pyrLK_caller { static void sparse(PtrStepSz::vec_type> I, PtrStepSz::vec_type> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, @@ -88,7 +90,8 @@ namespace void dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, Stream& stream); protected: - Size winSize_; + int winSize_[2]; + int halfWinSize_[2]; int maxLevel_; int iters_; bool useInitialFlow_; @@ -100,8 +103,11 @@ namespace }; PyrLKOpticalFlowBase::PyrLKOpticalFlowBase(Size winSize, int maxLevel, int iters, bool useInitialFlow) : - winSize_(winSize), maxLevel_(maxLevel), iters_(iters), useInitialFlow_(useInitialFlow) + winSize_({winSize.width, winSize.height}), halfWinSize_({(winSize.width - 1) / 2, (winSize.height - 1) / 2}), + maxLevel_(maxLevel), iters_(iters), useInitialFlow_(useInitialFlow) { + pyrlk::loadWinSize(winSize_, halfWinSize_, 0); + pyrlk::loadIters(&iters_, 0); } void calcPatchSize(Size winSize, dim3& block, dim3& patch) @@ -148,7 +154,7 @@ namespace CV_Assert(prevPyr[0].size() == nextPyr[0].size()); CV_Assert(prevPts.rows == 1 && prevPts.type() == CV_32FC2); CV_Assert(maxLevel_ >= 0); - CV_Assert(winSize_.width > 2 && winSize_.height > 2); + CV_Assert(winSize_[0] > 2 && winSize_[1] > 2); if (useInitialFlow_) CV_Assert(nextPts.size() == prevPts.size() && nextPts.type() == prevPts.type()); else @@ -171,9 +177,11 @@ namespace } dim3 block, patch; - calcPatchSize(winSize_, block, patch); + calcPatchSize(Size(winSize_[0], winSize_[1]), block, patch); CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6); - pyrlk::loadConstants(make_int2(winSize_.width, winSize_.height), iters_, StreamAccessor::getStream(stream)); + cudaStream_t stream_ = StreamAccessor::getStream(stream); + pyrlk::loadWinSize(winSize_, halfWinSize_, stream_); + pyrlk::loadIters(&iters_, stream_); const int cn = prevPyr[0].channels(); const int type = prevPyr[0].depth(); @@ -185,12 +193,12 @@ namespace // while ushort does work, it has significantly worse performance, and thus doesn't pass accuracy tests. static const func_t funcs[6][4] = { - { pyrlk::dispatcher , /*pyrlk::dispatcher*/ 0, pyrlk::dispatcher , pyrlk::dispatcher }, - { /*pyrlk::dispatcher*/ 0, /*pyrlk::dispatcher*/ 0, /*pyrlk::dispatcher*/ 0, /*pyrlk::dispatcher*/ 0 }, - { pyrlk::dispatcher , /*pyrlk::dispatcher*/0, pyrlk::dispatcher , pyrlk::dispatcher }, - { /*pyrlk::dispatcher*/ 0, /*pyrlk::dispatcher*/ 0, /*pyrlk::dispatcher*/ 0, /*pyrlk::dispatcher*/0 }, - { pyrlk::dispatcher , /*pyrlk::dispatcher*/ 0, pyrlk::dispatcher , pyrlk::dispatcher }, - { pyrlk::dispatcher , /*pyrlk::dispatcher*/ 0, pyrlk::dispatcher , pyrlk::dispatcher } + { pyrlk::dispatcher , /*pyrlk::dispatcher*/ 0, pyrlk::dispatcher , pyrlk::dispatcher }, + { /*pyrlk::dispatcher*/ 0, /*pyrlk::dispatcher*/ 0, /*pyrlk::dispatcher*/ 0 , /*pyrlk::dispatcher*/ 0 }, + { pyrlk::dispatcher , /*pyrlk::dispatcher*/0, pyrlk::dispatcher , pyrlk::dispatcher }, + { /*pyrlk::dispatcher*/ 0, /*pyrlk::dispatcher*/ 0, /*pyrlk::dispatcher*/ 0 , /*pyrlk::dispatcher*/0 }, + { pyrlk::dispatcher , /*pyrlk::dispatcher*/ 0, pyrlk::dispatcher , pyrlk::dispatcher }, + { pyrlk::dispatcher , /*pyrlk::dispatcher*/ 0, pyrlk::dispatcher , pyrlk::dispatcher } }; func_t func = funcs[type][cn-1]; @@ -201,7 +209,7 @@ namespace prevPts.ptr(), nextPts.ptr(), status.ptr(), level == 0 && err ? err->ptr() : 0, prevPts.cols, level, block, patch, - StreamAccessor::getStream(stream)); + stream_); } } @@ -229,7 +237,7 @@ namespace CV_Assert( prevImg.type() == CV_8UC1 ); CV_Assert( prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type() ); CV_Assert( maxLevel_ >= 0 ); - CV_Assert( winSize_.width > 2 && winSize_.height > 2 ); + CV_Assert( winSize_[0] > 2 && winSize_[1] > 2 ); // build the image pyramids. @@ -262,9 +270,11 @@ namespace vPyr[0].setTo(Scalar::all(0), stream); uPyr[1].setTo(Scalar::all(0), stream); vPyr[1].setTo(Scalar::all(0), stream); - - int2 winSize2i = make_int2(winSize_.width, winSize_.height); - pyrlk::loadConstants(winSize2i, iters_, StreamAccessor::getStream(stream)); + cudaStream_t stream_ = StreamAccessor::getStream(stream); + pyrlk::loadWinSize(winSize_, halfWinSize_, stream_); + pyrlk::loadIters(&iters_, stream_); + int2 winSize2i = make_int2(winSize_[0], winSize_[1]); + //pyrlk::loadConstants(winSize2i, iters_, StreamAccessor::getStream(stream)); int idx = 0; @@ -275,7 +285,7 @@ namespace pyrlk::pyrLK_caller::dense(prevPyr_[level], nextPyr_[level], uPyr[idx], vPyr[idx], uPyr[idx2], vPyr[idx2], PtrStepSzf(), winSize2i, - StreamAccessor::getStream(stream)); + stream_); if (level > 0) idx = idx2; @@ -293,8 +303,13 @@ namespace { } - virtual Size getWinSize() const { return winSize_; } - virtual void setWinSize(Size winSize) { winSize_ = winSize; } + virtual Size getWinSize() const { return cv::Size(winSize_[0], winSize_[1]); } + virtual void setWinSize(Size winSize) { + winSize_[0] = winSize.width; + winSize_[1] = winSize.height; + halfWinSize_[0] = (winSize.width - 1) / 2; + halfWinSize_[1] = (winSize.height -1) / 2; + } virtual int getMaxLevel() const { return maxLevel_; } virtual void setMaxLevel(int maxLevel) { maxLevel_ = maxLevel; } @@ -339,8 +354,13 @@ namespace { } - virtual Size getWinSize() const { return winSize_; } - virtual void setWinSize(Size winSize) { winSize_ = winSize; } + virtual Size getWinSize() const { return cv::Size(winSize_[0], winSize_[1]); } + virtual void setWinSize(Size winSize) { + winSize_[0] = winSize.width; + winSize_[1] = winSize.height; + halfWinSize_[0] = (winSize.width - 1) / 2; + halfWinSize_[1] = (winSize.height -1) / 2; + } virtual int getMaxLevel() const { return maxLevel_; } virtual void setMaxLevel(int maxLevel) { maxLevel_ = maxLevel; }