cuda optflow tvl1 : async safety

also modify cuda canny to use createTextureObjectPitch2D, etc.
This commit is contained in:
Namgoo Lee 2020-06-17 01:04:22 +09:00
parent 411ce04f54
commit 2043e06102
4 changed files with 138 additions and 54 deletions

View File

@ -101,6 +101,20 @@ namespace cv { namespace cuda
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>(); cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) ); cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
} }
template<class T> inline void createTextureObjectPitch2D(cudaTextureObject_t* tex, PtrStepSz<T>& img, const cudaTextureDesc& texDesc)
{
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = static_cast<void*>(img.ptr());
resDesc.res.pitch2D.height = img.rows;
resDesc.res.pitch2D.width = img.cols;
resDesc.res.pitch2D.pitchInBytes = img.step;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaCreateTextureObject(tex, &resDesc, &texDesc, NULL) );
}
} }
}} }}

View File

@ -90,53 +90,47 @@ namespace cv { namespace cuda { namespace device
namespace canny namespace canny
{ {
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
struct SrcTex struct SrcTex
{ {
int xoff; virtual ~SrcTex() {}
int yoff;
__host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {} __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
__device__ __forceinline__ int operator ()(int y, int x) const __device__ __forceinline__ virtual int operator ()(int y, int x) const = 0;
int xoff;
int yoff;
};
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
struct SrcTexRef : SrcTex
{
__host__ SrcTexRef(int _xoff, int _yoff) : SrcTex(_xoff, _yoff) {}
__device__ __forceinline__ int operator ()(int y, int x) const override
{ {
return tex2D(tex_src, x + xoff, y + yoff); return tex2D(tex_src, x + xoff, y + yoff);
} }
}; };
struct SrcTexObject struct SrcTexObj : SrcTex
{ {
int xoff; __host__ SrcTexObj(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : SrcTex(_xoff, _yoff), tex_src_object(_tex_src_object) { }
int yoff;
cudaTextureObject_t tex_src_object;
__host__ SrcTexObject(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : xoff(_xoff), yoff(_yoff), tex_src_object(_tex_src_object) { }
__device__ __forceinline__ int operator ()(int y, int x) const __device__ __forceinline__ int operator ()(int y, int x) const override
{ {
return tex2D<uchar>(tex_src_object, x + xoff, y + yoff); return tex2D<uchar>(tex_src_object, x + xoff, y + yoff);
} }
cudaTextureObject_t tex_src_object;
}; };
template <class Norm> __global__ template <
void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm) class T,
{ class Norm,
const int x = blockIdx.x * blockDim.x + threadIdx.x; typename = std::enable_if_t<std::is_base_of<SrcTex, T>::value>
const int y = blockIdx.y * blockDim.y + threadIdx.y; >
__global__ void calcMagnitudeKernel(const T src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
if (y >= mag.rows || x >= mag.cols)
return;
int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
dx(y, x) = dxVal;
dy(y, x) = dyVal;
mag(y, x) = norm(dxVal, dyVal);
}
template <class Norm> __global__
void calcMagnitudeKernel(const SrcTexObject src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -162,15 +156,6 @@ namespace canny
if (cc30) if (cc30)
{ {
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = srcWhole.ptr();
resDesc.res.pitch2D.height = srcWhole.rows;
resDesc.res.pitch2D.width = srcWhole.cols;
resDesc.res.pitch2D.pitchInBytes = srcWhole.step;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<uchar>();
cudaTextureDesc texDesc; cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc)); memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp; texDesc.addressMode[0] = cudaAddressModeClamp;
@ -178,9 +163,9 @@ namespace canny
texDesc.addressMode[2] = cudaAddressModeClamp; texDesc.addressMode[2] = cudaAddressModeClamp;
cudaTextureObject_t tex = 0; cudaTextureObject_t tex = 0;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL); createTextureObjectPitch2D(&tex, srcWhole, texDesc);
SrcTexObject src(xoff, yoff, tex); SrcTexObj src(xoff, yoff, tex);
if (L2Grad) if (L2Grad)
{ {
@ -205,7 +190,7 @@ namespace canny
else else
{ {
bindTexture(&tex_src, srcWhole); bindTexture(&tex_src, srcWhole);
SrcTex src(xoff, yoff); SrcTexRef src(xoff, yoff);
if (L2Grad) if (L2Grad)
{ {

View File

@ -116,7 +116,7 @@ protected:
bool useL2gradient; bool useL2gradient;
}; };
#define NUM_STREAMS 64 #define NUM_STREAMS 128
CUDA_TEST_P(Canny, Async) CUDA_TEST_P(Canny, Async)
{ {

View File

@ -45,6 +45,7 @@
#include "opencv2/core/cuda/common.hpp" #include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp" #include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/limits.hpp" #include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda.hpp"
using namespace cv::cuda; using namespace cv::cuda;
using namespace cv::cuda::device; using namespace cv::cuda::device;
@ -101,11 +102,64 @@ namespace tvl1flow
} }
} }
struct SrcTex
{
virtual ~SrcTex() {}
__device__ __forceinline__ virtual float I1(float x, float y) const = 0;
__device__ __forceinline__ virtual float I1x(float x, float y) const = 0;
__device__ __forceinline__ virtual float I1y(float x, float y) const = 0;
};
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1 (false, cudaFilterModePoint, cudaAddressModeClamp); texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1 (false, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1x(false, cudaFilterModePoint, cudaAddressModeClamp); texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1x(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1y(false, cudaFilterModePoint, cudaAddressModeClamp); texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1y(false, cudaFilterModePoint, cudaAddressModeClamp);
struct SrcTexRef : SrcTex
{
__device__ __forceinline__ float I1(float x, float y) const override
{
return tex2D(tex_I1, x, y);
}
__device__ __forceinline__ float I1x(float x, float y) const override
{
return tex2D(tex_I1x, x, y);
}
__device__ __forceinline__ float I1y(float x, float y) const override
{
return tex2D(tex_I1y, x, y);
}
};
__global__ void warpBackwardKernel(const PtrStepSzf I0, const PtrStepf u1, const PtrStepf u2, PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho) struct SrcTexObj : SrcTex
{
__host__ SrcTexObj(cudaTextureObject_t tex_obj_I1_, cudaTextureObject_t tex_obj_I1x_, cudaTextureObject_t tex_obj_I1y_)
: tex_obj_I1(tex_obj_I1_), tex_obj_I1x(tex_obj_I1x_), tex_obj_I1y(tex_obj_I1y_) {}
__device__ __forceinline__ float I1(float x, float y) const override
{
return tex2D<float>(tex_obj_I1, x, y);
}
__device__ __forceinline__ float I1x(float x, float y) const override
{
return tex2D<float>(tex_obj_I1x, x, y);
}
__device__ __forceinline__ float I1y(float x, float y) const override
{
return tex2D<float>(tex_obj_I1y, x, y);
}
cudaTextureObject_t tex_obj_I1;
cudaTextureObject_t tex_obj_I1x;
cudaTextureObject_t tex_obj_I1y;
};
template <
typename T,
typename = std::enable_if_t<std::is_base_of<SrcTex, T>::value>
>
__global__ void warpBackwardKernel(
const PtrStepSzf I0, const T src, const PtrStepf u1, const PtrStepf u2,
PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -136,9 +190,9 @@ namespace tvl1flow
{ {
const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy); const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
sum += w * tex2D(tex_I1 , cx, cy); sum += w * src.I1(cx, cy);
sumx += w * tex2D(tex_I1x, cx, cy); sumx += w * src.I1x(cx, cy);
sumy += w * tex2D(tex_I1y, cx, cy); sumy += w * src.I1y(cx, cy);
wsum += w; wsum += w;
} }
@ -173,15 +227,46 @@ namespace tvl1flow
const dim3 block(32, 8); const dim3 block(32, 8);
const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y)); const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
bindTexture(&tex_I1 , I1); bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
bindTexture(&tex_I1x, I1x);
bindTexture(&tex_I1y, I1y);
warpBackwardKernel<<<grid, block, 0, stream>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho); if (cc30)
cudaSafeCall( cudaGetLastError() ); {
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.addressMode[2] = cudaAddressModeClamp;
if (!stream) cudaTextureObject_t texObj_I1 = 0, texObj_I1x = 0, texObj_I1y = 0;
cudaSafeCall( cudaDeviceSynchronize() );
createTextureObjectPitch2D(&texObj_I1, I1, texDesc);
createTextureObjectPitch2D(&texObj_I1x, I1x, texDesc);
createTextureObjectPitch2D(&texObj_I1y, I1y, texDesc);
warpBackwardKernel << <grid, block, 0, stream >> > (I0, SrcTexObj(texObj_I1, texObj_I1x, texObj_I1y), u1, u2, I1w, I1wx, I1wy, grad, rho);
cudaSafeCall(cudaGetLastError());
if (!stream)
cudaSafeCall(cudaDeviceSynchronize());
else
cudaSafeCall(cudaStreamSynchronize(stream));
cudaSafeCall(cudaDestroyTextureObject(texObj_I1));
cudaSafeCall(cudaDestroyTextureObject(texObj_I1x));
cudaSafeCall(cudaDestroyTextureObject(texObj_I1y));
}
else
{
bindTexture(&tex_I1, I1);
bindTexture(&tex_I1x, I1x);
bindTexture(&tex_I1y, I1y);
warpBackwardKernel << <grid, block, 0, stream >> > (I0, SrcTexRef(), u1, u2, I1w, I1wx, I1wy, grad, rho);
cudaSafeCall(cudaGetLastError());
if (!stream)
cudaSafeCall(cudaDeviceSynchronize());
}
} }
} }