Mirror of https://github.com/opencv/opencv.git
Merge pull request #11483 from nglee:dev_cudaCannyStreamIssue
cuda_canny : multi stream safety (#11483)

* CUDA_ImgProc/Canny Asynchronous test
* cuda_canny : multi stream safety (1/3)
  - Convert the global variable canny::counter to a class-local variable
* cuda_canny : multi stream safety (2/3)
  - Use texture objects rather than a texture reference for cc >= 3.0, since a texture reference must be declared as a static global variable, which results in a race condition when run concurrently
* cuda_canny : multi stream safety (3/3)
  - Refrain from using global variables in row_filter and column_filter (converts column_filter::c_kernel and row_filter::c_kernel to local variables)
* Fixes #11193
Parent: 959a12cbac
Commit: ed86bd34b1
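The root cause behind all three parts of this change is the same: a texture reference, a __constant__ array, or a __device__ variable is a single per-module global, so every stream that touches it shares one copy. A minimal sketch of the race, with hypothetical names rather than OpenCV code:

    #include <cuda_runtime.h>

    // Both streams upload into the SAME __constant__ symbol before launching,
    // so the second upload can land before the first kernel has read its data.
    __constant__ float c_coeffs[32];

    __global__ void useCoeffs(float* out)
    {
        out[threadIdx.x] = c_coeffs[threadIdx.x];
    }

    void raceProne(const float* h_a, const float* h_b, float* d_outA, float* d_outB,
                   cudaStream_t streamA, cudaStream_t streamB)
    {
        cudaMemcpyToSymbolAsync(c_coeffs, h_a, 32 * sizeof(float), 0,
                                cudaMemcpyHostToDevice, streamA);
        useCoeffs<<<1, 32, 0, streamA>>>(d_outA);

        // May overwrite c_coeffs while the kernel queued on streamA is still pending:
        cudaMemcpyToSymbolAsync(c_coeffs, h_b, 32 * sizeof(float), 0,
                                cudaMemcpyHostToDevice, streamB);
        useCoeffs<<<1, 32, 0, streamB>>>(d_outB);
    }

The fix is to give each call private state: the filter coefficients and the hysteresis counter become per-call arguments, and the texture reference is replaced by a per-call texture object on devices that support one (cc >= 3.0). The hunks below implement this for the separable filters first, then for the Canny host code and kernels, and finally add a regression test.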
@@ -52,10 +52,8 @@ namespace column_filter
 {
     #define MAX_KERNEL_SIZE 32
 
-    __constant__ float c_kernel[MAX_KERNEL_SIZE];
-
     template <int KSIZE, typename T, typename D, typename B>
-    __global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
+    __global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const float* kernel, const int anchor, const B brd)
     {
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
         const int BLOCK_DIM_X = 16;
@@ -135,7 +133,7 @@ namespace column_filter
 
             #pragma unroll
             for (int k = 0; k < KSIZE; ++k)
-                sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];
+                sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * kernel[k];
 
             dst(y, x) = saturate_cast<D>(sum);
         }
@@ -143,7 +141,7 @@ namespace column_filter
     }
 
     template <int KSIZE, typename T, typename D, template<typename> class B>
-    void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
+    void caller(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream)
     {
         int BLOCK_DIM_X;
         int BLOCK_DIM_Y;
@@ -167,7 +165,7 @@ namespace column_filter
 
         B<T> brd(src.rows);
 
-        linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
+        linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, kernel, anchor, brd);
 
         cudaSafeCall( cudaGetLastError() );
 
@@ -181,7 +179,7 @@ namespace filter
     template <typename T, typename D>
     void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
     {
-        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
+        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream);
 
         static const caller_t callers[5][33] =
         {
@@ -362,11 +360,6 @@ namespace filter
             }
         };
 
-        if (stream == 0)
-            cudaSafeCall( cudaMemcpyToSymbol(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
-        else
-            cudaSafeCall( cudaMemcpyToSymbolAsync(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
-
-        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
+        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, kernel, anchor, cc, stream);
     }
 }
@@ -52,10 +52,8 @@ namespace row_filter
 {
     #define MAX_KERNEL_SIZE 32
 
-    __constant__ float c_kernel[MAX_KERNEL_SIZE];
-
     template <int KSIZE, typename T, typename D, typename B>
-    __global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
+    __global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const float* kernel, const int anchor, const B brd)
     {
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
         const int BLOCK_DIM_X = 32;
@@ -135,7 +133,7 @@ namespace row_filter
 
             #pragma unroll
             for (int k = 0; k < KSIZE; ++k)
-                sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k];
+                sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * kernel[k];
 
             dst(y, x) = saturate_cast<D>(sum);
         }
@@ -143,7 +141,7 @@ namespace row_filter
     }
 
     template <int KSIZE, typename T, typename D, template<typename> class B>
-    void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
+    void caller(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream)
     {
         int BLOCK_DIM_X;
         int BLOCK_DIM_Y;
@@ -167,7 +165,7 @@ namespace row_filter
 
         B<T> brd(src.cols);
 
-        linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
+        linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, kernel, anchor, brd);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
@@ -180,7 +178,7 @@ namespace filter
     template <typename T, typename D>
     void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
     {
-        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
+        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream);
 
         static const caller_t callers[5][33] =
         {
@@ -361,11 +359,6 @@ namespace filter
             }
         };
 
-        if (stream == 0)
-            cudaSafeCall( cudaMemcpyToSymbol(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
-        else
-            cudaSafeCall( cudaMemcpyToSymbolAsync(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
-
-        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
+        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, kernel, anchor, cc, stream);
     }
 }
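The column_filter and row_filter hunks above are mirror images: the coefficients move out of __constant__ memory and travel as a const float* argument all the way down to the kernel, so each invocation reads its own buffer. A sketch of the resulting calling convention; only the filter::linearColumn signature comes from the diff, while the wrapper and its buffer management are illustrative:

    #include <cuda_runtime.h>
    #include "opencv2/core/cuda/common.hpp"

    using namespace cv::cuda;

    namespace filter
    {
        // Declared as in the diff above: the coefficients are now a device pointer.
        template <typename T, typename D>
        void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel,
                          int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
    }

    // Illustrative wrapper: every call owns its coefficient buffer, so two host
    // threads filtering on different streams cannot overwrite each other's state.
    void runColumnFilter(PtrStepSzb src, PtrStepSzb dst, const float* h_kernel,
                         int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
    {
        float* d_kernel = 0;
        cudaSafeCall( cudaMalloc(&d_kernel, ksize * sizeof(float)) );
        cudaSafeCall( cudaMemcpyAsync(d_kernel, h_kernel, ksize * sizeof(float),
                                      cudaMemcpyHostToDevice, stream) );

        filter::linearColumn<float, float>(src, dst, d_kernel, ksize, anchor, brd_type, cc, stream);

        cudaSafeCall( cudaStreamSynchronize(stream) );  // the kernel must finish before the free
        cudaSafeCall( cudaFree(d_kernel) );
    }

Reading the coefficients from global memory instead of __constant__ memory costs little here: every thread of a warp reads the same kernel[k] in each loop iteration, so the load coalesces to a single cached transaction.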
@@ -58,9 +58,9 @@ namespace canny
 
     void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream);
 
-    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream);
+    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, int* d_counter, cudaStream_t stream);
 
-    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream);
+    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, int* d_counter, cudaStream_t stream);
 
     void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream);
 }
@@ -127,6 +127,8 @@ namespace
         Ptr<Filter> filterDX_, filterDY_;
 #endif
         int old_apperture_size_;
+
+        int *d_counter;
     };
 
     void CannyImpl::detect(InputArray _image, OutputArray _edges, Stream& stream)
@@ -218,12 +220,17 @@ namespace
 
     void CannyImpl::CannyCaller(GpuMat& edges, Stream& stream)
     {
-        map_.setTo(Scalar::all(0));
+        map_.setTo(Scalar::all(0), stream);
 
         canny::calcMap(dx_, dy_, mag_, map_, static_cast<float>(low_thresh_), static_cast<float>(high_thresh_), StreamAccessor::getStream(stream));
 
-        canny::edgesHysteresisLocal(map_, st1_.ptr<short2>(), StreamAccessor::getStream(stream));
-        canny::edgesHysteresisGlobal(map_, st1_.ptr<short2>(), st2_.ptr<short2>(), StreamAccessor::getStream(stream));
+        cudaSafeCall( cudaMalloc(&d_counter, sizeof(int)) );
+
+        canny::edgesHysteresisLocal(map_, st1_.ptr<short2>(), d_counter, StreamAccessor::getStream(stream));
+
+        canny::edgesHysteresisGlobal(map_, st1_.ptr<short2>(), st2_.ptr<short2>(), d_counter, StreamAccessor::getStream(stream));
+
+        cudaSafeCall( cudaFree(d_counter) );
 
         canny::getEdges(map_, edges, StreamAccessor::getStream(stream));
     }
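On the host side, each CannyImpl now owns a d_counter that lives for exactly one CannyCaller invocation, replacing the former __device__ int canny::counter that every instance and every stream shared. A sketch of the pattern, not OpenCV code:

    #include <cuda_runtime.h>

    // Candidate-counting kernel: accumulates into a per-call counter
    // passed as an argument, never into a __device__ global.
    __global__ void countCandidates(const int* map, int n, int* d_counter)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n && map[i] == 1)        // "might belong to an edge"
            atomicAdd(d_counter, 1);
    }

    void processOnStream(const int* d_map, int n, cudaStream_t stream)
    {
        int* d_counter = 0;
        cudaMalloc(&d_counter, sizeof(int));                // per-call allocation
        cudaMemsetAsync(d_counter, 0, sizeof(int), stream);

        countCandidates<<<(n + 255) / 256, 256, 0, stream>>>(d_map, n, d_counter);

        int count = 0;
        cudaMemcpyAsync(&count, d_counter, sizeof(int), cudaMemcpyDeviceToHost, stream);
        cudaStreamSynchronize(stream);                      // count is valid from here on
        cudaFree(d_counter);
    }

Two detectors running concurrently used to reset and increment the same symbol, so one call could observe the other's count; with a private counter per call that interference is gone. The map_.setTo(Scalar::all(0), stream) change belongs to the same cleanup: the zero-fill now runs on the caller's stream instead of the default one.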
@@ -47,6 +47,7 @@
 #include "opencv2/core/cuda/transform.hpp"
 #include "opencv2/core/cuda/functional.hpp"
 #include "opencv2/core/cuda/utility.hpp"
+#include "opencv2/core/cuda.hpp"
 
 using namespace cv::cuda;
 using namespace cv::cuda::device;
@@ -102,6 +103,20 @@ namespace canny
         }
     };
 
+    struct SrcTexObject
+    {
+        int xoff;
+        int yoff;
+        cudaTextureObject_t tex_src_object;
+        __host__ SrcTexObject(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : xoff(_xoff), yoff(_yoff), tex_src_object(_tex_src_object) { }
+
+        __device__ __forceinline__ int operator ()(int y, int x) const
+        {
+            return tex2D<uchar>(tex_src_object, x + xoff, y + yoff);
+        }
+
+    };
+
     template <class Norm> __global__
     void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
     {
@@ -120,11 +135,75 @@ namespace canny
         mag(y, x) = norm(dxVal, dyVal);
     }
 
+    template <class Norm> __global__
+    void calcMagnitudeKernel(const SrcTexObject src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (y >= mag.rows || x >= mag.cols)
+            return;
+
+        int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
+        int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
+
+        dx(y, x) = dxVal;
+        dy(y, x) = dyVal;
+
+        mag(y, x) = norm(dxVal, dyVal);
+    }
+
     void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
     {
         const dim3 block(16, 16);
         const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
 
-        bindTexture(&tex_src, srcWhole);
-        SrcTex src(xoff, yoff);
+        bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
+
+        if (cc30)
+        {
+            cudaResourceDesc resDesc;
+            memset(&resDesc, 0, sizeof(resDesc));
+            resDesc.resType = cudaResourceTypePitch2D;
+            resDesc.res.pitch2D.devPtr = srcWhole.ptr();
+            resDesc.res.pitch2D.height = srcWhole.rows;
+            resDesc.res.pitch2D.width = srcWhole.cols;
+            resDesc.res.pitch2D.pitchInBytes = srcWhole.step;
+            resDesc.res.pitch2D.desc = cudaCreateChannelDesc<uchar>();
+
+            cudaTextureDesc texDesc;
+            memset(&texDesc, 0, sizeof(texDesc));
+            texDesc.addressMode[0] = cudaAddressModeClamp;
+            texDesc.addressMode[1] = cudaAddressModeClamp;
+            texDesc.addressMode[2] = cudaAddressModeClamp;
+
+            cudaTextureObject_t tex = 0;
+            cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
+
+            SrcTexObject src(xoff, yoff, tex);
+
+            if (L2Grad)
+            {
+                L2 norm;
+                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
+            }
+            else
+            {
+                L1 norm;
+                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
+            }
+
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == NULL)
+                cudaSafeCall( cudaDeviceSynchronize() );
+            else
+                cudaSafeCall( cudaStreamSynchronize(stream) );
+
+            cudaSafeCall( cudaDestroyTextureObject(tex) );
+        }
+        else
+        {
+            bindTexture(&tex_src, srcWhole);
+            SrcTex src(xoff, yoff);
 
@@ -144,6 +223,7 @@ namespace canny
 
         if (stream == NULL)
             cudaSafeCall( cudaDeviceSynchronize() );
+        }
     }
 
     void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
     {
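A texture object packages everything a texture reference kept in static global state into a first-class value that is created, passed to the kernel by value, and destroyed per call; that is what makes the cc >= 3.0 path above safe to run from several streams at once. The skeleton of the pattern, assuming an existing pitched 2D uchar image:

    #include <cuda_runtime.h>
    #include <cstring>

    // Create a texture object over existing pitched 2D device memory
    // (condensed from the hunk above; error handling omitted).
    cudaTextureObject_t makePitched2DTexture(void* devPtr, size_t pitchInBytes,
                                             int rows, int cols)
    {
        cudaResourceDesc resDesc;
        memset(&resDesc, 0, sizeof(resDesc));
        resDesc.resType = cudaResourceTypePitch2D;
        resDesc.res.pitch2D.devPtr = devPtr;
        resDesc.res.pitch2D.height = rows;
        resDesc.res.pitch2D.width = cols;
        resDesc.res.pitch2D.pitchInBytes = pitchInBytes;
        resDesc.res.pitch2D.desc = cudaCreateChannelDesc<unsigned char>();

        cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        texDesc.addressMode[0] = cudaAddressModeClamp;
        texDesc.addressMode[1] = cudaAddressModeClamp;

        cudaTextureObject_t tex = 0;
        cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
        return tex;   // kernels read it with tex2D<unsigned char>(tex, x, y)
    }

Destroying the object right after the launch is also why the asynchronous path gains a cudaStreamSynchronize(stream) before cudaDestroyTextureObject: the object must outlive every kernel that reads from it.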
@@ -165,7 +245,6 @@ namespace canny
 namespace canny
 {
     texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
-
     __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
     {
         const int CANNY_SHIFT = 15;
@@ -218,33 +297,116 @@ namespace canny
         map(y, x) = edge_type;
     }
 
+    __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh, cudaTextureObject_t tex_mag)
+    {
+        const int CANNY_SHIFT = 15;
+        const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
+
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
+            return;
+
+        int dxVal = dx(y, x);
+        int dyVal = dy(y, x);
+
+        const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
+        const float m = tex2D<float>(tex_mag, x, y);
+
+        dxVal = ::abs(dxVal);
+        dyVal = ::abs(dyVal);
+
+        // 0 - the pixel can not belong to an edge
+        // 1 - the pixel might belong to an edge
+        // 2 - the pixel does belong to an edge
+        int edge_type = 0;
+
+        if (m > low_thresh)
+        {
+            const int tg22x = dxVal * TG22;
+            const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
+
+            dyVal <<= CANNY_SHIFT;
+
+            if (dyVal < tg22x)
+            {
+                if (m > tex2D<float>(tex_mag, x - 1, y) && m >= tex2D<float>(tex_mag, x + 1, y))
+                    edge_type = 1 + (int)(m > high_thresh);
+            }
+            else if(dyVal > tg67x)
+            {
+                if (m > tex2D<float>(tex_mag, x, y - 1) && m >= tex2D<float>(tex_mag, x, y + 1))
+                    edge_type = 1 + (int)(m > high_thresh);
+            }
+            else
+            {
+                if (m > tex2D<float>(tex_mag, x - s, y - 1) && m >= tex2D<float>(tex_mag, x + s, y + 1))
+                    edge_type = 1 + (int)(m > high_thresh);
+            }
+        }
+
+        map(y, x) = edge_type;
+    }
+
     void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream)
     {
         const dim3 block(16, 16);
         const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
 
-        bindTexture(&tex_mag, mag);
-
-        calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == NULL)
-            cudaSafeCall( cudaDeviceSynchronize() );
+        if (deviceSupports(FEATURE_SET_COMPUTE_30))
+        {
+            // Use the texture object
+            cudaResourceDesc resDesc;
+            memset(&resDesc, 0, sizeof(resDesc));
+            resDesc.resType = cudaResourceTypePitch2D;
+            resDesc.res.pitch2D.devPtr = mag.ptr();
+            resDesc.res.pitch2D.height = mag.rows;
+            resDesc.res.pitch2D.width = mag.cols;
+            resDesc.res.pitch2D.pitchInBytes = mag.step;
+            resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
+
+            cudaTextureDesc texDesc;
+            memset(&texDesc, 0, sizeof(texDesc));
+            texDesc.addressMode[0] = cudaAddressModeClamp;
+            texDesc.addressMode[1] = cudaAddressModeClamp;
+            texDesc.addressMode[2] = cudaAddressModeClamp;
+
+            cudaTextureObject_t tex = 0;
+            cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
+            calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh, tex);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == NULL)
+                cudaSafeCall( cudaDeviceSynchronize() );
+            else
+                cudaSafeCall( cudaStreamSynchronize(stream) );
+
+            cudaSafeCall( cudaDestroyTextureObject(tex) );
+        }
+        else
+        {
+            // Use the texture reference
+            bindTexture(&tex_mag, mag);
+            calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == NULL)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
     }
 }
 
 //////////////////////////////////////////////////////////////////////////////////////////
 
 namespace canny
 {
-    __device__ int counter = 0;
-
     __device__ __forceinline__ bool checkIdx(int y, int x, int rows, int cols)
     {
         return (y >= 0) && (y < rows) && (x >= 0) && (x < cols);
     }
 
-    __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, short2* st)
+    __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, short2* st, int* d_counter)
     {
         __shared__ volatile int smem[18][18];
 
@@ -325,22 +487,19 @@ namespace canny
 
         if (n > 0)
        {
-            const int ind = ::atomicAdd(&counter, 1);
+            const int ind = ::atomicAdd(d_counter, 1);
            st[ind] = make_short2(x, y);
        }
    }
 
-    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream)
+    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, int* d_counter, cudaStream_t stream)
    {
-        void* counter_ptr;
-        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
-
-        cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );
+        cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(int), stream) );
 
        const dim3 block(16, 16);
        const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));
 
-        edgesHysteresisLocalKernel<<<grid, block, 0, stream>>>(map, st1);
+        edgesHysteresisLocalKernel<<<grid, block, 0, stream>>>(map, st1, d_counter);
        cudaSafeCall( cudaGetLastError() );
 
        if (stream == NULL)
@@ -355,7 +514,7 @@ namespace canny
    __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
    __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
 
-    __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, short2* st1, short2* st2, const int count)
+    __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, short2* st1, short2* st2, int* d_counter, const int count)
    {
        const int stack_size = 512;
 
@@ -429,7 +588,7 @@ namespace canny
        {
            if (threadIdx.x == 0)
            {
-                s_ind = ::atomicAdd(&counter, s_counter);
+                s_ind = ::atomicAdd(d_counter, s_counter);
 
                if (s_ind + s_counter > map.cols * map.rows)
                    s_counter = 0;
@@ -444,29 +603,26 @@ namespace canny
        }
    }
 
-    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream)
+    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, int* d_counter, cudaStream_t stream)
    {
-        void* counter_ptr;
-        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );
-
        int count;
-        cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+        cudaSafeCall( cudaMemcpyAsync(&count, d_counter, sizeof(int), cudaMemcpyDeviceToHost, stream) );
        cudaSafeCall( cudaStreamSynchronize(stream) );
 
        while (count > 0)
        {
-            cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );
+            cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(int), stream) );
 
            const dim3 block(128);
            const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);
 
-            edgesHysteresisGlobalKernel<<<grid, block, 0, stream>>>(map, st1, st2, count);
+            edgesHysteresisGlobalKernel<<<grid, block, 0, stream>>>(map, st1, st2, d_counter, count);
            cudaSafeCall( cudaGetLastError() );
 
            if (stream == NULL)
                cudaSafeCall( cudaDeviceSynchronize() );
 
-            cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+            cudaSafeCall( cudaMemcpyAsync(&count, d_counter, sizeof(int), cudaMemcpyDeviceToHost, stream) );
            cudaSafeCall( cudaStreamSynchronize(stream) );
 
            count = min(count, map.cols * map.rows);
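The global-hysteresis stage is a host-driven fixed-point loop: reset the counter, launch a pass that pops pixels from one queue and pushes newly reached neighbours onto the other, read the count back, and repeat until no new pixels were queued. With d_counter now a per-call pointer, the handshake condenses to the sketch below; the helper names are hypothetical, and the queue swap mirrors the ping-pong between st1 and st2 in the real implementation:

    #include <cuda_runtime.h>

    // Hypothetical helper: read the device-side counter back to the host.
    static int readCounter(int* d_counter, cudaStream_t stream)
    {
        int count = 0;
        cudaMemcpyAsync(&count, d_counter, sizeof(int), cudaMemcpyDeviceToHost, stream);
        cudaStreamSynchronize(stream);   // the loop condition below needs the value
        return count;
    }

    void hysteresisLoop(short2* st1, short2* st2, int* d_counter, int maxItems,
                        cudaStream_t stream)
    {
        int count = readCounter(d_counter, stream);
        while (count > 0)
        {
            cudaMemsetAsync(d_counter, 0, sizeof(int), stream);

            // launch edgesHysteresisGlobalKernel(map, st1, st2, d_counter, count) here

            count = readCounter(d_counter, stream);
            if (count > maxItems)                      // clamp, as in the diff above
                count = maxItems;
            short2* tmp = st1; st1 = st2; st2 = tmp;   // swap producer and consumer queues
        }
    }

Because the counter round-trips through the host on every iteration, the cudaStreamSynchronize calls are unavoidable; the point of the change is only that two concurrent Canny calls now wait on their own counters instead of a shared one.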
@@ -92,9 +92,66 @@ CUDA_TEST_P(Canny, Accuracy)
     EXPECT_MAT_SIMILAR(edges_gold, edges, 2e-2);
 }
 
+class CannyAsyncParallelLoopBody : public cv::ParallelLoopBody
+{
+public:
+    CannyAsyncParallelLoopBody(const cv::cuda::GpuMat& d_img_, cv::cuda::GpuMat* edges_, double low_thresh_, double high_thresh_, int apperture_size_, bool useL2gradient_)
+        : d_img(d_img_), edges(edges_), low_thresh(low_thresh_), high_thresh(high_thresh_), apperture_size(apperture_size_), useL2gradient(useL2gradient_) {}
+    ~CannyAsyncParallelLoopBody() {};
+    void operator()(const cv::Range& r) const
+    {
+        for (int i = r.start; i < r.end; i++) {
+            cv::cuda::Stream stream;
+            cv::Ptr<cv::cuda::CannyEdgeDetector> canny = cv::cuda::createCannyEdgeDetector(low_thresh, high_thresh, apperture_size, useL2gradient);
+            canny->detect(d_img, edges[i], stream);
+            stream.waitForCompletion();
+        }
+    }
+protected:
+    const cv::cuda::GpuMat& d_img;
+    cv::cuda::GpuMat* edges;
+    double low_thresh;
+    double high_thresh;
+    int apperture_size;
+    bool useL2gradient;
+};
+
+#define NUM_STREAMS 64
+
+CUDA_TEST_P(Canny, Async)
+{
+    if (!supportFeature(devInfo, cv::cuda::FEATURE_SET_COMPUTE_30))
+    {
+        throw SkipTestException("CUDA device doesn't support texture objects");
+    }
+    else
+    {
+        const cv::Mat img = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
+        ASSERT_FALSE(img.empty());
+
+        const cv::cuda::GpuMat d_img_roi = loadMat(img, useRoi);
+
+        double low_thresh = 50.0;
+        double high_thresh = 100.0;
+
+        // Synchronous call
+        cv::Ptr<cv::cuda::CannyEdgeDetector> canny = cv::cuda::createCannyEdgeDetector(low_thresh, high_thresh, apperture_size, useL2gradient);
+        cv::cuda::GpuMat edges_gold;
+        canny->detect(d_img_roi, edges_gold);
+
+        // Asynchronous call
+        cv::cuda::GpuMat edges[NUM_STREAMS];
+        cv::parallel_for_(cv::Range(0, NUM_STREAMS), CannyAsyncParallelLoopBody(d_img_roi, edges, low_thresh, high_thresh, apperture_size, useL2gradient));
+
+        // Compare the results of synchronous call and asynchronous call
+        for (int i = 0; i < NUM_STREAMS; i++)
+            EXPECT_MAT_NEAR(edges_gold, edges[i], 0.0);
+    }
+}
+
 INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, Canny, testing::Combine(
     ALL_DEVICES,
-    testing::Values(AppertureSize(3), AppertureSize(5)),
+    testing::Values(AppertureSize(3), AppertureSize(5), AppertureSize(7)),
     testing::Values(L2gradient(false), L2gradient(true)),
     WHOLE_SUBMAT));