fix CUDA support for streams for NMS; refactor tests

This commit is contained in:
marina.kolpakova 2012-11-26 17:53:25 +04:00
parent 60c0e41ba5
commit bd3179bda8
4 changed files with 78 additions and 51 deletions

View File

@ -134,7 +134,8 @@ namespace icf {
}
}
void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections, PtrStepSzb suppressed)
void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections,
PtrStepSzb suppressed, cudaStream_t stream)
{
int block = 192;
int grid = 1;
@ -146,7 +147,7 @@ namespace icf {
overlap<<<grid, block>>>((uint*)ndetections.ptr(0), (uchar*)overlaps.ptr(0));
collect<<<grid, block>>>((uint*)ndetections.ptr(0), (uchar*)overlaps.ptr(0), (uint*)suppressed.ptr(0), ((uint4*)suppressed.ptr(0)) + 1);
// if (!stream)
if (!stream)
{
cudaSafeCall( cudaGetLastError());
cudaSafeCall( cudaDeviceSynchronize());
@ -330,15 +331,15 @@ __global__ void soft_cascade(const CascadeInvoker<Policy> invoker, Detection* ob
template<typename Policy>
void CascadeInvoker<Policy>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const cudaStream_t& stream) const
PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const
{
int fw = roi.rows;
int fh = roi.cols;
dim3 grid(fw, fh / Policy::STA_Y, downscales);
uint* ctr = (uint*)(counter.ptr(0));
Detection* det = (Detection*)objects.ptr();
uint* ctr = (uint*)(objects.ptr(0));
Detection* det = ((Detection*)objects.ptr(0)) + 1;
uint max_det = objects.cols / sizeof(Detection);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
@ -363,7 +364,7 @@ void CascadeInvoker<Policy>::operator()(const PtrStepSzb& roi, const PtrStepSzi&
}
template void CascadeInvoker<GK107PolicyX4>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const cudaStream_t& stream) const;
PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const;
}
}}}

View File

@ -147,7 +147,7 @@ struct CascadeInvoker
int scales;
void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
PtrStepSzi counter, const int downscales, const cudaStream_t& stream = 0) const;
const int downscales, const cudaStream_t& stream = 0) const;
template<bool isUp>
__device void detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const;

View File

@ -86,7 +86,8 @@ namespace icf {
void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
const int fw, const int fh, const int bins, cudaStream_t stream);
void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections, PtrStepSzb suppressed);
void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections,
PtrStepSzb suppressed, cudaStream_t stream);
}
namespace imgproc {
@ -328,13 +329,20 @@ struct cv::gpu::SCascade::Fields
leaves.upload(hleaves);
}
void detect(const cv::gpu::GpuMat& roi, const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, const cudaStream_t& stream) const
void detect(const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, Stream& s) const
{
cudaMemset(count.data, 0, sizeof(Detection));
if (s)
s.enqueueMemSet(objects, 0);
else
cudaMemset(objects.data, 0, sizeof(Detection));
cudaSafeCall( cudaGetLastError());
device::icf::CascadeInvoker<device::icf::GK107PolicyX4> invoker
= device::icf::CascadeInvoker<device::icf::GK107PolicyX4>(levels, stages, nodes, leaves);
invoker(roi, hogluv, objects, count, downscales, stream);
cudaStream_t stream = StreamAccessor::getStream(s);
invoker(roi, hogluv, objects, downscales, stream);
}
void preprocess(const cv::gpu::GpuMat& colored, Stream& s)
@ -356,6 +364,26 @@ struct cv::gpu::SCascade::Fields
integrate(fh, fw, s);
}
void suppress(GpuMat& objects, Stream& s)
{
GpuMat ndetections = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
ensureSizeIsEnough(objects.rows, objects.cols, CV_8UC1, overlaps);
if (s)
{
s.enqueueMemSet(overlaps, 0);
s.enqueueMemSet(suppressed, 0);
}
else
{
overlaps.setTo(0);
suppressed.setTo(0);
}
cudaStream_t stream = StreamAccessor::getStream(s);
device::icf::suppress(objects, overlaps, ndetections, suppressed, stream);
}
private:
typedef std::vector<device::icf::Octave>::const_iterator octIt_t;
@ -442,17 +470,7 @@ private:
}
}
#include <iostream>
public:
void suppress(GpuMat& ndetections, GpuMat& objects)
{
ensureSizeIsEnough(objects.rows, objects.cols, CV_8UC1, overlaps);
overlaps.setTo(0);
suppressed.setTo(0);
device::icf::suppress(objects, overlaps, ndetections, suppressed);
// std::cout << cv::Mat(overlaps) << std::endl;
}
// scales range
float minScale;
@ -547,20 +565,18 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
}
else
{
colored.copyTo(flds.hogluv);
if (s)
s.enqueueCopy(colored, flds.hogluv);
else
colored.copyTo(flds.hogluv);
}
GpuMat spr(objects, cv::Rect(0, 0, flds.suppressed.cols, flds.suppressed.rows));
GpuMat tmp = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols - sizeof(Detection), 1));
cudaStream_t stream = StreamAccessor::getStream(s);
flds.detect(rois, tmp, objects, stream);
flds.detect(rois, objects, s);
if (rejCriteria != NO_REJECT)
{
flds.suppress(tmp, objects);
GpuMat spr(objects, cv::Rect(0, 0, flds.suppressed.cols, flds.suppressed.rows));
flds.suppress(objects, s);
flds.suppressed.copyTo(spr);
}
}

View File

@ -47,7 +47,7 @@
using cv::gpu::GpuMat;
// show detection results on input image with cv::imshow
#define SHOW_DETECTIONS
// #define SHOW_DETECTIONS
#if defined SHOW_DETECTIONS
# define SHOW(res) \
@ -99,6 +99,35 @@ namespace {
return std::string(s);
}
static void print(std::ostream &out, const Detection& d)
{
#if defined SHOW_DETECTIONS
out << "\x1b[32m[ detection]\x1b[0m ("
<< std::setw(4) << d.x
<< " "
<< std::setw(4) << d.y
<< ") ("
<< std::setw(4) << d.w
<< " "
<< std::setw(4) << d.h
<< ") "
<< std::setw(12) << d.confidence
<< std::endl;
#else
(void)out; (void)d;
#endif
}
static void printTotal(std::ostream &out, int detbytes)
{
#if defined SHOW_DETECTIONS
out << "\x1b[32m[ ]\x1b[0m Total detections " << (detbytes / sizeof(Detection)) << std::endl;
#else
(void)out; (void)detbytes;
#endif
}
#if defined SHOW_DETECTIONS
static std::string getImageName(int level)
{
time_t rawtime;
@ -112,32 +141,13 @@ namespace {
return "gpu_rec_level_" + itoa(level)+ "_" + std::string(buffer) + ".png";
}
static void print(std::ostream &out, const Detection& d)
{
out << "\x1b[32m[ detection]\x1b[0m ("
<< std::setw(4) << d.x
<< " "
<< std::setw(4) << d.y
<< ") ("
<< std::setw(4) << d.w
<< " "
<< std::setw(4) << d.h
<< ") "
<< std::setw(12) << d.confidence
<< std::endl;
}
static void printTotal(std::ostream &out, int detbytes)
{
out << "\x1b[32m[ ]\x1b[0m Total detections " << (detbytes / sizeof(Detection)) << std::endl;
}
static void writeResult(const cv::Mat& result, const int level)
{
std::string path = cv::tempfile(getImageName(level).c_str());
cv::imwrite(path, result);
std::cout << "\x1b[32m" << "[ ]" << std::endl << "[ stored in]"<< "\x1b[0m" << path << std::endl;
}
#endif
}
typedef ::testing::TestWithParam<std::tr1::tuple<cv::gpu::DeviceInfo, std::string, std::string, int> > SCascadeTestRoi;