added first version of gpu::countNonZero for all data types, it doesn't support compute capability 1.0 yet, also fixed some little bugs

This commit is contained in:
Alexey Spizhevoy 2010-11-26 17:12:48 +00:00
parent e470246ab5
commit 7e2cc1be1b
4 changed files with 237 additions and 9 deletions

View File

@ -434,6 +434,11 @@ namespace cv
CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
GpuMat& valbuf, GpuMat& locbuf);
//! counts non-zero array elements
CV_EXPORTS int countNonZero(const GpuMat& src);
//! counts non-zero array elements
CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);
//! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
//! destination array will have the depth type as lut and the same channels number as source

View File

@ -69,6 +69,8 @@ void cv::gpu::minMax(const GpuMat&, double*, double*) { throw_nogpu(); }
void cv::gpu::minMax(const GpuMat&, double*, double*, GpuMat&) { throw_nogpu(); }
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*) { throw_nogpu(); }
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, GpuMat&, GpuMat&) { throw_nogpu(); }
int cv::gpu::countNonZero(const GpuMat&) { throw_nogpu(); return 0; }
int cv::gpu::countNonZero(const GpuMat&, GpuMat&) { throw_nogpu(); return 0; }
void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::exp(const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::log(const GpuMat&, GpuMat&) { throw_nogpu(); }
@ -527,7 +529,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, GpuMat&
int major, minor;
getComputeCapability(getDevice(), major, minor);
if (major >= 1 && minor >= 1)
if (major > 1 || (major == 1 && minor >= 1))
{
switch (src_.type())
{
@ -538,7 +540,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, GpuMat&
case CV_32S: min_max_caller<int>(src_, minVal, maxVal, buf); break;
case CV_32F: min_max_caller<float>(src_, minVal, maxVal, buf); break;
case CV_64F: min_max_caller<double>(src_, minVal, maxVal, buf); break;
default: CV_Error(CV_StsBadArg, "Unsupported type");
default: CV_Error(CV_StsBadArg, "minMax: unsupported type");
}
}
else
@ -551,7 +553,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, GpuMat&
case CV_16S: min_max_caller_2steps<signed short>(src_, minVal, maxVal, buf); break;
case CV_32S: min_max_caller_2steps<int>(src_, minVal, maxVal, buf); break;
case CV_32F: min_max_caller_2steps<float>(src_, minVal, maxVal, buf); break;
default: CV_Error(CV_StsBadArg, "Unsupported type");
default: CV_Error(CV_StsBadArg, "minMax: unsupported type");
}
}
}
@ -601,7 +603,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
int major, minor;
getComputeCapability(getDevice(), major, minor);
if (major >= 1 && minor >= 1)
if (major > 1 || (major == 1 && minor >= 1))
{
switch (src.type())
{
@ -612,7 +614,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
case CV_32S: min_max_loc_caller<int>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
case CV_32F: min_max_loc_caller<float>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
case CV_64F: min_max_loc_caller<double>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
default: CV_Error(CV_StsBadArg, "Unsupported type");
default: CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
}
}
else
@ -625,7 +627,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
case CV_16S: min_max_loc_caller_2steps<signed short>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
case CV_32S: min_max_loc_caller_2steps<int>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
case CV_32F: min_max_loc_caller_2steps<float>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
default: CV_Error(CV_StsBadArg, "Unsupported type");
default: CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
}
}
@ -633,6 +635,51 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
if (maxLoc) { maxLoc->x = maxLoc_[0]; maxLoc->y = maxLoc_[1]; }
}
////////////////////////////////////////////////////////////////////////
// Count non zero
namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {
void get_buf_size_required(int& cols, int& rows);
template <typename T>
int count_non_zero_caller(const DevMem2D src, PtrStep buf);
template <typename T>
int count_non_zero_caller_2steps(const DevMem2D src, PtrStep buf);
}}}}
int cv::gpu::countNonZero(const GpuMat& src)
{
GpuMat buf;
return countNonZero(src, buf);
}
int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
{
using namespace mathfunc::countnonzero;
CV_Assert(src.channels() == 1);
Size buf_size;
get_buf_size_required(buf_size.width, buf_size.height);
buf.create(buf_size, CV_8U);
switch (src.type())
{
case CV_8U: return count_non_zero_caller<unsigned char>(src, buf);
case CV_8S: return count_non_zero_caller<signed char>(src, buf);
case CV_16U: return count_non_zero_caller<unsigned short>(src, buf);
case CV_16S: return count_non_zero_caller<signed short>(src, buf);
case CV_32S: return count_non_zero_caller<int>(src, buf);
case CV_32F: return count_non_zero_caller<float>(src, buf);
case CV_64F: return count_non_zero_caller<double>(src, buf);
}
CV_Error(CV_StsBadArg, "countNonZero: unsupported type");
return 0;
}
////////////////////////////////////////////////////////////////////////
// LUT

View File

@ -615,6 +615,8 @@ namespace cv { namespace gpu { namespace mathfunc
} // namespace minmax
///////////////////////////////////////////////////////////////////////////////
// minMaxLoc
namespace minmaxloc {
@ -868,4 +870,126 @@ namespace cv { namespace gpu { namespace mathfunc
} // namespace minmaxloc
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// countNonZero
namespace countnonzero
{
__constant__ int ctwidth;
__constant__ int ctheight;
static const unsigned int czero = 0;
__device__ unsigned int blocks_finished;
void estimate_thread_cfg(dim3& threads, dim3& grid)
{
threads = dim3(64, 4);
grid = dim3(6, 5);
}
void get_buf_size_required(int& cols, int& rows)
{
dim3 threads, grid;
estimate_thread_cfg(threads, grid);
cols = grid.x * grid.y * sizeof(int);
rows = 1;
}
void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
}
template <int nthreads, typename T>
__global__ void count_non_zero_kernel(const DevMem2D src, volatile unsigned int* count)
{
__shared__ unsigned int scount[nthreads];
unsigned int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
unsigned int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
unsigned int cnt = 0;
for (unsigned int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
for (unsigned int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
cnt += ptr[x0 + x * blockDim.x] != 0;
}
scount[tid] = cnt;
__syncthreads();
for (unsigned int step = nthreads / 2; step > 0; step >>= 1)
{
if (tid < step) scount[tid] += scount[tid + step];
__syncthreads();
}
__shared__ bool is_last;
if (tid == 0)
{
count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
__threadfence();
unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
__syncthreads();
if (is_last)
{
scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;
for (unsigned int step = nthreads / 2; step > 0; step >>= 1)
{
if (tid < step) scount[tid] += scount[tid + step];
__syncthreads();
}
if (tid == 0) count[0] = scount[0];
}
}
template <typename T>
int count_non_zero_caller(const DevMem2D src, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid);
unsigned int* count_buf = (unsigned int*)buf.ptr(0);
cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
cudaSafeCall(cudaThreadSynchronize());
unsigned int count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
return count;
}
template int count_non_zero_caller<unsigned char>(const DevMem2D, PtrStep);
template int count_non_zero_caller<signed char>(const DevMem2D, PtrStep);
template int count_non_zero_caller<unsigned short>(const DevMem2D, PtrStep);
template int count_non_zero_caller<signed short>(const DevMem2D, PtrStep);
template int count_non_zero_caller<int>(const DevMem2D, PtrStep);
template int count_non_zero_caller<float>(const DevMem2D, PtrStep);
template int count_non_zero_caller<double>(const DevMem2D, PtrStep);
} // namespace countnonzero
}}}

View File

@ -689,9 +689,7 @@ struct CV_GpuMinMaxTest: public CvTest
for (int cn = 1; cn <= 4; ++cn)
for (int depth = CV_8U; depth <= depth_end; ++depth)
{
int rows = 1, cols = 3;
test(rows, cols, cn, depth);
for (int i = 0; i < 4; ++i)
for (int i = 0; i < 1; ++i)
{
int rows = 1 + rand() % 1000;
int cols = 1 + rand() % 1000;
@ -821,6 +819,59 @@ struct CV_GpuMinMaxLocTest: public CvTest
}
};
////////////////////////////////////////////////////////////////////////////
// Count non zero
struct CV_GpuCountNonZeroTest: CvTest
{
CV_GpuCountNonZeroTest(): CvTest("GPU-CountNonZeroTest", "countNonZero") {}
void run(int)
{
srand(0);
int depth_end;
int major, minor;
cv::gpu::getComputeCapability(getDevice(), major, minor);
if (minor >= 1) depth_end = CV_64F; else depth_end = CV_32F;
for (int depth = CV_8U; depth <= depth_end; ++depth)
{
for (int i = 0; i < 4; ++i)
{
int rows = 1 + rand() % 1000;
int cols = 1 + rand() % 1000;
test(rows, cols, depth);
}
}
}
void test(int rows, int cols, int depth)
{
cv::Mat src(rows, cols, depth);
cv::RNG rng;
if (depth == 5)
rng.fill(src, RNG::UNIFORM, Scalar(-1000.f), Scalar(1000.f));
else if (depth == 6)
rng.fill(src, RNG::UNIFORM, Scalar(-1000.), Scalar(1000.));
else
for (int i = 0; i < src.rows; ++i)
{
Mat row(1, src.cols * src.elemSize(), CV_8U, src.ptr(i));
rng.fill(row, RNG::UNIFORM, Scalar(0), Scalar(255));
}
int n_gold = cv::countNonZero(src);
int n = cv::gpu::countNonZero(cv::gpu::GpuMat(src));
if (n != n_gold)
{
ts->printf(CvTS::CONSOLE, "%d %d %d %d %d\n", n, n_gold, depth, cols, rows);
n_gold = cv::countNonZero(src);
}
CHECK(n == n_gold, CvTS::FAIL_INVALID_OUTPUT);
}
};
/////////////////////////////////////////////////////////////////////////////
/////////////////// tests registration /////////////////////////////////////
@ -850,3 +901,4 @@ CV_GpuNppImageCartToPolarTest CV_GpuNppImageCartToPolar_test;
CV_GpuNppImagePolarToCartTest CV_GpuNppImagePolarToCart_test;
CV_GpuMinMaxTest CV_GpuMinMaxTest_test;
CV_GpuMinMaxLocTest CV_GpuMinMaxLocTest_test;
CV_GpuCountNonZeroTest CV_CountNonZero_test;