mirror of
https://github.com/opencv/opencv.git
synced 2024-11-25 03:30:34 +08:00
added gpu::sqrSum function
This commit is contained in:
parent
b18a3a5f83
commit
c9f9f38777
@ -428,6 +428,14 @@ namespace cv
|
||||
//! supports only single channel images
|
||||
CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);
|
||||
|
||||
//! computes the sum of the squared array elements
|
||||
//! supports only single channel images
|
||||
CV_EXPORTS Scalar sqrSum(const GpuMat& src);
|
||||
|
||||
//! computes the sum of the squared array elements (buf is used as temporary storage)
|
||||
//! supports only single channel images
|
||||
CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf);
|
||||
|
||||
//! finds global minimum and maximum array elements and returns their values
|
||||
CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());
|
||||
|
||||
|
@ -66,6 +66,8 @@ double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return
|
||||
void cv::gpu::flip(const GpuMat&, GpuMat&, int) { throw_nogpu(); }
|
||||
Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); }
|
||||
Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
|
||||
Scalar cv::gpu::sqrSum(const GpuMat&) { throw_nogpu(); return Scalar(); }
|
||||
Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
|
||||
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_nogpu(); }
|
||||
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_nogpu(); }
|
||||
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_nogpu(); }
|
||||
@ -489,6 +491,12 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
template <typename T>
|
||||
void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum);
|
||||
|
||||
template <typename T>
|
||||
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum);
|
||||
|
||||
template <typename T>
|
||||
void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum);
|
||||
|
||||
namespace sum
|
||||
{
|
||||
void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);
|
||||
@ -527,6 +535,38 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
|
||||
return result;
|
||||
}
|
||||
|
||||
// Convenience overload: computes the sum of the squared elements of src
// using a scratch buffer that lives only for the duration of this call.
// Forwards to sqrSum(src, buf); the same restrictions on src apply.
Scalar cv::gpu::sqrSum(const GpuMat& src)
{
    GpuMat scratch;
    const Scalar total = sqrSum(src, scratch);
    return total;
}
|
||||
|
||||
// Computes the sum of the squared elements of a single-channel matrix.
//
//   src - input matrix; must have exactly one channel (asserted).
//   buf - scratch matrix; it is (re)created here as CV_8U with the size
//         reported by sum::get_buf_size_required() and handed to the device
//         caller.
//
// Returns the result as a Scalar built from a single double.
// Raises CV_StsBadArg when no implementation exists for the element type on
// the current device (the non-atomics table has no entry for the 7th type).
Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
{
    using namespace mathfunc;

    CV_Assert(src.channels() == 1);

    typedef void (*Caller)(const DevMem2D, PtrStep, double*);

    // First index: result of hasAtomicsSupport() for the current device.
    // Second index: src.type(), which for one channel selects the element type.
    static const Caller callers[2][7] =
    {
        {
            sqsum_multipass_caller<unsigned char>,
            sqsum_multipass_caller<char>,
            sqsum_multipass_caller<unsigned short>,
            sqsum_multipass_caller<short>,
            sqsum_multipass_caller<int>,
            sqsum_multipass_caller<float>,
            0
        },
        {
            sqsum_caller<unsigned char>,
            sqsum_caller<char>,
            sqsum_caller<unsigned short>,
            sqsum_caller<short>,
            sqsum_caller<int>,
            sqsum_caller<float>,
            sqsum_caller<double>
        }
    };

    // The scratch buffer is sized and allocated before the dispatch lookup.
    Size scratchSize;
    sum::get_buf_size_required(src.cols, src.rows, scratchSize.width, scratchSize.height);
    buf.create(scratchSize, CV_8U);

    const Caller* row = callers[hasAtomicsSupport(getDevice())];
    const Caller impl = row[src.type()];
    if (impl == 0)
        CV_Error(CV_StsBadArg, "sqrSum: unsupported type");

    double total;
    impl(src, buf, &total);
    return total;
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// minMax
|
||||
|
||||
|
@ -1428,6 +1428,12 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
template <> struct SumType<float> { typedef float R; };
|
||||
template <> struct SumType<double> { typedef double R; };
|
||||
|
||||
// Element operation that hands its argument back unchanged.
// Passed as the Op template argument of sum_kernel to get a plain sum.
template <typename R>
struct IdentityOp
{
    static __device__ R call(R value)
    {
        return value;
    }
};
|
||||
|
||||
// Element operation that returns the square of its argument.
// Passed as the Op template argument of sum_kernel to get a sum of squares.
// The multiplication is carried out in type R.
template <typename R>
struct SqrOp
{
    static __device__ R call(R value)
    {
        return value * value;
    }
};
|
||||
|
||||
__constant__ int ctwidth;
|
||||
__constant__ int ctheight;
|
||||
__device__ unsigned int blocks_finished = 0;
|
||||
@ -1462,7 +1468,7 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
|
||||
}
|
||||
|
||||
template <typename T, typename R, int nthreads>
|
||||
template <typename T, typename R, typename Op, int nthreads>
|
||||
__global__ void sum_kernel(const DevMem2D_<T> src, R* result)
|
||||
{
|
||||
__shared__ R smem[nthreads];
|
||||
@ -1477,7 +1483,7 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
{
|
||||
const T* ptr = src.ptr(y0 + y * blockDim.y);
|
||||
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
|
||||
sum += ptr[x0 + x * blockDim.x];
|
||||
sum += Op::call(ptr[x0 + x * blockDim.x]);
|
||||
}
|
||||
|
||||
smem[tid] = sum;
|
||||
@ -1548,9 +1554,8 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
|
||||
R* buf_ = (R*)buf.ptr(0);
|
||||
|
||||
sum_kernel<T, R, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
|
||||
sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
|
||||
buf_, grid.x * grid.y);
|
||||
sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
|
||||
sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(buf_, grid.x * grid.y);
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
|
||||
R result = 0;
|
||||
@ -1566,6 +1571,35 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
template void sum_multipass_caller<float>(const DevMem2D, PtrStep, double*);
|
||||
|
||||
|
||||
// Host-side launcher computing the sum of squared elements of `src` with two
// kernel launches.
//
//   src - input image, reinterpreted as DevMem2D_<T>.
//   buf - device scratch buffer; its first row is reinterpreted as an array
//         of R = SumType<T>::R and used by both kernels.
//   sum - host output; sum[0] receives the final value converted to double.
//
// The first launch runs sum_kernel with SqrOp<R> over the whole grid; the
// second runs sum_pass2_kernel in a single block, given the number of
// first-pass blocks (grid.x * grid.y).
// NOTE(review): presumably the second pass folds per-block partial results
// into buf_[0] - confirm against sum_pass2_kernel.
// The final value is then copied back from the first element of buf_.
//
// Each launch is followed by cudaGetLastError() so that a failed launch is
// reported at the launch site instead of going unnoticed.
template <typename T>
void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum)
{
    using namespace sum;
    typedef typename SumType<T>::R R;

    dim3 threads, grid;
    estimate_thread_cfg(src.cols, src.rows, threads, grid);
    set_kernel_consts(src.cols, src.rows, threads, grid);

    R* buf_ = (R*)buf.ptr(0);

    sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
    cudaSafeCall(cudaGetLastError());

    sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(buf_, grid.x * grid.y);
    cudaSafeCall(cudaGetLastError());

    // Wait for both kernels to finish before reading the result back.
    cudaSafeCall(cudaThreadSynchronize());

    R result = 0;
    cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost));
    sum[0] = result;
}

template void sqsum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*);
template void sqsum_multipass_caller<char>(const DevMem2D, PtrStep, double*);
template void sqsum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*);
template void sqsum_multipass_caller<short>(const DevMem2D, PtrStep, double*);
template void sqsum_multipass_caller<int>(const DevMem2D, PtrStep, double*);
template void sqsum_multipass_caller<float>(const DevMem2D, PtrStep, double*);
|
||||
|
||||
|
||||
template <typename T>
|
||||
void sum_caller(const DevMem2D src, PtrStep buf, double* sum)
|
||||
{
|
||||
@ -1578,7 +1612,7 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
|
||||
R* buf_ = (R*)buf.ptr(0);
|
||||
|
||||
sum_kernel<T, R, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
|
||||
sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
|
||||
R result = 0;
|
||||
@ -1593,5 +1627,34 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
template void sum_caller<int>(const DevMem2D, PtrStep, double*);
|
||||
template void sum_caller<float>(const DevMem2D, PtrStep, double*);
|
||||
template void sum_caller<double>(const DevMem2D, PtrStep, double*);
|
||||
|
||||
|
||||
// Host-side launcher computing the sum of squared elements of `src` with a
// single kernel launch.
//
//   src - input image, reinterpreted as DevMem2D_<T>.
//   buf - device scratch buffer; its first row is reinterpreted as an array
//         of R = SumType<T>::R and passed to the kernel.
//   sum - host output; sum[0] receives the final value converted to double.
//
// Runs sum_kernel with SqrOp<R>, then copies the first element of buf_ back
// to the host.
// NOTE(review): presumably sum_kernel completes the whole reduction itself on
// this path - confirm against the kernel body.
//
// The launch is followed by cudaGetLastError() so that a failed launch is
// reported at the launch site instead of going unnoticed.
template <typename T>
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum)
{
    using namespace sum;
    typedef typename SumType<T>::R R;

    dim3 threads, grid;
    estimate_thread_cfg(src.cols, src.rows, threads, grid);
    set_kernel_consts(src.cols, src.rows, threads, grid);

    R* buf_ = (R*)buf.ptr(0);

    sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
    cudaSafeCall(cudaGetLastError());

    // Wait for the kernel to finish before reading the result back.
    cudaSafeCall(cudaThreadSynchronize());

    R result = 0;
    cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost));
    sum[0] = result;
}

template void sqsum_caller<unsigned char>(const DevMem2D, PtrStep, double*);
template void sqsum_caller<char>(const DevMem2D, PtrStep, double*);
template void sqsum_caller<unsigned short>(const DevMem2D, PtrStep, double*);
template void sqsum_caller<short>(const DevMem2D, PtrStep, double*);
template void sqsum_caller<int>(const DevMem2D, PtrStep, double*);
template void sqsum_caller<float>(const DevMem2D, PtrStep, double*);
template void sqsum_caller<double>(const DevMem2D, PtrStep, double*);
|
||||
}}}
|
||||
|
||||
|
@ -940,7 +940,7 @@ struct CV_GpuSumTest: CvTest
|
||||
{
|
||||
Mat src;
|
||||
Scalar a, b;
|
||||
double max_err = 1e-6;
|
||||
double max_err = 1e-5;
|
||||
|
||||
int typemax = hasNativeDoubleSupport(getDevice()) ? CV_64F : CV_32F;
|
||||
for (int type = CV_8U; type <= typemax; ++type)
|
||||
@ -954,6 +954,19 @@ struct CV_GpuSumTest: CvTest
|
||||
ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
|
||||
return;
|
||||
}
|
||||
if (type != CV_8S)
|
||||
{
|
||||
b = sqrSum(GpuMat(src));
|
||||
Mat sqrsrc;
|
||||
multiply(src, src, sqrsrc);
|
||||
a = sum(sqrsrc);
|
||||
if (abs(a[0] - b[0]) > src.size().area() * max_err)
|
||||
{
|
||||
ts->printf(CvTS::CONSOLE, "type: %d, cols: %d, rows: %d, expected: %f, actual: %f\n", type, src.cols, src.rows, a[0], b[0]);
|
||||
ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (const Exception& e)
|
||||
@ -967,7 +980,7 @@ struct CV_GpuSumTest: CvTest
|
||||
{
|
||||
m.create(rows, cols, type);
|
||||
RNG rng;
|
||||
rng.fill(m, RNG::UNIFORM, Scalar::all(0), Scalar::all(20));
|
||||
rng.fill(m, RNG::UNIFORM, Scalar::all(0), Scalar::all(16));
|
||||
|
||||
}
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user