fixed block size calculation in SURF_GPU (fasthessian_gpu and nonmaxonly_gpu kernels)

This commit is contained in:
Vladislav Vinogradov 2011-02-09 09:11:11 +00:00
parent d03b89f163
commit 924670d32c
4 changed files with 37 additions and 25 deletions

View File

@ -239,17 +239,28 @@ namespace cv { namespace gpu { namespace surf
} }
} }
void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size) dim3 calcBlockSize(int nIntervals)
{ {
dim3 threads; int threadsPerBlock = 512;
threads.x = 16;
threads.y = 8;
threads.z = nIntervals;
dim3 threads;
threads.z = nIntervals;
threadsPerBlock /= nIntervals;
if (threadsPerBlock >= 48)
threads.x = 16;
else
threads.x = 8;
threadsPerBlock /= threads.x;
threads.y = threadsPerBlock;
return threads;
}
void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads)
{
dim3 grid; dim3 grid;
grid.x = divUp(x_size, threads.x); grid.x = divUp(x_size, threads.x);
grid.y = divUp(y_size, threads.y); grid.y = divUp(y_size, threads.y);
grid.z = 1;
fasthessian<<<grid, threads>>>(hessianBuffer); fasthessian<<<grid, threads>>>(hessianBuffer);
@ -370,17 +381,11 @@ namespace cv { namespace gpu { namespace surf
} }
void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter,
int nIntervals, int x_size, int y_size, bool use_mask) int x_size, int y_size, bool use_mask, const dim3& threads)
{ {
dim3 threads;
threads.x = 16;
threads.y = 8;
threads.z = nIntervals;
dim3 grid; dim3 grid;
grid.x = divUp(x_size, threads.x - 2); grid.x = divUp(x_size, threads.x - 2);
grid.y = divUp(y_size, threads.y - 2); grid.y = divUp(y_size, threads.y - 2);
grid.z = 1;
const size_t smem_size = threads.x * threads.y * threads.z * sizeof(float); const size_t smem_size = threads.x * threads.y * threads.z * sizeof(float);
@ -565,8 +570,6 @@ namespace cv { namespace gpu { namespace surf
dim3 grid; dim3 grid;
grid.x = maxCounter; grid.x = maxCounter;
grid.y = 1;
grid.z = 1;
DeviceReference<unsigned int> featureCounterWrapper(featureCounter); DeviceReference<unsigned int> featureCounterWrapper(featureCounter);
@ -624,6 +627,7 @@ namespace cv { namespace gpu { namespace surf
// - SURF says to only use a circle, but the branching logic would slow it down // - SURF says to only use a circle, but the branching logic would slow it down
// - Gaussian weighting should reduce the effects of the outer points anyway // - Gaussian weighting should reduce the effects of the outer points anyway
if (tid2 < 169) if (tid2 < 169)
{ {
dx -= texLookups[threadIdx.x ][threadIdx.y ]; dx -= texLookups[threadIdx.x ][threadIdx.y ];
dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y ]; dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y ];
@ -709,8 +713,6 @@ namespace cv { namespace gpu { namespace surf
dim3 grid; dim3 grid;
grid.x = nFeatures; grid.x = nFeatures;
grid.y = 1;
grid.z = 1;
find_orientation<<<grid, threads>>>(features); find_orientation<<<grid, threads>>>(features);
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );

View File

@ -62,10 +62,12 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint
namespace cv { namespace gpu { namespace surf namespace cv { namespace gpu { namespace surf
{ {
void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size); dim3 calcBlockSize(int nIntervals);
void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);
void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter,
int nIntervals, int x_size, int y_size, bool use_mask); int x_size, int y_size, bool use_mask, const dim3& threads);
void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter, void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter,
KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter); KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter);
@ -103,7 +105,7 @@ namespace
{ {
CV_Assert(!img.empty() && img.type() == CV_8UC1); CV_Assert(!img.empty() && img.type() == CV_8UC1);
CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1)); CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
CV_Assert(nOctaves > 0 && nIntervals > 2); CV_Assert(nOctaves > 0 && nIntervals > 2 && nIntervals < 22);
CV_Assert(DeviceInfo().has(ATOMICS)); CV_Assert(DeviceInfo().has(ATOMICS));
max_features = static_cast<int>(img.size().area() * featuresRatio); max_features = static_cast<int>(img.size().area() * featuresRatio);
@ -168,6 +170,7 @@ namespace
void detectKeypoints(GpuMat& keypoints) void detectKeypoints(GpuMat& keypoints)
{ {
dim3 threads = calcBlockSize(nIntervals);
for(int octave = 0; octave < nOctaves; ++octave) for(int octave = 0; octave < nOctaves; ++octave)
{ {
int step = initialStep * (1 << octave); int step = initialStep * (1 << octave);
@ -189,12 +192,12 @@ namespace
uploadConstant("cv::gpu::surf::c_border", border); uploadConstant("cv::gpu::surf::c_border", border);
uploadConstant("cv::gpu::surf::c_step", step); uploadConstant("cv::gpu::surf::c_step", step);
fasthessian_gpu(hessianBuffer, nIntervals, x_size, y_size); fasthessian_gpu(hessianBuffer, x_size, y_size, threads);
// Reset the candidate count. // Reset the candidate count.
maxCounter = 0; maxCounter = 0;
nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, nIntervals, x_size, y_size, use_mask); nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, x_size, y_size, use_mask, threads);
maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates)); maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));

View File

@ -39,6 +39,9 @@ int main(int argc, char* argv[])
surf(img1, GpuMat(), keypoints1GPU, descriptors1GPU); surf(img1, GpuMat(), keypoints1GPU, descriptors1GPU);
surf(img2, GpuMat(), keypoints2GPU, descriptors2GPU); surf(img2, GpuMat(), keypoints2GPU, descriptors2GPU);
cout << "FOUND " << keypoints1GPU.cols << " keypoints on first image" << endl;
cout << "FOUND " << keypoints2GPU.cols << " keypoints on second image" << endl;
// matching descriptors // matching descriptors
BruteForceMatcher_GPU< L2<float> > matcher; BruteForceMatcher_GPU< L2<float> > matcher;
GpuMat trainIdx, distance; GpuMat trainIdx, distance;
@ -57,6 +60,8 @@ int main(int argc, char* argv[])
// drawing the results // drawing the results
Mat img_matches; Mat img_matches;
drawMatches(img1, keypoints1, img2, keypoints2, matches, img_matches); drawMatches(img1, keypoints1, img2, keypoints2, matches, img_matches);
namedWindow("matches", 0);
imshow("matches", img_matches); imshow("matches", img_matches);
waitKey(0); waitKey(0);

View File

@ -149,12 +149,14 @@ void CV_GPU_SURFTest::compareKeypointSets(const vector<KeyPoint>& validKeypoints
assert(minDist >= 0); assert(minDist >= 0);
if (!isSimilarKeypoints(validKeypoints[v], calcKeypoints[nearestIdx])) if (!isSimilarKeypoints(validKeypoints[v], calcKeypoints[nearestIdx]))
{ {
ts->printf(CvTS::LOG, "Bad keypoints accuracy.\n");
ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY ); ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );
return; return;
} }
if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.0f) if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.0f)
{ {
ts->printf(CvTS::LOG, "Bad descriptors accuracy.\n");
ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY ); ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );
return; return;
} }