mirror of
https://github.com/opencv/opencv.git
synced 2025-01-18 06:03:15 +08:00
fixed block size calculation in SURF_GPU (fasthessian_gpu and nonmaxonly_gpu kernels)
This commit is contained in:
parent
d03b89f163
commit
924670d32c
@ -237,20 +237,31 @@ namespace cv { namespace gpu { namespace surf
|
|||||||
|
|
||||||
hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;
|
hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size)
|
dim3 calcBlockSize(int nIntervals)
|
||||||
{
|
{
|
||||||
dim3 threads;
|
int threadsPerBlock = 512;
|
||||||
threads.x = 16;
|
|
||||||
threads.y = 8;
|
dim3 threads;
|
||||||
threads.z = nIntervals;
|
threads.z = nIntervals;
|
||||||
|
threadsPerBlock /= nIntervals;
|
||||||
|
if (threadsPerBlock >= 48)
|
||||||
|
threads.x = 16;
|
||||||
|
else
|
||||||
|
threads.x = 8;
|
||||||
|
threadsPerBlock /= threads.x;
|
||||||
|
threads.y = threadsPerBlock;
|
||||||
|
|
||||||
|
return threads;
|
||||||
|
}
|
||||||
|
|
||||||
|
void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads)
|
||||||
|
{
|
||||||
dim3 grid;
|
dim3 grid;
|
||||||
grid.x = divUp(x_size, threads.x);
|
grid.x = divUp(x_size, threads.x);
|
||||||
grid.y = divUp(y_size, threads.y);
|
grid.y = divUp(y_size, threads.y);
|
||||||
grid.z = 1;
|
|
||||||
|
|
||||||
fasthessian<<<grid, threads>>>(hessianBuffer);
|
fasthessian<<<grid, threads>>>(hessianBuffer);
|
||||||
|
|
||||||
cudaSafeCall( cudaThreadSynchronize() );
|
cudaSafeCall( cudaThreadSynchronize() );
|
||||||
@ -370,17 +381,11 @@ namespace cv { namespace gpu { namespace surf
|
|||||||
}
|
}
|
||||||
|
|
||||||
void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter,
|
void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter,
|
||||||
int nIntervals, int x_size, int y_size, bool use_mask)
|
int x_size, int y_size, bool use_mask, const dim3& threads)
|
||||||
{
|
{
|
||||||
dim3 threads;
|
|
||||||
threads.x = 16;
|
|
||||||
threads.y = 8;
|
|
||||||
threads.z = nIntervals;
|
|
||||||
|
|
||||||
dim3 grid;
|
dim3 grid;
|
||||||
grid.x = divUp(x_size, threads.x - 2);
|
grid.x = divUp(x_size, threads.x - 2);
|
||||||
grid.y = divUp(y_size, threads.y - 2);
|
grid.y = divUp(y_size, threads.y - 2);
|
||||||
grid.z = 1;
|
|
||||||
|
|
||||||
const size_t smem_size = threads.x * threads.y * threads.z * sizeof(float);
|
const size_t smem_size = threads.x * threads.y * threads.z * sizeof(float);
|
||||||
|
|
||||||
@ -565,8 +570,6 @@ namespace cv { namespace gpu { namespace surf
|
|||||||
|
|
||||||
dim3 grid;
|
dim3 grid;
|
||||||
grid.x = maxCounter;
|
grid.x = maxCounter;
|
||||||
grid.y = 1;
|
|
||||||
grid.z = 1;
|
|
||||||
|
|
||||||
DeviceReference<unsigned int> featureCounterWrapper(featureCounter);
|
DeviceReference<unsigned int> featureCounterWrapper(featureCounter);
|
||||||
|
|
||||||
@ -624,6 +627,7 @@ namespace cv { namespace gpu { namespace surf
|
|||||||
// - SURF says to only use a circle, but the branching logic would slow it down
|
// - SURF says to only use a circle, but the branching logic would slow it down
|
||||||
// - Gaussian weighting should reduce the effects of the outer points anyway
|
// - Gaussian weighting should reduce the effects of the outer points anyway
|
||||||
if (tid2 < 169)
|
if (tid2 < 169)
|
||||||
|
|
||||||
{
|
{
|
||||||
dx -= texLookups[threadIdx.x ][threadIdx.y ];
|
dx -= texLookups[threadIdx.x ][threadIdx.y ];
|
||||||
dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y ];
|
dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y ];
|
||||||
@ -709,8 +713,6 @@ namespace cv { namespace gpu { namespace surf
|
|||||||
|
|
||||||
dim3 grid;
|
dim3 grid;
|
||||||
grid.x = nFeatures;
|
grid.x = nFeatures;
|
||||||
grid.y = 1;
|
|
||||||
grid.z = 1;
|
|
||||||
|
|
||||||
find_orientation<<<grid, threads>>>(features);
|
find_orientation<<<grid, threads>>>(features);
|
||||||
cudaSafeCall( cudaThreadSynchronize() );
|
cudaSafeCall( cudaThreadSynchronize() );
|
||||||
|
@ -61,11 +61,13 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint
|
|||||||
#else /* !defined (HAVE_CUDA) */
|
#else /* !defined (HAVE_CUDA) */
|
||||||
|
|
||||||
namespace cv { namespace gpu { namespace surf
|
namespace cv { namespace gpu { namespace surf
|
||||||
{
|
{
|
||||||
void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size);
|
dim3 calcBlockSize(int nIntervals);
|
||||||
|
|
||||||
|
void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);
|
||||||
|
|
||||||
void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter,
|
void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter,
|
||||||
int nIntervals, int x_size, int y_size, bool use_mask);
|
int x_size, int y_size, bool use_mask, const dim3& threads);
|
||||||
|
|
||||||
void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter,
|
void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter,
|
||||||
KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter);
|
KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter);
|
||||||
@ -103,7 +105,7 @@ namespace
|
|||||||
{
|
{
|
||||||
CV_Assert(!img.empty() && img.type() == CV_8UC1);
|
CV_Assert(!img.empty() && img.type() == CV_8UC1);
|
||||||
CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
|
CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
|
||||||
CV_Assert(nOctaves > 0 && nIntervals > 2);
|
CV_Assert(nOctaves > 0 && nIntervals > 2 && nIntervals < 22);
|
||||||
CV_Assert(DeviceInfo().has(ATOMICS));
|
CV_Assert(DeviceInfo().has(ATOMICS));
|
||||||
|
|
||||||
max_features = static_cast<int>(img.size().area() * featuresRatio);
|
max_features = static_cast<int>(img.size().area() * featuresRatio);
|
||||||
@ -168,6 +170,7 @@ namespace
|
|||||||
|
|
||||||
void detectKeypoints(GpuMat& keypoints)
|
void detectKeypoints(GpuMat& keypoints)
|
||||||
{
|
{
|
||||||
|
dim3 threads = calcBlockSize(nIntervals);
|
||||||
for(int octave = 0; octave < nOctaves; ++octave)
|
for(int octave = 0; octave < nOctaves; ++octave)
|
||||||
{
|
{
|
||||||
int step = initialStep * (1 << octave);
|
int step = initialStep * (1 << octave);
|
||||||
@ -189,12 +192,12 @@ namespace
|
|||||||
uploadConstant("cv::gpu::surf::c_border", border);
|
uploadConstant("cv::gpu::surf::c_border", border);
|
||||||
uploadConstant("cv::gpu::surf::c_step", step);
|
uploadConstant("cv::gpu::surf::c_step", step);
|
||||||
|
|
||||||
fasthessian_gpu(hessianBuffer, nIntervals, x_size, y_size);
|
fasthessian_gpu(hessianBuffer, x_size, y_size, threads);
|
||||||
|
|
||||||
// Reset the candidate count.
|
// Reset the candidate count.
|
||||||
maxCounter = 0;
|
maxCounter = 0;
|
||||||
|
|
||||||
nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, nIntervals, x_size, y_size, use_mask);
|
nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, x_size, y_size, use_mask, threads);
|
||||||
|
|
||||||
maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));
|
maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));
|
||||||
|
|
||||||
|
@ -38,6 +38,9 @@ int main(int argc, char* argv[])
|
|||||||
GpuMat descriptors1GPU, descriptors2GPU;
|
GpuMat descriptors1GPU, descriptors2GPU;
|
||||||
surf(img1, GpuMat(), keypoints1GPU, descriptors1GPU);
|
surf(img1, GpuMat(), keypoints1GPU, descriptors1GPU);
|
||||||
surf(img2, GpuMat(), keypoints2GPU, descriptors2GPU);
|
surf(img2, GpuMat(), keypoints2GPU, descriptors2GPU);
|
||||||
|
|
||||||
|
cout << "FOUND " << keypoints1GPU.cols << " keypoints on first image" << endl;
|
||||||
|
cout << "FOUND " << keypoints2GPU.cols << " keypoints on second image" << endl;
|
||||||
|
|
||||||
// matching descriptors
|
// matching descriptors
|
||||||
BruteForceMatcher_GPU< L2<float> > matcher;
|
BruteForceMatcher_GPU< L2<float> > matcher;
|
||||||
@ -57,6 +60,8 @@ int main(int argc, char* argv[])
|
|||||||
// drawing the results
|
// drawing the results
|
||||||
Mat img_matches;
|
Mat img_matches;
|
||||||
drawMatches(img1, keypoints1, img2, keypoints2, matches, img_matches);
|
drawMatches(img1, keypoints1, img2, keypoints2, matches, img_matches);
|
||||||
|
|
||||||
|
namedWindow("matches", 0);
|
||||||
imshow("matches", img_matches);
|
imshow("matches", img_matches);
|
||||||
waitKey(0);
|
waitKey(0);
|
||||||
|
|
||||||
|
@ -149,12 +149,14 @@ void CV_GPU_SURFTest::compareKeypointSets(const vector<KeyPoint>& validKeypoints
|
|||||||
assert(minDist >= 0);
|
assert(minDist >= 0);
|
||||||
if (!isSimilarKeypoints(validKeypoints[v], calcKeypoints[nearestIdx]))
|
if (!isSimilarKeypoints(validKeypoints[v], calcKeypoints[nearestIdx]))
|
||||||
{
|
{
|
||||||
|
ts->printf(CvTS::LOG, "Bad keypoints accuracy.\n");
|
||||||
ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );
|
ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.0f)
|
if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.0f)
|
||||||
{
|
{
|
||||||
|
ts->printf(CvTS::LOG, "Bad descriptors accuracy.\n");
|
||||||
ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );
|
ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user