fixed block size calculation in SURF_GPU (fasthessian_gpu and nonmaxonly_gpu kernels)

2025-01-18 06:03:15 +08:00 · 2011-02-09 09:11:11 +00:00 · 2011-02-09 09:11:11 +00:00 · 924670d32c
commit 924670d32c
parent d03b89f163
4 changed files with 37 additions and 25 deletions
--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@ -239,17 +239,28 @@ namespace cv { namespace gpu { namespace surf
        }
    }
-    void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size)
+    dim3 calcBlockSize(int nIntervals)
    {
-        dim3 threads;
+        int threadsPerBlock = 512;
        threads.x = 16;
        threads.y = 8;
        threads.z = nIntervals;
        dim3 threads;        
        threads.z = nIntervals;
        threadsPerBlock /= nIntervals;
        if (threadsPerBlock >= 48)
            threads.x = 16;
        else
            threads.x = 8;
        threadsPerBlock /= threads.x;
        threads.y = threadsPerBlock;
        return threads;
    }
    void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads)
    {
        dim3 grid;
        grid.x = divUp(x_size, threads.x);
        grid.y = divUp(y_size, threads.y);
        grid.z = 1;
  	    fasthessian<<<grid, threads>>>(hessianBuffer);
@ -370,17 +381,11 @@ namespace cv { namespace gpu { namespace surf
    }
    void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, 
-        int nIntervals, int x_size, int y_size, bool use_mask)
+        int x_size, int y_size, bool use_mask, const dim3& threads)
    {
        dim3 threads;
        threads.x = 16;
        threads.y = 8;
        threads.z = nIntervals;
        dim3 grid;
        grid.x = divUp(x_size, threads.x - 2);
        grid.y = divUp(y_size, threads.y - 2);
        grid.z = 1;
        const size_t smem_size = threads.x * threads.y * threads.z * sizeof(float);
@ -565,8 +570,6 @@ namespace cv { namespace gpu { namespace surf
        dim3 grid;
        grid.x = maxCounter;
        grid.y = 1; 
        grid.z = 1;
        DeviceReference<unsigned int> featureCounterWrapper(featureCounter);
@ -624,6 +627,7 @@ namespace cv { namespace gpu { namespace surf
 	    // - SURF says to only use a circle, but the branching logic would slow it down
 	    // - Gaussian weighting should reduce the effects of the outer points anyway
        if (tid2 < 169)
        {
 	        dx -=     texLookups[threadIdx.x    ][threadIdx.y    ];
 	        dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y    ];
@ -709,8 +713,6 @@ namespace cv { namespace gpu { namespace surf
        dim3 grid;
        grid.x = nFeatures;
        grid.y = 1;
        grid.z = 1;
        find_orientation<<<grid, threads>>>(features);
        cudaSafeCall( cudaThreadSynchronize() );
--- a/modules/gpu/src/surf.cpp
+++ b/modules/gpu/src/surf.cpp
@ -62,10 +62,12 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint
 namespace cv { namespace gpu { namespace surf
 {
-    void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size);
+    dim3 calcBlockSize(int nIntervals);
    void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);
    void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, 
-        int nIntervals, int x_size, int y_size, bool use_mask);
+        int x_size, int y_size, bool use_mask, const dim3& threads);
    void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter, 
        KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter);
@ -103,7 +105,7 @@ namespace
        {
            CV_Assert(!img.empty() && img.type() == CV_8UC1);
            CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
-            CV_Assert(nOctaves > 0 && nIntervals > 2);
+            CV_Assert(nOctaves > 0 && nIntervals > 2 && nIntervals < 22);
            CV_Assert(DeviceInfo().has(ATOMICS));
            max_features = static_cast<int>(img.size().area() * featuresRatio);
@ -168,6 +170,7 @@ namespace
        void detectKeypoints(GpuMat& keypoints)
        {
            dim3 threads = calcBlockSize(nIntervals);
            for(int octave = 0; octave < nOctaves; ++octave)
            {
                int step = initialStep * (1 << octave);
@ -189,12 +192,12 @@ namespace
                uploadConstant("cv::gpu::surf::c_border", border);
                uploadConstant("cv::gpu::surf::c_step",   step);
-                fasthessian_gpu(hessianBuffer, nIntervals, x_size, y_size);
+                fasthessian_gpu(hessianBuffer, x_size, y_size, threads);
                // Reset the candidate count.
                maxCounter = 0;
-                nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, nIntervals, x_size, y_size, use_mask); 
+                nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, x_size, y_size, use_mask, threads); 
                maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));
--- a/samples/gpu/surf_keypoint_matcher.cpp
+++ b/samples/gpu/surf_keypoint_matcher.cpp
@ -39,6 +39,9 @@ int main(int argc, char* argv[])
    surf(img1, GpuMat(), keypoints1GPU, descriptors1GPU);
    surf(img2, GpuMat(), keypoints2GPU, descriptors2GPU);
    cout << "FOUND " << keypoints1GPU.cols << " keypoints on first image" << endl;
    cout << "FOUND " << keypoints2GPU.cols << " keypoints on second image" << endl;
    // matching descriptors
    BruteForceMatcher_GPU< L2<float> > matcher;
    GpuMat trainIdx, distance;
@ -57,6 +60,8 @@ int main(int argc, char* argv[])
    // drawing the results
    Mat img_matches;
    drawMatches(img1, keypoints1, img2, keypoints2, matches, img_matches);
    namedWindow("matches", 0);
    imshow("matches", img_matches);
    waitKey(0);
--- a/tests/gpu/src/features2d.cpp
+++ b/tests/gpu/src/features2d.cpp
@ -149,12 +149,14 @@ void CV_GPU_SURFTest::compareKeypointSets(const vector<KeyPoint>& validKeypoints
        assert(minDist >= 0);
        if (!isSimilarKeypoints(validKeypoints[v], calcKeypoints[nearestIdx]))
        {
            ts->printf(CvTS::LOG, "Bad keypoints accuracy.\n");
            ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );
            return;
        }
        if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.0f)
        {
            ts->printf(CvTS::LOG, "Bad descriptors accuracy.\n");
            ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );
            return;
        }