From 0cd587ee3493afd8ba203296fc172e4c570c0ff9 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Tue, 21 Dec 2010 14:02:09 +0000
Subject: [PATCH] Added GPU transpose and integral based on NPP Staging; added
 mask support to SURF_GPU.

---
 modules/gpu/include/opencv2/gpu/gpu.hpp |  28 +++++----
 modules/gpu/src/arithm.cpp              |  28 ++++++---
 modules/gpu/src/cuda/mathfunc.cu        |  38 ------------
 modules/gpu/src/cuda/surf.cu            |  43 ++++++++++++-
 modules/gpu/src/imgproc_gpu.cpp         |  21 +++++++
 modules/gpu/src/surf.cpp                |  82 ++++++++++++-----------
 tests/gpu/src/imgproc_gpu.cpp           |  25 ++------
 7 files changed, 147 insertions(+), 118 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 3e0c0de2c6..c1e35d72c9 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -364,7 +364,7 @@ namespace cv
     ////////////////////////////// Arithmetics ///////////////////////////////////
 
     //! transposes the matrix
-    //! supports CV_8UC1, CV_8SC1, CV_8UC4, CV_8SC4, CV_16UC2, CV_16SC2, CV_32SC1, CV_32FC1 type
+    //! supports matrices with an element size of 1, 4 or 8 bytes (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc.)
     CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst);
 
     //! reverses the order of the rows, columns or both in a matrix
@@ -594,6 +594,11 @@ namespace cv
     //! supports CV_8UC1, CV_8UC4, CV_32SC1 and CV_32FC1 types
     CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, const Scalar& value = Scalar());
 
+    //! computes the integral image
+    //! sum will have CV_32S type, but will contain unsigned int values
+    //! supports only CV_8UC1 source type
+    CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum);
+
     //! computes the integral image and integral for the squared image
     //! sum will have CV_32S type, sqsum - CV32F type
     //! supports only CV_8UC1 source type
@@ -1433,27 +1438,28 @@ namespace cv
         static void downloadDescriptors(const GpuMat& descriptorsGPU, vector<float>& descriptors);
 
         //! finds the keypoints using fast hessian detector used in SURF
-        //! supports CV_8UC1 (0..255) and CV_32FC1 (0..1) images
+        //! supports CV_8UC1 images
         //! keypoints will have 1 row and type CV_32FC(6)
-        //! keypoints.at<float[6]>(1, i) contains i'th keypoint
+        //! keypoints.at<float[6]>(1, i) contains i'th keypoint
         //! format: (x, y, size, response, angle, octave)
-        void operator()(const GpuMat& img, GpuMat& keypoints);
+        void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints);
 
         //! finds the keypoints and computes their descriptors.
        //! Optionally it can compute descriptors for the user-provided keypoints and recompute the keypoints' direction
-        void operator()(const GpuMat& img, GpuMat& keypoints, GpuMat& descriptors,
+        void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors,
             bool useProvidedKeypoints = false, bool calcOrientation = true);
 
-        void operator()(const GpuMat& img, std::vector<KeyPoint>& keypoints);
-        void operator()(const GpuMat& img, std::vector<KeyPoint>& keypoints, GpuMat& descriptors,
+        void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints);
+        void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors,
             bool useProvidedKeypoints = false, bool calcOrientation = true);
 
-        void operator()(const GpuMat& img, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors,
+        void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors,
             bool useProvidedKeypoints = false, bool calcOrientation = true);
 
-        GpuMat img_float;
-        GpuMat img_float_tr;
+        GpuMat sum;
+        GpuMat sumf;
 
-        GpuMat sum;
+        GpuMat mask1;
+        GpuMat maskSum;
 
         GpuMat hessianBuffer;
         GpuMat maxPosBuffer;
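A minimal usage sketch of the new API surface (not part of the patch; the file name and ROI choice are illustrative, and it assumes an OpenCV 2.x build with the CUDA-enabled gpu module):

    #include <vector>
    #include <opencv2/core/core.hpp>
    #include <opencv2/highgui/highgui.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::Mat img = cv::imread("scene.png", 0);        // CV_8UC1, as SURF_GPU now requires
        cv::Mat roi(img.size(), CV_8UC1, cv::Scalar(0));
        roi(cv::Rect(0, 0, img.cols / 2, img.rows)).setTo(cv::Scalar(255)); // search the left half only

        cv::gpu::GpuMat d_img(img), d_mask(roi), d_sum, d_descriptors;

        cv::gpu::integral(d_img, d_sum);                 // new single-sum overload

        cv::gpu::SURF_GPU surf;
        std::vector<cv::KeyPoint> keypoints;
        surf(d_img, d_mask, keypoints, d_descriptors);   // new mask parameter
        return 0;
    }

Passing an empty GpuMat() as the mask preserves the old unmasked behaviour.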
diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index 9d20a174a8..28acc6a713 100644
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -71,19 +71,13 @@ void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool,
 ////////////////////////////////////////////////////////////////////////
 // transpose
 
-namespace cv { namespace gpu { namespace mathfunc
-{
-    void transpose_gpu(const DevMem2Di& src, const DevMem2Di& dst);
-}}}
-
 void cv::gpu::transpose(const GpuMat& src, GpuMat& dst)
 {
-    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8SC1 || src.type() == CV_8UC4 || src.type() == CV_8SC4
-        || src.type() == CV_16UC2 || src.type() == CV_16SC2 || src.type() == CV_32SC1 || src.type() == CV_32FC1);
+    CV_Assert(src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8);
 
     dst.create( src.cols, src.rows, src.type() );
 
-    if (src.type() == CV_8UC1 || src.type() == CV_8SC1)
+    if (src.elemSize() == 1)
     {
         NppiSize sz;
         sz.width  = src.cols;
@@ -91,9 +85,23 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst)
 
         nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz) );
     }
-    else
+    else if (src.elemSize() == 4)
     {
-        mathfunc::transpose_gpu(src, dst);
+        NppStSize32u sz;
+        sz.width  = src.cols;
+        sz.height = src.rows;
+
+        nppSafeCall( nppiStTranspose_32u_C1R(const_cast<NppSt32u*>(src.ptr<NppSt32u>()), src.step,
+            dst.ptr<NppSt32u>(), dst.step, sz) );
+    }
+    else // if (src.elemSize() == 8)
+    {
+        NppStSize32u sz;
+        sz.width  = src.cols;
+        sz.height = src.rows;
+
+        nppSafeCall( nppiStTranspose_64u_C1R(const_cast<NppSt64u*>(src.ptr<NppSt64u>()), src.step,
+            dst.ptr<NppSt64u>(), dst.step, sz) );
     }
 }
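With dispatch keyed on elemSize() rather than an explicit type list, any matrix whose elements occupy 1, 4 or 8 bytes is accepted. An illustrative sketch of which NPP path each type takes:

    #include <opencv2/gpu/gpu.hpp>

    void transposeExamples()
    {
        cv::gpu::GpuMat a(64, 128, CV_8UC1),  ta; // 1-byte elements -> nppiTranspose_8u_C1R
        cv::gpu::GpuMat b(64, 128, CV_32FC1), tb; // 4-byte elements -> nppiStTranspose_32u_C1R
        cv::gpu::GpuMat c(64, 128, CV_16UC4), tc; // 8-byte elements -> nppiStTranspose_64u_C1R

        cv::gpu::transpose(a, ta);
        cv::gpu::transpose(b, tb);
        cv::gpu::transpose(c, tc);
        // A 3-byte type such as CV_8UC3 still fails the CV_Assert above.
    }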
diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu
index 1f8d50c1c9..cd7ee6fa74 100644
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -214,44 +214,6 @@ namespace cv { namespace gpu { namespace mathfunc
         callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
     }
 
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-// transpose
-
-    __global__ void transpose(const DevMem2Di src, PtrStepi dst)
-    {
-        __shared__ int s_mem[16 * 17];
-
-        int x = blockIdx.x * blockDim.x + threadIdx.x;
-        int y = blockIdx.y * blockDim.y + threadIdx.y;
-        int smem_idx = threadIdx.y * blockDim.x + threadIdx.x + threadIdx.y;
-
-        if (y < src.rows && x < src.cols)
-        {
-            s_mem[smem_idx] = src.ptr(y)[x];
-        }
-        __syncthreads();
-
-        smem_idx = threadIdx.x * blockDim.x + threadIdx.y + threadIdx.x;
-
-        x = blockIdx.y * blockDim.x + threadIdx.x;
-        y = blockIdx.x * blockDim.y + threadIdx.y;
-
-        if (y < src.cols && x < src.rows)
-        {
-            dst.ptr(y)[x] = s_mem[smem_idx];
-        }
-    }
-
-    void transpose_gpu(const DevMem2Di& src, const DevMem2Di& dst)
-    {
-        dim3 threads(16, 16, 1);
-        dim3 grid(divUp(src.cols, 16), divUp(src.rows, 16), 1);
-
-        transpose<<<grid, threads>>>(src, dst);
-        cudaSafeCall( cudaThreadSynchronize() );
-    }
 }}}

diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu
index 29b86ee2bb..a6aef8a35a 100644
--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@@ -259,7 +259,36 @@ namespace cv { namespace gpu { namespace surf
 
     ////////////////////////////////////////////////////////////////////////
     // NONMAX
+
+    texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
+
+    struct WithOutMask
+    {
+        static __device__ bool check(float, float, float)
+        {
+            return true;
+        }
+    };
+
+    struct WithMask
+    {
+        static __device__ bool check(float x, float y, float fscale)
+        {
+            float half_width = fscale / 2;
+
+            float result = 0.f;
+
+            result += tex2D(maskSumTex, x - half_width, y - half_width);
+            result -= tex2D(maskSumTex, x + half_width, y - half_width);
+            result -= tex2D(maskSumTex, x - half_width, y + half_width);
+            result += tex2D(maskSumTex, x + half_width, y + half_width);
+
+            result /= (fscale * fscale);
+
+            return (result >= 0.5f);
+        }
+    };
 
+    template <typename Mask>
     __global__ void nonmaxonly(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int* maxCounter)
     {
 #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
@@ -287,7 +316,12 @@ namespace cv { namespace gpu { namespace surf
 
         float val = fh_vals[localLin];
 
-        if (inBounds2 && val >= c_threshold)
+        // Compute the lookup location of the mask center
+        float x = hidx_x * c_step + c_border;
+        float y = hidx_y * c_step + c_border;
+        float fscale = calcScale(hidx_z);
+
+        if (inBounds2 && val >= c_threshold && Mask::check(x, y, fscale))
         {
             // Check to see if we have a max (in its 26 neighbours)
             int zoff = blockDim.x * blockDim.y;
@@ -337,7 +371,7 @@ namespace cv { namespace gpu { namespace surf
     }
 
     void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter,
-        int nIntervals, int x_size, int y_size)
+        int nIntervals, int x_size, int y_size, bool use_mask)
     {
         dim3 threads;
         threads.x = 16;
@@ -353,7 +387,10 @@ namespace cv { namespace gpu { namespace surf
 
         DeviceReference<unsigned int> maxCounterWrapper(maxCounter);
 
-        nonmaxonly<<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);
+        if (use_mask)
+            nonmaxonly<WithMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);
+        else
+            nonmaxonly<WithOutMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);
 
         cudaSafeCall( cudaThreadSynchronize() );
     }
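The WithMask policy evaluates the integral image of the binarized mask at the four corners of the fscale x fscale window centred on the candidate, which yields the number of masked-in pixels under the window; candidates with less than 50% coverage are culled before the 26-neighbour maximum test. An approximate host-side analogue (illustrative only; the kernel samples a clamped, point-filtered texture, so it needs no bounds checks):

    #include <opencv2/core/core.hpp>

    // maskSum: CV_32S integral image of min(mask, 1); coordinates assumed in range.
    static bool maskCheckCPU(const cv::Mat& maskSum, float x, float y, float fscale)
    {
        int half = cvRound(fscale / 2);
        int x0 = cvRound(x) - half, y0 = cvRound(y) - half;
        int x1 = cvRound(x) + half, y1 = cvRound(y) + half;

        // Four-tap box sum: number of mask pixels inside the window.
        float inside = float(maskSum.at<int>(y1, x1) - maskSum.at<int>(y0, x1)
                           - maskSum.at<int>(y1, x0) + maskSum.at<int>(y0, x0));

        return inside / (fscale * fscale) >= 0.5f;  // require at least half coverage
    }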
diff --git a/modules/gpu/src/imgproc_gpu.cpp b/modules/gpu/src/imgproc_gpu.cpp
index 48f77eee1e..88e3e41fc0 100644
--- a/modules/gpu/src/imgproc_gpu.cpp
+++ b/modules/gpu/src/imgproc_gpu.cpp
@@ -60,6 +60,7 @@ void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, const S
 void cv::gpu::warpAffine(const GpuMat&, GpuMat&, const Mat&, Size, int) { throw_nogpu(); }
 void cv::gpu::warpPerspective(const GpuMat&, GpuMat&, const Mat&, Size, int) { throw_nogpu(); }
 void cv::gpu::rotate(const GpuMat&, GpuMat&, Size, double, double, double, int) { throw_nogpu(); }
+void cv::gpu::integral(const GpuMat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::integral(const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::columnSum(const GpuMat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::rectStdDev(const GpuMat&, const GpuMat&, GpuMat&, const Rect&) { throw_nogpu(); }
@@ -547,6 +548,26 @@ void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, d
 ////////////////////////////////////////////////////////////////////////
 // integral
 
+void cv::gpu::integral(const GpuMat& src, GpuMat& sum)
+{
+    CV_Assert(src.type() == CV_8UC1);
+
+    sum.create(src.rows + 1, src.cols + 1, CV_32S);
+
+    NppStSize32u roiSize;
+    roiSize.width  = src.cols;
+    roiSize.height = src.rows;
+
+    NppSt32u bufSize;
+
+    nppSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize) );
+
+    GpuMat buffer(1, bufSize, CV_8UC1);
+
+    nppSafeCall( nppiStIntegral_8u32u_C1R(const_cast<NppSt8u*>(src.ptr<NppSt8u>()), src.step,
+        sum.ptr<NppSt32u>(), sum.step, roiSize, buffer.ptr<NppSt8u>(), bufSize) );
+}
+
 void cv::gpu::integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum)
 {
     CV_Assert(src.type() == CV_8UC1);
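As the comment in gpu.hpp notes, the result is a (rows+1) x (cols+1) CV_32S matrix holding unsigned counts, so the sum over any axis-aligned rectangle costs four reads. A small sketch of the classic use (illustrative, not part of the patch):

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    // sum: integral image downloaded from the GPU.
    static int boxSum(const cv::Mat& sum, cv::Rect r)
    {
        return sum.at<int>(r.y + r.height, r.x + r.width)
             - sum.at<int>(r.y,            r.x + r.width)
             - sum.at<int>(r.y + r.height, r.x)
             + sum.at<int>(r.y,            r.x);
    }

    int boxSumOnGpu(const cv::gpu::GpuMat& d_img /* CV_8UC1 */)
    {
        cv::gpu::GpuMat d_sum;
        cv::gpu::integral(d_img, d_sum);  // NPP Staging path with a scratch buffer

        cv::Mat sum;
        d_sum.download(sum);
        return boxSum(sum, cv::Rect(10, 10, 32, 32));
    }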
diff --git a/modules/gpu/src/surf.cpp b/modules/gpu/src/surf.cpp
index ab67ff3a82..58f346332c 100644
--- a/modules/gpu/src/surf.cpp
+++ b/modules/gpu/src/surf.cpp
@@ -52,11 +52,11 @@ int cv::gpu::SURF_GPU::descriptorSize() const { throw_nogpu(); return 0;}
 void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); }
 void cv::gpu::SURF_GPU::downloadDescriptors(const GpuMat&, vector<float>&) { throw_nogpu(); }
-void cv::gpu::SURF_GPU::operator()(const GpuMat&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::SURF_GPU::operator()(const GpuMat&, GpuMat&, GpuMat&, bool, bool) { throw_nogpu(); }
-void cv::gpu::SURF_GPU::operator()(const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); }
-void cv::gpu::SURF_GPU::operator()(const GpuMat&, vector<KeyPoint>&, GpuMat&, bool, bool) { throw_nogpu(); }
-void cv::gpu::SURF_GPU::operator()(const GpuMat&, vector<KeyPoint>&, vector<float>&, bool, bool) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, bool) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, GpuMat&, bool, bool) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, vector<float>&, bool, bool) { throw_nogpu(); }
 
 #else /* !defined (HAVE_CUDA) */
 
@@ -65,7 +65,7 @@ namespace cv { namespace gpu { namespace surf
     void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size);
 
     void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter,
-        int nIntervals, int x_size, int y_size);
+        int nIntervals, int x_size, int y_size, bool use_mask);
 
     void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter,
         KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter);
@@ -82,12 +82,12 @@ namespace
     class SURF_GPU_Invoker : private SURFParams_GPU
     {
     public:
-        SURF_GPU_Invoker(SURF_GPU& surf, const GpuMat& img) :
+        SURF_GPU_Invoker(SURF_GPU& surf, const GpuMat& img, const GpuMat& mask) :
             SURFParams_GPU(surf),
 
-            img_float(surf.img_float), img_float_tr(surf.img_float_tr),
+            sum(surf.sum), sumf(surf.sumf),
 
-            sum(surf.sum),
+            mask1(surf.mask1), maskSum(surf.maskSum),
 
             hessianBuffer(surf.hessianBuffer),
             maxPosBuffer(surf.maxPosBuffer),
@@ -95,11 +95,15 @@ namespace
 
             img_cols(img.cols), img_rows(img.rows),
 
+            use_mask(!mask.empty()),
+
             mask_width(0), mask_height(0),
 
             featureCounter(0), maxCounter(0)
         {
-            CV_Assert((img.type() == CV_8UC1 || img.type() == CV_32FC1) && nOctaves > 0 && nIntervals > 2);
+            CV_Assert(img.type() == CV_8UC1);
+            CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
+            CV_Assert(nOctaves > 0 && nIntervals > 2);
             CV_Assert(hasAtomicsSupport(getDevice()));
 
             max_features = static_cast<int>(img.size().area() * featuresRatio);
@@ -139,22 +143,25 @@ namespace
 
             hessianBuffer.create(height0 * nIntervals, width0, CV_32F);
 
-            if (img.type() == CV_32FC1)
-                img_float = img;
-            else
-                img.convertTo(img_float, CV_32F, 1.0 / 255.0);
-
-            transpose(img_float, img_float_tr);
-            columnSum(img_float_tr, img_float_tr);
-            transpose(img_float_tr, sum);
-            columnSum(sum, sum);
+            integral(img, sum);
+            sum.convertTo(sumf, CV_32F, 1.0 / 255.0);
 
-            bindTexture("cv::gpu::surf::sumTex", (DevMem2Df)sum);
+            bindTexture("cv::gpu::surf::sumTex", (DevMem2Df)sumf);
+
+            if (!mask.empty())
+            {
+                min(mask, 1.0, mask1);
+                integral(mask1, maskSum);
+
+                bindTexture("cv::gpu::surf::maskSumTex", (DevMem2Di)maskSum);
+            }
         }
 
         ~SURF_GPU_Invoker()
         {
             unbindTexture("cv::gpu::surf::sumTex");
+            if (use_mask)
+                unbindTexture("cv::gpu::surf::maskSumTex");
         }
 
         void detectKeypoints(GpuMat& keypoints)
@@ -185,7 +192,7 @@ namespace
             // Reset the candidate count.
             maxCounter = 0;
 
-            nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, nIntervals, x_size, y_size);
+            nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, nIntervals, x_size, y_size, use_mask);
 
             maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));
 
@@ -214,16 +221,19 @@ namespace
         }
 
     private:
-        GpuMat& img_float;
-        GpuMat& img_float_tr;
+        GpuMat& sum;
+        GpuMat& sumf;
 
-        GpuMat& sum;
+        GpuMat& mask1;
+        GpuMat& maskSum;
 
         GpuMat& hessianBuffer;
        GpuMat& maxPosBuffer;
         GpuMat& featuresBuffer;
 
         int img_cols, img_rows;
+
+        bool use_mask;
 
         float mask_width, mask_height;
 
@@ -298,19 +308,19 @@ void cv::gpu::SURF_GPU::downloadDescriptors(const GpuMat& descriptorsGPU, vector
     descriptorsGPU.download(descriptorsCPU);
 }
 
-void cv::gpu::SURF_GPU::operator()(const GpuMat& img, GpuMat& keypoints)
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints)
 {
-    SURF_GPU_Invoker surf(*this, img);
+    SURF_GPU_Invoker surf(*this, img, mask);
 
     surf.detectKeypoints(keypoints);
 
     surf.findOrientation(keypoints);
 }
 
-void cv::gpu::SURF_GPU::operator()(const GpuMat& img, GpuMat& keypoints, GpuMat& descriptors,
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors,
     bool useProvidedKeypoints, bool calcOrientation)
 {
-    SURF_GPU_Invoker surf(*this, img);
+    SURF_GPU_Invoker surf(*this, img, mask);
 
     if (!useProvidedKeypoints)
         surf.detectKeypoints(keypoints);
@@ -321,34 +331,34 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, GpuMat& keypoints, GpuMat&
     surf.computeDescriptors(keypoints, descriptors, descriptorSize());
 }
 
-void cv::gpu::SURF_GPU::operator()(const GpuMat& img, vector<KeyPoint>& keypoints)
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints)
 {
     GpuMat keypointsGPU;
 
-    (*this)(img, keypointsGPU);
+    (*this)(img, mask, keypointsGPU);
 
     downloadKeypoints(keypointsGPU, keypoints);
 }
 
-void cv::gpu::SURF_GPU::operator()(const GpuMat& img, vector<KeyPoint>& keypoints, GpuMat& descriptors,
-    bool useProvidedKeypoints, bool calcOrientation)
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints,
+    GpuMat& descriptors, bool useProvidedKeypoints, bool calcOrientation)
 {
     GpuMat keypointsGPU;
 
     if (useProvidedKeypoints)
         uploadKeypoints(keypoints, keypointsGPU);
 
-    (*this)(img, keypointsGPU, descriptors, useProvidedKeypoints, calcOrientation);
+    (*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints, calcOrientation);
 
     downloadKeypoints(keypointsGPU, keypoints);
 }
 
-void cv::gpu::SURF_GPU::operator()(const GpuMat& img, vector<KeyPoint>& keypoints, vector<float>& descriptors,
-    bool useProvidedKeypoints, bool calcOrientation)
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints,
+    vector<float>& descriptors, bool useProvidedKeypoints, bool calcOrientation)
 {
     GpuMat descriptorsGPU;
 
-    (*this)(img, keypoints, descriptorsGPU, useProvidedKeypoints, calcOrientation);
+    (*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints, calcOrientation);
 
     downloadDescriptors(descriptorsGPU, descriptors);
 }
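The invoker now builds the summed-area table with the new integral() followed by a float conversion, replacing the two transpose/columnSum round trips; the 1/255 scale keeps the same 0..1 range the old CV_32F path produced. A host-side equivalent of the new preprocessing (illustrative):

    #include <opencv2/imgproc/imgproc.hpp>

    void buildSumf(const cv::Mat& img /* CV_8UC1 */, cv::Mat& sumf)
    {
        cv::Mat sum;
        cv::integral(img, sum, CV_32S);            // exact integer integral
        sum.convertTo(sumf, CV_32F, 1.0 / 255.0);  // same scaling the old float path applied
    }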
diff --git a/tests/gpu/src/imgproc_gpu.cpp b/tests/gpu/src/imgproc_gpu.cpp
index 53c93e52df..73ba9b1107 100644
--- a/tests/gpu/src/imgproc_gpu.cpp
+++ b/tests/gpu/src/imgproc_gpu.cpp
@@ -384,29 +384,14 @@ struct CV_GpuNppImageIntegralTest : public CV_GpuImageProcTest
             return CvTS::OK;
         }
 
-        Mat cpusum, cpusqsum;
-        cv::integral(img, cpusum, cpusqsum, CV_32S);
+        Mat cpusum;
+        cv::integral(img, cpusum, CV_32S);
 
         GpuMat gpu1(img);
-        GpuMat gpusum, gpusqsum;
-        cv::gpu::integral(gpu1, gpusum, gpusqsum);
+        GpuMat gpusum;
+        cv::gpu::integral(gpu1, gpusum);
 
-        gpusqsum.convertTo(gpusqsum, CV_64F);
-
-        int test_res = CvTS::OK;
-
-        if (CheckNorm(cpusum, gpusum) != CvTS::OK)
-        {
-            ts->printf(CvTS::LOG, "\nSum failed\n");
-            test_res = CvTS::FAIL_GENERIC;
-        }
-        if (CheckNorm(cpusqsum, gpusqsum) != CvTS::OK)
-        {
-            ts->printf(CvTS::LOG, "\nSquared sum failed\n");
-            test_res = CvTS::FAIL_GENERIC;
-        }
-
-        return test_res;
+        return CheckNorm(cpusum, gpusum) == CvTS::OK ? CvTS::OK : CvTS::FAIL_GENERIC;
     }
 };
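The updated test exercises the new single-sum overload and drops the squared-sum comparison; the plain integral is integer-exact, so the GPU result should match cv::integral bit for bit. A standalone check mirroring the updated test (illustrative, not part of the patch):

    #include <opencv2/imgproc/imgproc.hpp>
    #include <opencv2/gpu/gpu.hpp>

    bool checkIntegral(const cv::Mat& img /* CV_8UC1 */)
    {
        cv::Mat cpusum;
        cv::integral(img, cpusum, CV_32S);

        cv::gpu::GpuMat gpusum;
        cv::gpu::integral(cv::gpu::GpuMat(img), gpusum);

        cv::Mat fromGpu;
        gpusum.download(fromGpu);
        return cv::norm(cpusum, fromGpu, cv::NORM_INF) == 0;
    }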