diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 3c7dcc7d71..642deeeaad 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -756,12 +756,6 @@ namespace cv //! computes the proximity map for the raster template and the image where the template is searched for CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method); - //! downsamples image - CV_EXPORTS void downsample(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()); - - //! upsamples image - CV_EXPORTS void upsample(const GpuMat& src, GpuMat &dst, Stream& stream = Stream::Null()); - //! smoothes the source image and downsamples it CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null()); diff --git a/modules/gpu/perf/perf_filters.cpp b/modules/gpu/perf/perf_filters.cpp index 76177a060a..d40f070016 100644 --- a/modules/gpu/perf/perf_filters.cpp +++ b/modules/gpu/perf/perf_filters.cpp @@ -3,7 +3,7 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, boxFilter, testing::Combine(testing::ValuesIn(devices()), testing::Values(GPU_TYPICAL_MAT_SIZES), testing::Values(CV_8UC1, CV_8UC4), - testing::Values(3, 5, 7))) + testing::Values(3, 5))) { DeviceInfo devInfo = std::tr1::get<0>(GetParam()); Size size = std::tr1::get<1>(GetParam()); @@ -37,7 +37,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MorphOp_KernelSize, morphologyFilter, testing:: testing::Values(GPU_TYPICAL_MAT_SIZES), testing::Values(CV_8UC1, CV_8UC4), testing::Values((int)MORPH_ERODE, (int)MORPH_DILATE), - testing::Values(3, 5, 7))) + testing::Values(3, 5))) { DeviceInfo devInfo = std::tr1::get<0>(GetParam()); Size size = std::tr1::get<1>(GetParam()); @@ -71,7 +71,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MorphOp_KernelSize, morphologyFilter, testing:: PERF_TEST_P(DevInfo_Size_MatType_KernelSize, linearFilter, testing::Combine(testing::ValuesIn(devices()), testing::Values(GPU_TYPICAL_MAT_SIZES), testing::Values(CV_8UC1, CV_8UC4), - testing::Values(3, 5, 7))) + testing::Values(3, 5))) { DeviceInfo devInfo = std::tr1::get<0>(GetParam()); Size size = std::tr1::get<1>(GetParam()); @@ -103,8 +103,8 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, linearFilter, testing::Combine(test PERF_TEST_P(DevInfo_Size_MatType_KernelSize_BorderMode, separableLinearFilter, testing::Combine(testing::ValuesIn(devices()), testing::Values(GPU_TYPICAL_MAT_SIZES), - testing::Values(CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC3, CV_32FC1), - testing::Values(3, 5, 7), + testing::Values(CV_8UC1, CV_8UC4, CV_16SC3, CV_32FC1), + testing::Values(3, 5), testing::Values((int)BORDER_REFLECT101, (int)BORDER_CONSTANT))) { DeviceInfo devInfo = std::tr1::get<0>(GetParam()); diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp index f4d63d6dd6..fb72043da7 100644 --- a/modules/gpu/perf/perf_imgproc.cpp +++ b/modules/gpu/perf/perf_imgproc.cpp @@ -244,8 +244,8 @@ PERF_TEST_P(DevInfo_Size_MatType, threshold, testing::Combine(testing::ValuesIn( } PERF_TEST_P(DevInfo_Size_MatType_Interpolation_SizeCoeff, resize, testing::Combine(testing::ValuesIn(devices()), - testing::Values(GPU_TYPICAL_MAT_SIZES), - testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4), + testing::Values(szSXGA, sz1080p), + testing::Values(CV_8UC1, CV_8UC4, CV_16UC1, CV_32FC1), testing::Values((int)INTER_NEAREST, (int)INTER_LINEAR, (int)INTER_CUBIC), testing::Values(0.5, 2.0))) { diff --git 
a/modules/gpu/perf/perf_utility.hpp b/modules/gpu/perf/perf_utility.hpp index 883bd4a392..ec6b052fe7 100644 --- a/modules/gpu/perf/perf_utility.hpp +++ b/modules/gpu/perf/perf_utility.hpp @@ -53,7 +53,8 @@ typedef TestBaseWithParam< std::tr1::tuple > DevInfo_K_Des const cv::Size sz1800x1500 = cv::Size(1800, 1500); const cv::Size sz4700x3000 = cv::Size(4700, 3000); -#define GPU_TYPICAL_MAT_SIZES szXGA, szSXGA, sz720p, sz1080p, sz1800x1500, sz4700x3000 +//#define GPU_TYPICAL_MAT_SIZES szXGA, szSXGA, sz720p, sz1080p, sz1800x1500, sz4700x3000 +#define GPU_TYPICAL_MAT_SIZES szSXGA, sz1080p, sz4700x3000 //! read image from testdata folder. Mat readImage(const string& fileName, int flags = CV_LOAD_IMAGE_COLOR); diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp index ab24cb0f7e..c56dcda0df 100644 --- a/modules/gpu/src/brute_force_matcher.cpp +++ b/modules/gpu/src/brute_force_matcher.cpp @@ -179,18 +179,18 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs, static const match_caller_t match_callers[3][8] = { { - matchSingleL1_gpu, matchSingleL1_gpu, + matchSingleL1_gpu, 0/*matchSingleL1_gpu*/, matchSingleL1_gpu, matchSingleL1_gpu, matchSingleL1_gpu, matchSingleL1_gpu, 0, 0 }, { - matchSingleL2_gpu, matchSingleL2_gpu, - matchSingleL2_gpu, matchSingleL2_gpu, - matchSingleL2_gpu, matchSingleL2_gpu, 0, 0 + 0/*matchSingleL2_gpu*/, 0/*matchSingleL2_gpu*/, + 0/*matchSingleL2_gpu*/, 0/*matchSingleL2_gpu*/, + 0/*matchSingleL2_gpu*/, matchSingleL2_gpu, 0, 0 }, { - matchSingleHamming_gpu, matchSingleHamming_gpu, - matchSingleHamming_gpu, matchSingleHamming_gpu, + matchSingleHamming_gpu, 0/*matchSingleHamming_gpu*/, + matchSingleHamming_gpu, 0/*matchSingleHamming_gpu*/, matchSingleHamming_gpu, 0, 0, 0 } }; @@ -318,18 +318,18 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes static const match_caller_t match_callers[3][8] = { { - matchCollectionL1_gpu, matchCollectionL1_gpu, + matchCollectionL1_gpu, 0/*matchCollectionL1_gpu*/, matchCollectionL1_gpu, matchCollectionL1_gpu, matchCollectionL1_gpu, matchCollectionL1_gpu, 0, 0 }, { - matchCollectionL2_gpu, matchCollectionL2_gpu, - matchCollectionL2_gpu, matchCollectionL2_gpu, - matchCollectionL2_gpu, matchCollectionL2_gpu, 0, 0 + 0/*matchCollectionL2_gpu*/, 0/*matchCollectionL2_gpu*/, + 0/*matchCollectionL2_gpu*/, 0/*matchCollectionL2_gpu*/, + 0/*matchCollectionL2_gpu*/, matchCollectionL2_gpu, 0, 0 }, { - matchCollectionHamming_gpu, matchCollectionHamming_gpu, - matchCollectionHamming_gpu, matchCollectionHamming_gpu, + matchCollectionHamming_gpu, 0/*matchCollectionHamming_gpu*/, + matchCollectionHamming_gpu, 0/*matchCollectionHamming_gpu*/, matchCollectionHamming_gpu, 0, 0, 0 } }; @@ -427,16 +427,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con static const match_caller_t match_callers[3][8] = { { - knnMatchL1_gpu, knnMatchL1_gpu, knnMatchL1_gpu, + knnMatchL1_gpu, 0/*knnMatchL1_gpu*/, knnMatchL1_gpu, knnMatchL1_gpu, knnMatchL1_gpu, knnMatchL1_gpu, 0, 0 }, { - knnMatchL2_gpu, knnMatchL2_gpu, knnMatchL2_gpu, - knnMatchL2_gpu, knnMatchL2_gpu, knnMatchL2_gpu, 0, 0 + 0/*knnMatchL2_gpu*/, 0/*knnMatchL2_gpu*/, 0/*knnMatchL2_gpu*/, + 0/*knnMatchL2_gpu*/, 0/*knnMatchL2_gpu*/, knnMatchL2_gpu, 0, 0 }, { - knnMatchHamming_gpu, knnMatchHamming_gpu, knnMatchHamming_gpu, - knnMatchHamming_gpu, knnMatchHamming_gpu, 0, 0, 0 + knnMatchHamming_gpu, 0/*knnMatchHamming_gpu*/, knnMatchHamming_gpu, + 0/*knnMatchHamming_gpu*/, 
knnMatchHamming_gpu, 0, 0, 0 } }; @@ -605,16 +605,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, static const radiusMatch_caller_t radiusMatch_callers[3][8] = { { - radiusMatchL1_gpu, radiusMatchL1_gpu, radiusMatchL1_gpu, + radiusMatchL1_gpu, 0/*radiusMatchL1_gpu*/, radiusMatchL1_gpu, radiusMatchL1_gpu, radiusMatchL1_gpu, radiusMatchL1_gpu, 0, 0 }, { - radiusMatchL2_gpu, radiusMatchL2_gpu, radiusMatchL2_gpu, - radiusMatchL2_gpu, radiusMatchL2_gpu, radiusMatchL2_gpu, 0, 0 + 0/*radiusMatchL2_gpu*/, 0/*radiusMatchL2_gpu*/, 0/*radiusMatchL2_gpu*/, + 0/*radiusMatchL2_gpu*/, 0/*radiusMatchL2_gpu*/, radiusMatchL2_gpu, 0, 0 }, { - radiusMatchHamming_gpu, radiusMatchHamming_gpu, radiusMatchHamming_gpu, - radiusMatchHamming_gpu, radiusMatchHamming_gpu, 0, 0, 0 + radiusMatchHamming_gpu, 0/*radiusMatchHamming_gpu*/, radiusMatchHamming_gpu, + 0/*radiusMatchHamming_gpu*/, radiusMatchHamming_gpu, 0, 0, 0 } }; diff --git a/modules/gpu/src/cuda/bf_knnmatch.cu b/modules/gpu/src/cuda/bf_knnmatch.cu new file mode 100644 index 0000000000..079251e485 --- /dev/null +++ b/modules/gpu/src/cuda/bf_knnmatch.cu @@ -0,0 +1,464 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage.
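
The knnMatch2 kernel in the file that follows keeps the two smallest distances per query (k = 2). As a point of reference, here is a minimal host-side sketch of a best/second-best scan (plain C++, a hypothetical helper, not part of the patch); note that it demotes the previous best into second place on improvement, whereas the per-thread rule in the kernel is simpler and is combined with a later cross-thread reduction:

#include <limits>
#include <utility>

// Return indices of the smallest and second-smallest entries of dist[0..n).
static std::pair<int, int> bestTwo(const float* dist, int n)
{
    float d1 = std::numeric_limits<float>::max(), d2 = d1;
    int i1 = -1, i2 = -1;
    for (int i = 0; i < n; ++i)
    {
        const float d = dist[i];
        if (d < d1)      { d2 = d1; i2 = i1; d1 = d; i1 = i; } // demote old best
        else if (d < d2) { d2 = d;  i2 = i; }                  // new runner-up
    }
    return std::make_pair(i1, i2);
}
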
+// +//M*/ + +#include "internal_shared.hpp" +#include "opencv2/gpu/device/limits.hpp" +#include "opencv2/gpu/device/vec_distance.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace cv { namespace gpu { namespace bfmatcher +{ + template + __device__ void distanceCalcLoop(const PtrStep_& query, const DevMem2D_& train, const Mask& m, int queryIdx, + typename Dist::result_type& distMin1, typename Dist::result_type& distMin2, int& bestTrainIdx1, int& bestTrainIdx2, + typename Dist::result_type* smem) + { + const VecDiff vecDiff(query.ptr(queryIdx), train.cols, (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x); + + typename Dist::result_type* sdiffRow = smem + blockDim.x * threadIdx.y; + + distMin1 = numeric_limits::max(); + distMin2 = numeric_limits::max(); + + bestTrainIdx1 = -1; + bestTrainIdx2 = -1; + + for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y) + { + if (m(queryIdx, trainIdx)) + { + Dist dist; + + const T* trainRow = train.ptr(trainIdx); + + vecDiff.calc(trainRow, train.cols, dist, sdiffRow, threadIdx.x); + + const typename Dist::result_type val = dist; + + if (val < distMin1) + { + distMin1 = val; + bestTrainIdx1 = trainIdx; + } + else if (val < distMin2) + { + distMin2 = val; + bestTrainIdx2 = trainIdx; + } + } + } + } + + template + __global__ void knnMatch2(const PtrStep_ query, const DevMem2D_ train, const Mask m, int2* trainIdx, float2* distance) + { + typedef typename Dist::result_type result_type; + typedef typename Dist::value_type value_type; + + __shared__ result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; + + const int queryIdx = blockIdx.x; + + result_type distMin1; + result_type distMin2; + + int bestTrainIdx1; + int bestTrainIdx2; + + distanceCalcLoop(query, train, m, queryIdx, distMin1, distMin2, bestTrainIdx1, bestTrainIdx2, smem); + __syncthreads(); + + volatile result_type* sdistMinRow = smem; + volatile int* sbestTrainIdxRow = (int*)(sdistMinRow + 2 * BLOCK_DIM_Y); + + if (threadIdx.x == 0) + { + sdistMinRow[threadIdx.y] = distMin1; + sdistMinRow[threadIdx.y + BLOCK_DIM_Y] = distMin2; + + sbestTrainIdxRow[threadIdx.y] = bestTrainIdx1; + sbestTrainIdxRow[threadIdx.y + BLOCK_DIM_Y] = bestTrainIdx2; + } + __syncthreads(); + + if (threadIdx.x == 0 && threadIdx.y == 0) + { + distMin1 = numeric_limits::max(); + distMin2 = numeric_limits::max(); + + bestTrainIdx1 = -1; + bestTrainIdx2 = -1; + + #pragma unroll + for (int i = 0; i < BLOCK_DIM_Y; ++i) + { + result_type val = sdistMinRow[i]; + + if (val < distMin1) + { + distMin1 = val; + bestTrainIdx1 = sbestTrainIdxRow[i]; + } + else if (val < distMin2) + { + distMin2 = val; + bestTrainIdx2 = sbestTrainIdxRow[i]; + } + } + + #pragma unroll + for (int i = BLOCK_DIM_Y; i < 2 * BLOCK_DIM_Y; ++i) + { + result_type val = sdistMinRow[i]; + + if (val < distMin2) + { + distMin2 = val; + bestTrainIdx2 = sbestTrainIdxRow[i]; + } + } + + trainIdx[queryIdx] = make_int2(bestTrainIdx1, bestTrainIdx2); + distance[queryIdx] = make_float2(distMin1, distMin2); + } + } + + /////////////////////////////////////////////////////////////////////////////// + // Knn 2 Match kernel caller + + template + void knnMatch2Simple_caller(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, + cudaStream_t stream) + { + const dim3 grid(query.rows, 1, 1); + const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + + knnMatch2, Dist, T> + <<>>(query, train, mask, trainIdx, distance); + cudaSafeCall( 
cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + template + void knnMatch2Cached_caller(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, + cudaStream_t stream) + { + StaticAssert= MAX_LEN>::check(); // block size must be greater than descriptor length + StaticAssert::check(); // max descriptor length must be divisible by blockDimX + + const dim3 grid(query.rows, 1, 1); + const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + + knnMatch2, Dist, T> + <<>>(query, train, mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + /////////////////////////////////////////////////////////////////////////////// + // Knn 2 Match Dispatcher + + template + void knnMatch2Dispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) + { + if (query.cols < 64) + { + knnMatch2Cached_caller<16, 16, 64, false, Dist>( + query, train, mask, + static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), + stream); + } + else if (query.cols == 64) + { + knnMatch2Cached_caller<16, 16, 64, true, Dist>( + query, train, mask, + static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), + stream); + } + else if (query.cols < 128) + { + knnMatch2Cached_caller<16, 16, 128, false, Dist>( + query, train, mask, + static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), + stream); + } + else if (query.cols == 128 && cc >= 12) + { + knnMatch2Cached_caller<16, 16, 128, true, Dist>( + query, train, mask, + static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), + stream); + } + else if (query.cols < 256 && cc >= 12) + { + knnMatch2Cached_caller<16, 16, 256, false, Dist>( + query, train, mask, + static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), + stream); + } + else if (query.cols == 256 && cc >= 12) + { + knnMatch2Cached_caller<16, 16, 256, true, Dist>( + query, train, mask, + static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), + stream); + } + else + { + knnMatch2Simple_caller<16, 16, Dist>( + query, train, mask, + static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), + stream); + } + } + + /////////////////////////////////////////////////////////////////////////////// + // Calc distance kernel + + template + __global__ void calcDistance(const PtrStep_ query, const DevMem2D_ train, const Mask mask, PtrStepf distance) + { + __shared__ typename Dist::result_type sdiff[BLOCK_DIM_X * BLOCK_DIM_Y]; + + typename Dist::result_type* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y; + + const int queryIdx = blockIdx.x; + const T* queryDescs = query.ptr(queryIdx); + + const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; + + if (trainIdx < train.rows) + { + const T* trainDescs = train.ptr(trainIdx); + + typename Dist::result_type myDist = numeric_limits::max(); + + if (mask(queryIdx, trainIdx)) + { + Dist dist; + + calcVecDiffGlobal(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x); + + myDist = dist; + } + + if (threadIdx.x == 0) + distance.ptr(queryIdx)[trainIdx] = myDist; + } + } + + /////////////////////////////////////////////////////////////////////////////// + // Calc distance kernel caller + + template + void calcDistance_caller(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const
DevMem2Df& distance, cudaStream_t stream) + { + const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1); + + calcDistance<<>>(query, train, mask, distance); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + template + void calcDistanceDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2D& allDist, cudaStream_t stream) + { + calcDistance_caller<16, 16, Dist>(query, train, mask, static_cast(allDist), stream); + } + + /////////////////////////////////////////////////////////////////////////////// + // find knn match kernel + + template __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_) + { + const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64; + __shared__ float sdist[SMEM_SIZE]; + __shared__ int strainIdx[SMEM_SIZE]; + + const int queryIdx = blockIdx.x; + + float* allDist = allDist_.ptr(queryIdx); + int* trainIdx = trainIdx_.ptr(queryIdx); + float* distance = distance_.ptr(queryIdx); + + float dist = numeric_limits::max(); + int bestIdx = -1; + + for (int i = threadIdx.x; i < allDist_.cols; i += BLOCK_SIZE) + { + float reg = allDist[i]; + if (reg < dist) + { + dist = reg; + bestIdx = i; + } + } + + sdist[threadIdx.x] = dist; + strainIdx[threadIdx.x] = bestIdx; + __syncthreads(); + + reducePredVal(sdist, dist, strainIdx, bestIdx, threadIdx.x, less()); + + if (threadIdx.x == 0) + { + if (dist < numeric_limits::max()) + { + allDist[bestIdx] = numeric_limits::max(); + trainIdx[i] = bestIdx; + distance[i] = dist; + } + } + } + + /////////////////////////////////////////////////////////////////////////////// + // find knn match kernel caller + + template void findKnnMatch_caller(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) + { + const dim3 threads(BLOCK_SIZE, 1, 1); + const dim3 grid(trainIdx.rows, 1, 1); + + for (int i = 0; i < k; ++i) + { + findBestMatch<<>>(allDist, i, trainIdx, distance); + cudaSafeCall( cudaGetLastError() ); + } + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + void findKnnMatchDispatcher(int k, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, cudaStream_t stream) + { + findKnnMatch_caller<256>(k, static_cast(trainIdx), static_cast(distance), static_cast(allDist), stream); + } + + /////////////////////////////////////////////////////////////////////////////// + // knn match Dispatcher + + template + void knnMatchDispatcher(const DevMem2D_& query, const DevMem2D_& train, int k, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, + int cc, cudaStream_t stream) + { + if (mask.data) + { + if (k == 2) + { + knnMatch2Dispatcher(query, train, SingleMask(mask), trainIdx, distance, cc, stream); + return; + } + + calcDistanceDispatcher(query, train, SingleMask(mask), allDist, stream); + } + else + { + if (k == 2) + { + knnMatch2Dispatcher(query, train, WithOutMask(), trainIdx, distance, cc, stream); + return; + } + + calcDistanceDispatcher(query, train, WithOutMask(), allDist, stream); + } + + findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream); + } + + /////////////////////////////////////////////////////////////////////////////// + // knn match caller + + template void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, 
const DevMem2D& allDist, + int cc, cudaStream_t stream) + { + knnMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, mask, trainIdx, distance, allDist, cc, stream); + } + + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + //template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + + template void knnMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, + int cc, cudaStream_t stream) + { + knnMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, mask, trainIdx, distance, allDist, cc, stream); + } + + //template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + //template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + //template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + //template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + //template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + + template void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, + int cc, cudaStream_t stream) + { + knnMatchDispatcher(static_cast< DevMem2D_ >(query), 
static_cast< DevMem2D_ >(train), k, mask, trainIdx, distance, allDist, cc, stream); + } + + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + //template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + //template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); +}}} diff --git a/modules/gpu/src/cuda/bf_match.cu b/modules/gpu/src/cuda/bf_match.cu new file mode 100644 index 0000000000..83f48b4d3c --- /dev/null +++ b/modules/gpu/src/cuda/bf_match.cu @@ -0,0 +1,403 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed.
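
The matchDispatcher further down in this file (like its knnMatch2 and radiusMatch counterparts) selects a kernel instantiation from the query descriptor width and the device compute capability, falling back to an uncached kernel when no cached variant fits. A condensed sketch of that selection logic, as a hypothetical stand-alone helper (cc is 10 * major + minor, so 12 means sm_12):

enum KernelKind { CachedSmall, CachedLarge, Simple };

// Descriptors under 128 elements use the shared-memory-cached kernel on any
// device; 128..256 elements need compute capability >= 1.2; everything
// larger falls back to the generic, uncached kernel.
static KernelKind pickKernel(int descCols, int cc)
{
    if (descCols < 128)
        return CachedSmall;
    if (descCols <= 256 && cc >= 12)
        return CachedLarge;
    return Simple;
}
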
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "internal_shared.hpp" +#include "opencv2/gpu/device/limits.hpp" +#include "opencv2/gpu/device/vec_distance.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace cv { namespace gpu { namespace bfmatcher +{ + template + __device__ void findBestMatch(T& myDist, int2& myIdx, T* smin, int2* sIdx) + { + if (threadIdx.x == 0) + { + smin[threadIdx.y] = myDist; + sIdx[threadIdx.y] = myIdx; + } + __syncthreads(); + + reducePredVal(smin, myDist, sIdx, myIdx, threadIdx.y * blockDim.x + threadIdx.x, less()); + } + + template + __device__ void matchDescs(int queryIdx, int imgIdx, const DevMem2D_& train, const Mask& m, const VecDiff& vecDiff, + typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) + { + for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y) + { + if (m(queryIdx, trainIdx)) + { + const T* trainDescs = train.ptr(trainIdx); + + Dist dist; + + vecDiff.calc(trainDescs, train.cols, dist, sdiff_row, threadIdx.x); + + const typename Dist::result_type res = dist; + + if (res < myDist) + { + myDist = res; + myIdx.x = trainIdx; + myIdx.y = imgIdx; + } + } + } + } + + template struct SingleTrain + { + explicit SingleTrain(const DevMem2D_& train_) : train(train_) + { + } + + template + __device__ __forceinline__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff, + typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const + { + matchDescs(queryIdx, 0, train, m, vecDiff, myDist, myIdx, sdiff_row); + } + + __device__ __forceinline__ int desc_len() const + { + return train.cols; + } + + static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx, + float myDist, const int2& myIdx, int queryIdx) + { + trainIdx[queryIdx] = myIdx.x; + distance[queryIdx] = myDist; + } + + const DevMem2D_ train; + }; + + template struct TrainCollection + { + TrainCollection(const DevMem2D_* trainCollection_, int nImg_, int desclen_) : + trainCollection(trainCollection_), nImg(nImg_), desclen(desclen_) + { + } + + template + __device__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff, + typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const + { + for (int imgIdx = 0; imgIdx < nImg; ++imgIdx) + { + const DevMem2D_ train = trainCollection[imgIdx]; + m.next(); + matchDescs(queryIdx, imgIdx, train, m, vecDiff, myDist, myIdx, sdiff_row); + } + } + + __device__ __forceinline__ int desc_len() const + { + return desclen; + } + + static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx, + float myDist, const int2& myIdx, int queryIdx) + { + trainIdx[queryIdx] = myIdx.x; + imgIdx[queryIdx] = myIdx.y; + distance[queryIdx] = myDist; + } + + const DevMem2D_* trainCollection; + const int nImg; + const int desclen; + }; + + template + __device__ void distanceCalcLoop(const PtrStep_& query, const Train& train, const Mask& mask, int queryIdx, + 
typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* smem) + { + const VecDiff vecDiff(query.ptr(queryIdx), train.desc_len(), (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x); + + typename Dist::result_type* sdiff_row = smem + blockDim.x * threadIdx.y; + + Mask m = mask; + + myIdx.x = -1; + myIdx.y = -1; + myDist = numeric_limits::max(); + + train.template loop(queryIdx, m, vecDiff, myDist, myIdx, sdiff_row); + } + + template + __global__ void match(const PtrStep_ query, const Train train, const Mask mask, int* trainIdx, int* imgIdx, float* distance) + { + __shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; + + const int queryIdx = blockIdx.x; + + int2 myIdx; + typename Dist::result_type myDist; + + distanceCalcLoop(query, train, mask, queryIdx, myDist, myIdx, smem); + __syncthreads(); + + typename Dist::result_type* smin = smem; + int2* sIdx = (int2*)(smin + BLOCK_DIM_Y); + + findBestMatch(myDist, myIdx, smin, sIdx); + + if (threadIdx.x == 0 && threadIdx.y == 0) + Train::storeResult(distance, trainIdx, imgIdx, myDist, myIdx, queryIdx); + } + + /////////////////////////////////////////////////////////////////////////////// + // Match kernel caller + + template + void matchSimple_caller(const DevMem2D_& query, const Train& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + cudaStream_t stream) + { + StaticAssert::check(); // blockDimY values must reduce by warp + + const dim3 grid(query.rows, 1, 1); + const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + + match, Dist, T> + <<>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + template + void matchCached_caller(const DevMem2D_& query, const Train& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + cudaStream_t stream) + { + StaticAssert::check(); // blockDimY values must reduce by warp + StaticAssert= MAX_LEN>::check(); // block size must be greater than descriptor length + StaticAssert::check(); // max descriptor length must be divisible by blockDimX + + const dim3 grid(query.rows, 1, 1); + const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + + match, Dist, T> + <<>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + /////////////////////////////////////////////////////////////////////////////// + // Match Dispatcher + + template + void matchDispatcher(const DevMem2D_& query, const Train& train, const Mask& mask, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) + { + if (query.cols < 64) + { + matchCached_caller<16, 16, 64, false, Dist>( + query, train, mask, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), + stream); + } + else if (query.cols == 64) + { + matchCached_caller<16, 16, 64, true, Dist>( + query, train, mask, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), + stream); + } + else if (query.cols < 128) + { + matchCached_caller<16, 16, 128, false, Dist>( + query, train, mask, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), + stream); + } + else if (query.cols == 128 && cc >= 12) + { + matchCached_caller<16, 16, 128, true, Dist>( + query, train, mask, + static_cast(trainIdx),
static_cast(imgIdx), static_cast(distance), + stream); + } + else if (query.cols < 256 && cc >= 12) + { + matchCached_caller<16, 16, 256, false, Dist>( + query, train, mask, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), + stream); + } + else if (query.cols == 256 && cc >= 12) + { + matchCached_caller<16, 16, 256, true, Dist>( + query, train, mask, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), + stream); + } + else + { + matchSimple_caller<16, 16, Dist>( + query, train, mask, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), + stream); + } + } + + /////////////////////////////////////////////////////////////////////////////// + // Match caller + + template void matchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) + { + SingleTrain train(static_cast< DevMem2D_ >(train_)); + if (mask.data) + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream); + else + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream); + } + + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + + template void matchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) + { + SingleTrain train(static_cast< DevMem2D_ >(train_)); + if (mask.data) + matchDispatcher(static_cast< DevMem2D_ >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream); + else + matchDispatcher(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream); + } + + //template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void 
matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + + template void matchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) + { + SingleTrain train(static_cast< DevMem2D_ >(train_)); + if (mask.data) + matchDispatcher(static_cast< DevMem2D_ >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream); + else + matchDispatcher(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream); + } + + template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + + template void matchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) + { + TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, query.cols); + if (maskCollection.data) + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream); + else + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + } + + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, 
const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + + template void matchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) + { + TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, query.cols); + if (maskCollection.data) + matchDispatcher(static_cast< DevMem2D_ >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream); + else + matchDispatcher(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + } + + //template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + + template void matchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) + { + TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, query.cols); + if (maskCollection.data) + matchDispatcher(static_cast< DevMem2D_ >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream); + else + matchDispatcher(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + } + + template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + 
//template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); +}}} diff --git a/modules/gpu/src/cuda/bf_radius_match.cu b/modules/gpu/src/cuda/bf_radius_match.cu new file mode 100644 index 0000000000..5dc8fabd9d --- /dev/null +++ b/modules/gpu/src/cuda/bf_radius_match.cu @@ -0,0 +1,202 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage.
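
The radiusMatch kernel in the file that follows returns a variable number of matches per query: each thread whose distance falls under maxDistance reserves an output slot with atomicInc and silently drops the match once the row is full, while the counter still records the true match count. A stripped-down CUDA sketch of that append pattern (a hypothetical kernel; a single output row and its counter are shown for brevity):

__global__ void appendIfClose(const float* dist, int n, float maxDistance,
                              int* outIdx, unsigned int* count, int maxCount)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && dist[i] < maxDistance)
    {
        // Reserve a slot; the counter grows past maxCount, so the caller can
        // tell how many matches would have fit.
        const unsigned int slot = atomicInc(count, (unsigned int)-1);
        if (slot < (unsigned int)maxCount)
            outIdx[slot] = i;
    }
}
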
+// +//M*/ + +#include "internal_shared.hpp" +#include "opencv2/gpu/device/limits.hpp" +#include "opencv2/gpu/device/vec_distance.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace cv { namespace gpu { namespace bfmatcher +{ + template + __global__ void radiusMatch(const PtrStep_ query, const DevMem2D_ train, float maxDistance, const Mask mask, + DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance) + { + #if __CUDA_ARCH__ >= 110 + + __shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; + + typename Dist::result_type* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y; + + const int queryIdx = blockIdx.x; + const T* queryDescs = query.ptr(queryIdx); + + const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; + + if (trainIdx < train.rows) + { + const T* trainDescs = train.ptr(trainIdx); + + if (mask(queryIdx, trainIdx)) + { + Dist dist; + + calcVecDiffGlobal(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x); + + if (threadIdx.x == 0) + { + if (dist < maxDistance) + { + unsigned int i = atomicInc(nMatches + queryIdx, (unsigned int) -1); + if (i < trainIdx_.cols) + { + distance.ptr(queryIdx)[i] = dist; + trainIdx_.ptr(queryIdx)[i] = trainIdx; + } + } + } + } + } + + #endif + } + + /////////////////////////////////////////////////////////////////////////////// + // Radius Match kernel caller + + template + void radiusMatch_caller(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2D_& nMatches, const DevMem2Df& distance, + cudaStream_t stream) + { + const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1); + + radiusMatch<<>>(query, train, maxDistance, mask, trainIdx, nMatches.data, distance); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + /////////////////////////////////////////////////////////////////////////////// + // Radius Match Dispatcher + + template + void radiusMatchDispatcher(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + cudaStream_t stream) + { + radiusMatch_caller<16, 16, Dist>(query, train, maxDistance, mask, + static_cast(trainIdx), static_cast< const DevMem2D_ >(nMatches), static_cast(distance), + stream); + } + + /////////////////////////////////////////////////////////////////////////////// + // Radius Match caller + + template void radiusMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + cudaStream_t stream) + { + if (mask.data) + { + radiusMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, nMatches, distance, + stream); + } + else + { + radiusMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), + trainIdx, nMatches, distance, + stream); + } + } + + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + //template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const 
DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + + template void radiusMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + cudaStream_t stream) + { + if (mask.data) + { + radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, nMatches, distance, + stream); + } + else + { + radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), + trainIdx, nMatches, distance, + stream); + } + } + + //template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + //template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + //template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + //template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + //template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + + template void radiusMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + cudaStream_t stream) + { + if (mask.data) + { + radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, nMatches, distance, + stream); + } + else + { + radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, 
WithOutMask(), + trainIdx, nMatches, distance, + stream); + } + } + + template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + //template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + //template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); +}}} diff --git a/modules/gpu/src/cuda/bilateral_filter.cu b/modules/gpu/src/cuda/bilateral_filter.cu new file mode 100644 index 0000000000..ae5d801486 --- /dev/null +++ b/modules/gpu/src/cuda/bilateral_filter.cu @@ -0,0 +1,233 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
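
The bilateral_filter kernel below updates the disparity image in two interleaved checkerboard passes: the t argument (0, then 1) selects which pixel parity a pass may write, so neighbouring pixels are never updated concurrently. The index arithmetic in isolation, as a host-side sketch (hypothetical helper):

// Pass t = 0 visits pixels where (x + y) is even, pass t = 1 the odd ones;
// each thread slot covers two adjacent columns, hence the shift by 1.
static int checkerboardX(int threadLinearX, int y, int t)
{
    return (threadLinearX << 1) + ((y + t) & 1);
}
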
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "internal_shared.hpp" +#include "opencv2/gpu/device/limits.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace bf_krnls +{ + __constant__ float* ctable_color; + __constant__ float* ctable_space; + __constant__ size_t ctable_space_step; + + __constant__ int cndisp; + __constant__ int cradius; + + __constant__ short cedge_disc; + __constant__ short cmax_disc; +} + +namespace cv { namespace gpu { namespace bf +{ + void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc) + { + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) ); + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) ); + size_t table_space_step = table_space.step / sizeof(float); + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) ); + + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) ); + + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) ); + cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) ); + } +}}} + +namespace bf_krnls +{ + template + struct DistRgbMax + { + static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) + { + uchar x = abs(a[0] - b[0]); + uchar y = abs(a[1] - b[1]); + uchar z = abs(a[2] - b[2]); + return (max(max(x, y), z)); + } + }; + + template <> + struct DistRgbMax<1> + { + static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) + { + return abs(a[0] - b[0]); + } + }; + + template + __global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w) + { + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1); + + T dp[5]; + + if (y > 0 && y < h - 1 && x > 0 && x < w - 1) + { + dp[0] = *(disp + (y ) * disp_step + x + 0); + dp[1] = *(disp + (y-1) * disp_step + x + 0); + dp[2] = *(disp + (y ) * disp_step + x - 1); + dp[3] = *(disp + (y+1) * disp_step + x + 0); + dp[4] = *(disp + (y ) * disp_step + x + 1); + + if(abs(dp[1] - dp[0]) >= cedge_disc || abs(dp[2] - dp[0]) >= cedge_disc || abs(dp[3] - dp[0]) >= cedge_disc || abs(dp[4] - dp[0]) >= cedge_disc) + { + const int ymin = max(0, y - cradius); + const int xmin = max(0, x - cradius); + const int ymax = min(h - 1, y + cradius); + const int xmax = min(w - 1, x + cradius); + + float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + + const uchar* ic = img + y * img_step + channels * x; + + for(int yi = ymin; yi <= ymax; yi++) + { + const T* disp_y = disp + yi * disp_step; + + for(int xi = xmin; xi <= xmax; xi++) + { + const uchar* in = img + yi * img_step + channels * xi; + + uchar dist_rgb = DistRgbMax::calc(in, ic); + + const float weight = 
ctable_color[dist_rgb] * (ctable_space + abs(y-yi)* ctable_space_step)[abs(x-xi)]; + + const T disp_reg = disp_y[xi]; + + cost[0] += min(cmax_disc, abs(disp_reg - dp[0])) * weight; + cost[1] += min(cmax_disc, abs(disp_reg - dp[1])) * weight; + cost[2] += min(cmax_disc, abs(disp_reg - dp[2])) * weight; + cost[3] += min(cmax_disc, abs(disp_reg - dp[3])) * weight; + cost[4] += min(cmax_disc, abs(disp_reg - dp[4])) * weight; + } + } + + float minimum = numeric_limits::max(); + int id = 0; + + if (cost[0] < minimum) + { + minimum = cost[0]; + id = 0; + } + if (cost[1] < minimum) + { + minimum = cost[1]; + id = 1; + } + if (cost[2] < minimum) + { + minimum = cost[2]; + id = 2; + } + if (cost[3] < minimum) + { + minimum = cost[3]; + id = 3; + } + if (cost[4] < minimum) + { + minimum = cost[4]; + id = 4; + } + + *(disp + y * disp_step + x) = dp[id]; + } + } + } +} + +namespace cv { namespace gpu { namespace bf +{ + template + void bilateral_filter_caller(const DevMem2D_& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) + { + dim3 threads(32, 8, 1); + dim3 grid(1, 1, 1); + grid.x = divUp(disp.cols, threads.x << 1); + grid.y = divUp(disp.rows, threads.y); + + switch (channels) + { + case 1: + for (int i = 0; i < iters; ++i) + { + bf_krnls::bilateral_filter<1><<>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + cudaSafeCall( cudaGetLastError() ); + bf_krnls::bilateral_filter<1><<>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + cudaSafeCall( cudaGetLastError() ); + } + break; + case 3: + for (int i = 0; i < iters; ++i) + { + bf_krnls::bilateral_filter<3><<>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + cudaSafeCall( cudaGetLastError() ); + bf_krnls::bilateral_filter<3><<>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + cudaSafeCall( cudaGetLastError() ); + } + break; + default: + cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); + } + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + void bilateral_filter_gpu(const DevMem2D& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) + { + bilateral_filter_caller(disp, img, channels, iters, stream); + } + + void bilateral_filter_gpu(const DevMem2D_& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) + { + bilateral_filter_caller(disp, img, channels, iters, stream); + } +}}} diff --git a/modules/gpu/src/cuda/brute_force_matcher.cu b/modules/gpu/src/cuda/brute_force_matcher.cu deleted file mode 100644 index f5afda75b5..0000000000 --- a/modules/gpu/src/cuda/brute_force_matcher.cu +++ /dev/null @@ -1,980 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners.
-// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#include "internal_shared.hpp" -#include "opencv2/gpu/device/limits.hpp" -#include "opencv2/gpu/device/vec_distance.hpp" - -using namespace cv::gpu; -using namespace cv::gpu::device; - -namespace cv { namespace gpu { namespace bfmatcher -{ - -/////////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////// Match ////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////// - - template - __device__ void findBestMatch(T& myDist, int2& myIdx, T* smin, int2* sIdx) - { - if (threadIdx.x == 0) - { - smin[threadIdx.y] = myDist; - sIdx[threadIdx.y] = myIdx; - } - __syncthreads(); - - reducePredVal(smin, myDist, sIdx, myIdx, threadIdx.y * blockDim.x + threadIdx.x, less()); - } - - template - __device__ void matchDescs(int queryIdx, int imgIdx, const DevMem2D_& train, const Mask& m, const VecDiff& vecDiff, - typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) - { - for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y) - { - if (m(queryIdx, trainIdx)) - { - const T* trainDescs = train.ptr(trainIdx); - - Dist dist; - - vecDiff.calc(trainDescs, train.cols, dist, sdiff_row, threadIdx.x); - - const typename Dist::result_type res = dist; - - if (res < myDist) - { - myDist = res; - myIdx.x = trainIdx; - myIdx.y = imgIdx; - } - } - } - } - - template struct SingleTrain - { - explicit SingleTrain(const DevMem2D_& train_) : train(train_) - { - } - - template - __device__ __forceinline__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff, - typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const - { - matchDescs(queryIdx, 0, train, m, vecDiff, myDist, myIdx, sdiff_row); - } - - __device__ __forceinline__ int desc_len() const - { - return train.cols; - } - - static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx, - float myDist, const int2& myIdx, int queryIdx) - { - trainIdx[queryIdx] 
= myIdx.x; - distance[queryIdx] = myDist; - } - - const DevMem2D_ train; - }; - - template struct TrainCollection - { - TrainCollection(const DevMem2D_* trainCollection_, int nImg_, int desclen_) : - trainCollection(trainCollection_), nImg(nImg_), desclen(desclen_) - { - } - - template - __device__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff, - typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const - { - for (int imgIdx = 0; imgIdx < nImg; ++imgIdx) - { - const DevMem2D_ train = trainCollection[imgIdx]; - m.next(); - matchDescs(queryIdx, imgIdx, train, m, vecDiff, myDist, myIdx, sdiff_row); - } - } - - __device__ __forceinline__ int desc_len() const - { - return desclen; - } - - static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx, - float myDist, const int2& myIdx, int queryIdx) - { - trainIdx[queryIdx] = myIdx.x; - imgIdx[queryIdx] = myIdx.y; - distance[queryIdx] = myDist; - } - - const DevMem2D_* trainCollection; - const int nImg; - const int desclen; - }; - - template - __device__ void distanceCalcLoop(const PtrStep_& query, const Train& train, const Mask& mask, int queryIdx, - typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* smem) - { - const VecDiff vecDiff(query.ptr(queryIdx), train.desc_len(), (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x); - - typename Dist::result_type* sdiff_row = smem + blockDim.x * threadIdx.y; - - Mask m = mask; - - myIdx.x = -1; - myIdx.y = -1; - myDist = numeric_limits::max(); - - train.template loop(queryIdx, m, vecDiff, myDist, myIdx, sdiff_row); - } - - template - __global__ void match(const PtrStep_ query, const Train train, const Mask mask, int* trainIdx, int* imgIdx, float* distance) - { - __shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; - - const int queryIdx = blockIdx.x; - - int2 myIdx; - typename Dist::result_type myDist; - - distanceCalcLoop(query, train, mask, queryIdx, myDist, myIdx, smem); - __syncthreads(); - - typename Dist::result_type* smin = smem; - int2* sIdx = (int2*)(smin + BLOCK_DIM_Y); - - findBestMatch(myDist, myIdx, smin, sIdx); - - if (threadIdx.x == 0 && threadIdx.y == 0) - Train::storeResult(distance, trainIdx, imgIdx, myDist, myIdx, queryIdx); - } - - /////////////////////////////////////////////////////////////////////////////// - // Match kernel caller - - template - void matchSimple_caller(const DevMem2D_& query, const Train& train, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - cudaStream_t stream) - { - StaticAssert::check(); // blockDimY vals must reduce by warp - - const dim3 grid(query.rows, 1, 1); - const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); - - match, Dist, T> - <<>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - template - void matchCached_caller(const DevMem2D_& query, const Train& train, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - cudaStream_t stream) - { - StaticAssert::check(); // blockDimY vals must reduce by warp - StaticAssert= MAX_LEN>::check(); // block size must be greater than descriptors length - StaticAssert::check(); // max descriptors length must be divisible by blockDimX - - const dim3 grid(query.rows, 1, 1); - const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); - - match, Dist, T> - 
<<>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - /////////////////////////////////////////////////////////////////////////////// - // Match Dispatcher - - template - void matchDispatcher(const DevMem2D_& query, const Train& train, const Mask& mask, - const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, - int cc, cudaStream_t stream) - { - if (query.cols < 64) - { - matchCached_caller<16, 16, 64, false, Dist>( - query, train, mask, - static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), - stream); - } - else if (query.cols == 64) - { - matchCached_caller<16, 16, 64, true, Dist>( - query, train, mask, - static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), - stream); - } - else if (query.cols < 128) - { - matchCached_caller<16, 16, 128, false, Dist>( - query, train, mask, - static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), - stream); - } - else if (query.cols == 128 && cc >= 12) - { - matchCached_caller<16, 16, 128, true, Dist>( - query, train, mask, - static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), - stream); - } - else if (query.cols < 256 && cc >= 12) - { - matchCached_caller<16, 16, 256, false, Dist>( - query, train, mask, - static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), - stream); - } - else if (query.cols == 256 && cc >= 12) - { - matchCached_caller<16, 16, 256, true, Dist>( - query, train, mask, - static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), - stream); - } - else - { - matchSimple_caller<16, 16, Dist>( - query, train, mask, - static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), - stream); - } - } - - /////////////////////////////////////////////////////////////////////////////// - // Match caller - - template void matchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, - int cc, cudaStream_t stream) - { - SingleTrain train(static_cast< DevMem2D_ >(train_)); - if (mask.data) - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream); - else - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream); - } - - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const 
DevMem2D& distance, int cc, cudaStream_t stream); - - template void matchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, - int cc, cudaStream_t stream) - { - SingleTrain train(static_cast< DevMem2D_ >(train_)); - if (mask.data) - matchDispatcher(static_cast< DevMem2D_ >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream); - else - matchDispatcher(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream); - } - - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - - template void matchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, - int cc, cudaStream_t stream) - { - SingleTrain train(static_cast< DevMem2D_ >(train_)); - if (mask.data) - matchDispatcher(static_cast< DevMem2D_ >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream); - else - matchDispatcher(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream); - } - - template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - - template void matchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, - const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, - int cc, cudaStream_t stream) - { - TrainCollection train((DevMem2D_*)trainCollection.ptr(), 
trainCollection.cols, query.cols); - if (maskCollection.data) - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream); - else - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); - } - - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - - template void matchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, - const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, - int cc, cudaStream_t stream) - { - TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, query.cols); - if (maskCollection.data) - matchDispatcher(static_cast< DevMem2D_ >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream); - else - matchDispatcher(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); - } - - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& 
distance, int cc, cudaStream_t stream); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - - template void matchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, - const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, - int cc, cudaStream_t stream) - { - TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, query.cols); - if (maskCollection.data) - matchDispatcher(static_cast< DevMem2D_ >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream); - else - matchDispatcher(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); - } - - template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - -/////////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////// Knn Match //////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////// - - template - __device__ void distanceCalcLoop(const PtrStep_& query, const DevMem2D_& train, const Mask& m, int queryIdx, - typename Dist::result_type& distMin1, typename Dist::result_type& distMin2, int& bestTrainIdx1, int& bestTrainIdx2, - typename Dist::result_type* smem) - { - const VecDiff vecDiff(query.ptr(queryIdx), train.cols, (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x); - - typename Dist::result_type* sdiffRow = smem + blockDim.x * threadIdx.y; - - distMin1 = numeric_limits::max(); - distMin2 = numeric_limits::max(); - - bestTrainIdx1 = -1; - bestTrainIdx2 = -1; - - for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y) - { - if (m(queryIdx, trainIdx)) - { - Dist dist; - - const T* trainRow = train.ptr(trainIdx); - - vecDiff.calc(trainRow, train.cols, dist, sdiffRow, threadIdx.x); - - const typename Dist::result_type val = dist; - - if (val < distMin1) - { - distMin1 = val; - bestTrainIdx1 = trainIdx; - } - else if (val < distMin2) - { - distMin2 = val; - bestTrainIdx2 = trainIdx; - } - } - } - } - - template - __global__ void knnMatch2(const PtrStep_ query, const DevMem2D_ 
train, const Mask m, int2* trainIdx, float2* distance) - { - typedef typename Dist::result_type result_type; - typedef typename Dist::value_type value_type; - - __shared__ result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; - - const int queryIdx = blockIdx.x; - - result_type distMin1; - result_type distMin2; - - int bestTrainIdx1; - int bestTrainIdx2; - - distanceCalcLoop(query, train, m, queryIdx, distMin1, distMin2, bestTrainIdx1, bestTrainIdx2, smem); - __syncthreads(); - - volatile result_type* sdistMinRow = smem; - volatile int* sbestTrainIdxRow = (int*)(sdistMinRow + 2 * BLOCK_DIM_Y); - - if (threadIdx.x == 0) - { - sdistMinRow[threadIdx.y] = distMin1; - sdistMinRow[threadIdx.y + BLOCK_DIM_Y] = distMin2; - - sbestTrainIdxRow[threadIdx.y] = bestTrainIdx1; - sbestTrainIdxRow[threadIdx.y + BLOCK_DIM_Y] = bestTrainIdx2; - } - __syncthreads(); - - if (threadIdx.x == 0 && threadIdx.y == 0) - { - distMin1 = numeric_limits::max(); - distMin2 = numeric_limits::max(); - - bestTrainIdx1 = -1; - bestTrainIdx2 = -1; - - #pragma unroll - for (int i = 0; i < BLOCK_DIM_Y; ++i) - { - result_type val = sdistMinRow[i]; - - if (val < distMin1) - { - distMin1 = val; - bestTrainIdx1 = sbestTrainIdxRow[i]; - } - else if (val < distMin2) - { - distMin2 = val; - bestTrainIdx2 = sbestTrainIdxRow[i]; - } - } - - #pragma unroll - for (int i = BLOCK_DIM_Y; i < 2 * BLOCK_DIM_Y; ++i) - { - result_type val = sdistMinRow[i]; - - if (val < distMin2) - { - distMin2 = val; - bestTrainIdx2 = sbestTrainIdxRow[i]; - } - } - - trainIdx[queryIdx] = make_int2(bestTrainIdx1, bestTrainIdx2); - distance[queryIdx] = make_float2(distMin1, distMin2); - } - } - - /////////////////////////////////////////////////////////////////////////////// - // Knn 2 Match kernel caller - - template - void knnMatch2Simple_caller(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& distance, - cudaStream_t stream) - { - const dim3 grid(query.rows, 1, 1); - const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); - - knnMatch2, Dist, T> - <<>>(query, train, mask, trainIdx, distance); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - template - void knnMatch2Cached_caller(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& distance, - cudaStream_t stream) - { - StaticAssert= MAX_LEN>::check(); // block size must be greater than descriptors length - StaticAssert::check(); // max descriptors length must be divisible by blockDimX - - const dim3 grid(query.rows, 1, 1); - const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); - - knnMatch2, Dist, T> - <<>>(query, train, mask, trainIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - /////////////////////////////////////////////////////////////////////////////// - // Knn 2 Match Dispatcher - - template - void knnMatch2Dispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, - int cc, cudaStream_t stream) - { - if (query.cols < 64) - { - knnMatch2Cached_caller<16, 16, 64, false, Dist>( - query, train, mask, - static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), - stream); - } - else if (query.cols == 64) - { - knnMatch2Cached_caller<16, 16, 64, true, Dist>( - query, train, mask, - static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), - stream); - } - else if 
(query.cols < 128) - { - knnMatch2Cached_caller<16, 16, 128, false, Dist>( - query, train, mask, - static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), - stream); - } - else if (query.cols == 128 && cc >= 12) - { - knnMatch2Cached_caller<16, 16, 128, true, Dist>( - query, train, mask, - static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), - stream); - } - else if (query.cols < 256 && cc >= 12) - { - knnMatch2Cached_caller<16, 16, 256, false, Dist>( - query, train, mask, - static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), - stream); - } - else if (query.cols == 256 && cc >= 12) - { - knnMatch2Cached_caller<16, 16, 256, true, Dist>( - query, train, mask, - static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), - stream); - } - else - { - knnMatch2Simple_caller<16, 16, Dist>( - query, train, mask, - static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), - stream); - } - } - - /////////////////////////////////////////////////////////////////////////////// - // Calc distance kernel - - template - __global__ void calcDistance(const PtrStep_ query, const DevMem2D_ train, const Mask mask, PtrStepf distance) - { - __shared__ typename Dist::result_type sdiff[BLOCK_DIM_X * BLOCK_DIM_Y]; - - typename Dist::result_type* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y; - - const int queryIdx = blockIdx.x; - const T* queryDescs = query.ptr(queryIdx); - - const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; - - if (trainIdx < train.rows) - { - const T* trainDescs = train.ptr(trainIdx); - - typename Dist::result_type myDist = numeric_limits::max(); - - if (mask(queryIdx, trainIdx)) - { - Dist dist; - - calcVecDiffGlobal(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x); - - myDist = dist; - } - - if (threadIdx.x == 0) - distance.ptr(queryIdx)[trainIdx] = myDist; - } - } - - /////////////////////////////////////////////////////////////////////////////// - // Calc distance kernel caller - - template - void calcDistance_caller(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2Df& distance, cudaStream_t stream) - { - const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); - const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1); - - calcDistance<<>>(query, train, mask, distance); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - template - void calcDistanceDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2D& allDist, cudaStream_t stream) - { - calcDistance_caller<16, 16, Dist>(query, train, mask, static_cast(allDist), stream); - } - - /////////////////////////////////////////////////////////////////////////////// - // find knn match kernel - - template __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_) - { - const int SMEM_SIZE = BLOCK_SIZE > 64 ? 
BLOCK_SIZE : 64; - __shared__ float sdist[SMEM_SIZE]; - __shared__ int strainIdx[SMEM_SIZE]; - - const int queryIdx = blockIdx.x; - - float* allDist = allDist_.ptr(queryIdx); - int* trainIdx = trainIdx_.ptr(queryIdx); - float* distance = distance_.ptr(queryIdx); - - float dist = numeric_limits::max(); - int bestIdx = -1; - - for (int i = threadIdx.x; i < allDist_.cols; i += BLOCK_SIZE) - { - float reg = allDist[i]; - if (reg < dist) - { - dist = reg; - bestIdx = i; - } - } - - sdist[threadIdx.x] = dist; - strainIdx[threadIdx.x] = bestIdx; - __syncthreads(); - - reducePredVal(sdist, dist, strainIdx, bestIdx, threadIdx.x, less()); - - if (threadIdx.x == 0) - { - if (dist < numeric_limits::max()) - { - allDist[bestIdx] = numeric_limits::max(); - trainIdx[i] = bestIdx; - distance[i] = dist; - } - } - } - - /////////////////////////////////////////////////////////////////////////////// - // find knn match kernel caller - - template void findKnnMatch_caller(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) - { - const dim3 threads(BLOCK_SIZE, 1, 1); - const dim3 grid(trainIdx.rows, 1, 1); - - for (int i = 0; i < k; ++i) - { - findBestMatch<<>>(allDist, i, trainIdx, distance); - cudaSafeCall( cudaGetLastError() ); - } - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - void findKnnMatchDispatcher(int k, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, cudaStream_t stream) - { - findKnnMatch_caller<256>(k, static_cast(trainIdx), static_cast(distance), static_cast(allDist), stream); - } - - /////////////////////////////////////////////////////////////////////////////// - // knn match Dispatcher - - template - void knnMatchDispatcher(const DevMem2D_& query, const DevMem2D_& train, int k, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, - int cc, cudaStream_t stream) - { - if (mask.data) - { - if (k == 2) - { - knnMatch2Dispatcher(query, train, SingleMask(mask), trainIdx, distance, cc, stream); - return; - } - - calcDistanceDispatcher(query, train, SingleMask(mask), allDist, stream); - } - else - { - if (k == 2) - { - knnMatch2Dispatcher(query, train, WithOutMask(), trainIdx, distance, cc, stream); - return; - } - - calcDistanceDispatcher(query, train, WithOutMask(), allDist, stream); - } - - findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream); - } - - /////////////////////////////////////////////////////////////////////////////// - // knn match caller - - template void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, - int cc, cudaStream_t stream) - { - knnMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, mask, trainIdx, distance, allDist, cc, stream); - } - - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const 
DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - - template void knnMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, - int cc, cudaStream_t stream) - { - knnMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, mask, trainIdx, distance, allDist, cc, stream); - } - - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - - template void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, - int cc, cudaStream_t stream) - { - knnMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, mask, trainIdx, distance, allDist, cc, stream); - } - - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const 
DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - -/////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////// Radius Match ////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////// - - template - __global__ void radiusMatch(const PtrStep_ query, const DevMem2D_ train, float maxDistance, const Mask mask, - DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance) - { - #if __CUDA_ARCH__ >= 110 - - __shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; - - typename Dist::result_type* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y; - - const int queryIdx = blockIdx.x; - const T* queryDescs = query.ptr(queryIdx); - - const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; - - if (trainIdx < train.rows) - { - const T* trainDescs = train.ptr(trainIdx); - - if (mask(queryIdx, trainIdx)) - { - Dist dist; - - calcVecDiffGlobal(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x); - - if (threadIdx.x == 0) - { - if (dist < maxDistance) - { - unsigned int i = atomicInc(nMatches + queryIdx, (unsigned int) -1); - if (i < trainIdx_.cols) - { - distance.ptr(queryIdx)[i] = dist; - trainIdx_.ptr(queryIdx)[i] = trainIdx; - } - } - } - } - } - - #endif - } - - /////////////////////////////////////////////////////////////////////////////// - // Radius Match kernel caller - - template - void radiusMatch_caller(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2D_& nMatches, const DevMem2Df& distance, - cudaStream_t stream) - { - const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); - const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1); - - radiusMatch<<>>(query, train, maxDistance, mask, trainIdx, nMatches.data, distance); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - /////////////////////////////////////////////////////////////////////////////// - // Radius Match Dispatcher - - template - void radiusMatchDispatcher(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, - const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, - cudaStream_t stream) - { - radiusMatch_caller<16, 16, Dist>(query, train, maxDistance, mask, - static_cast(trainIdx), static_cast< const DevMem2D_ >(nMatches), static_cast(distance), - stream); - } - - /////////////////////////////////////////////////////////////////////////////// - // Radius Match caller - - template void radiusMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, - cudaStream_t stream) - { - if (mask.data) - { - radiusMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, nMatches, distance, - stream); - } - else - { - radiusMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, nMatches, distance, - 
stream); - } - } - - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - - template void radiusMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, - cudaStream_t stream) - { - if (mask.data) - { - radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, nMatches, distance, - stream); - } - else - { - radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, nMatches, distance, - stream); - } - } - - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - - template void radiusMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, - const DevMem2D& trainIdx, const 
DevMem2D& nMatches, const DevMem2D& distance, - cudaStream_t stream) - { - if (mask.data) - { - radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, nMatches, distance, - stream); - } - else - { - radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, nMatches, distance, - stream); - } - } - - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); -}}} diff --git a/modules/gpu/src/cuda/column_filter.cu b/modules/gpu/src/cuda/column_filter.cu new file mode 100644 index 0000000000..3c32769c52 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.cu @@ -0,0 +1,240 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
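The new column filter below drives its CUDA kernels through a two-dimensional dispatch table indexed by border type and kernel size, so the tap count becomes a compile-time template parameter and the convolution loop can be unrolled. A reduced, host-only sketch of the same table-of-instantiations pattern (all names here are illustrative, not from the patch):

    #include <cstdio>

    // One instantiation per supported kernel size; KSIZE is a compile-time
    // constant, so the compiler can fully unroll the tap loop.
    template <int KSIZE>
    void applyKernel(const float* taps)
    {
        float sum = 0.f;
        for (int i = 0; i < KSIZE; ++i)
            sum += taps[i];
        std::printf("ksize=%d, tap sum=%f\n", KSIZE, sum);
    }

    typedef void (*caller_t)(const float*);

    int main()
    {
        // Index 0 is unused, mirroring the 0 entries in the tables below.
        static const caller_t callers[] =
            { 0, applyKernel<1>, applyKernel<2>, applyKernel<3>, applyKernel<4>, applyKernel<5> };

        const float gauss3[3] = { 0.25f, 0.5f, 0.25f };
        callers[3](gauss3); // a runtime ksize picks the compile-time instantiation
        return 0;
    }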
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "internal_shared.hpp" +#include "opencv2/gpu/device/saturate_cast.hpp" +#include "opencv2/gpu/device/vec_math.hpp" +#include "opencv2/gpu/device/limits.hpp" +#include "opencv2/gpu/device/border_interpolate.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +#define MAX_KERNEL_SIZE 16 +#define BLOCK_DIM_X 16 +#define BLOCK_DIM_Y 16 + +namespace filter_krnls_column +{ + __constant__ float cLinearKernel[MAX_KERNEL_SIZE]; + + void loadLinearKernel(const float kernel[], int ksize) + { + cudaSafeCall( cudaMemcpyToSymbol(cLinearKernel, kernel, ksize * sizeof(float)) ); + } + + template + __global__ void linearColumnFilter(const DevMem2D_ src, PtrStep_ dst, int anchor, const B b) + { + __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3]; + + const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x; + const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y; + + T* sDataColumn = smem + threadIdx.x; + + if (x < src.cols) + { + const T* srcCol = src.ptr() + x; + + sDataColumn[ threadIdx.y * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol, src.step); + sDataColumn[(threadIdx.y + BLOCK_DIM_Y) * BLOCK_DIM_X] = b.at_high(y, srcCol, src.step); + sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol, src.step); + + __syncthreads(); + + if (y < src.rows) + { + typedef typename TypeVec::cn>::vec_type sum_t; + sum_t sum = VecTraits::all(0); + + sDataColumn += (threadIdx.y + BLOCK_DIM_Y - anchor) * BLOCK_DIM_X; + + #pragma unroll + for(int i = 0; i < ksize; ++i) + sum = sum + sDataColumn[i * BLOCK_DIM_X] * cLinearKernel[i]; + + dst.ptr(y)[x] = saturate_cast(sum); + } + } + } +} + +namespace cv { namespace gpu { namespace filters +{ + template class B> + void linearColumnFilter_caller(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream) + { + dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); + dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y)); + + B b(src.rows); + + if (!b.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1)) + { + cv::gpu::error("linearColumnFilter: can't use specified border extrapolation, image is too small, " + "try bigger image or another border extrapolation mode", __FILE__, __LINE__); + } + + filter_krnls_column::linearColumnFilter<<>>(src, dst, anchor, b); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + template + void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream) + { + typedef void (*caller_t)(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream); + static const caller_t callers[5][17] = + { + { + 0, + linearColumnFilter_caller<1 , T, D, BrdColReflect101>, + linearColumnFilter_caller<2 , T, D, BrdColReflect101>, + linearColumnFilter_caller<3 , T, D, BrdColReflect101>, + linearColumnFilter_caller<4 , T, D, BrdColReflect101>, + linearColumnFilter_caller<5 , T, D, 
+
+namespace cv { namespace gpu { namespace filters
+{
+    template <int ksize, typename T, typename D, template<typename> class B>
+    void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
+    {
+        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
+        dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
+
+        B<T> b(src.rows);
+
+        if (!b.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1))
+        {
+            cv::gpu::error("linearColumnFilter: can't use specified border extrapolation, image is too small, "
+                           "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
+        }
+
+        filter_krnls_column::linearColumnFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    template <typename T, typename D>
+    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
+    {
+        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
+        static const caller_t callers[5][17] =
+        {
+            {
+                0,
+                linearColumnFilter_caller<1 , T, D, BrdColReflect101>,
+                linearColumnFilter_caller<2 , T, D, BrdColReflect101>,
+                linearColumnFilter_caller<3 , T, D, BrdColReflect101>,
+                linearColumnFilter_caller<4 , T, D, BrdColReflect101>,
+                linearColumnFilter_caller<5 , T, D, BrdColReflect101>,
+                linearColumnFilter_caller<6 , T, D, BrdColReflect101>,
+                linearColumnFilter_caller<7 , T, D, BrdColReflect101>,
+                linearColumnFilter_caller<8 , T, D, BrdColReflect101>,
+                linearColumnFilter_caller<9 , T, D, BrdColReflect101>,
+                linearColumnFilter_caller<10, T, D, BrdColReflect101>,
+                linearColumnFilter_caller<11, T, D, BrdColReflect101>,
+                linearColumnFilter_caller<12, T, D, BrdColReflect101>,
+                linearColumnFilter_caller<13, T, D, BrdColReflect101>,
+                linearColumnFilter_caller<14, T, D, BrdColReflect101>,
+                linearColumnFilter_caller<15, T, D, BrdColReflect101>,
+                linearColumnFilter_caller<16, T, D, BrdColReflect101>
+            },
+            {
+                0,
+                linearColumnFilter_caller<1 , T, D, BrdColReplicate>,
+                linearColumnFilter_caller<2 , T, D, BrdColReplicate>,
+                linearColumnFilter_caller<3 , T, D, BrdColReplicate>,
+                linearColumnFilter_caller<4 , T, D, BrdColReplicate>,
+                linearColumnFilter_caller<5 , T, D, BrdColReplicate>,
+                linearColumnFilter_caller<6 , T, D, BrdColReplicate>,
+                linearColumnFilter_caller<7 , T, D, BrdColReplicate>,
+                linearColumnFilter_caller<8 , T, D, BrdColReplicate>,
+                linearColumnFilter_caller<9 , T, D, BrdColReplicate>,
+                linearColumnFilter_caller<10, T, D, BrdColReplicate>,
+                linearColumnFilter_caller<11, T, D, BrdColReplicate>,
+                linearColumnFilter_caller<12, T, D, BrdColReplicate>,
+                linearColumnFilter_caller<13, T, D, BrdColReplicate>,
+                linearColumnFilter_caller<14, T, D, BrdColReplicate>,
+                linearColumnFilter_caller<15, T, D, BrdColReplicate>,
+                linearColumnFilter_caller<16, T, D, BrdColReplicate>
+            },
+            {
+                0,
+                linearColumnFilter_caller<1 , T, D, BrdColConstant>,
+                linearColumnFilter_caller<2 , T, D, BrdColConstant>,
+                linearColumnFilter_caller<3 , T, D, BrdColConstant>,
+                linearColumnFilter_caller<4 , T, D, BrdColConstant>,
+                linearColumnFilter_caller<5 , T, D, BrdColConstant>,
+                linearColumnFilter_caller<6 , T, D, BrdColConstant>,
+                linearColumnFilter_caller<7 , T, D, BrdColConstant>,
+                linearColumnFilter_caller<8 , T, D, BrdColConstant>,
+                linearColumnFilter_caller<9 , T, D, BrdColConstant>,
+                linearColumnFilter_caller<10, T, D, BrdColConstant>,
+                linearColumnFilter_caller<11, T, D, BrdColConstant>,
+                linearColumnFilter_caller<12, T, D, BrdColConstant>,
+                linearColumnFilter_caller<13, T, D, BrdColConstant>,
+                linearColumnFilter_caller<14, T, D, BrdColConstant>,
+                linearColumnFilter_caller<15, T, D, BrdColConstant>,
+                linearColumnFilter_caller<16, T, D, BrdColConstant>
+            },
+            {
+                0,
+                linearColumnFilter_caller<1 , T, D, BrdColReflect>,
+                linearColumnFilter_caller<2 , T, D, BrdColReflect>,
+                linearColumnFilter_caller<3 , T, D, BrdColReflect>,
+                linearColumnFilter_caller<4 , T, D, BrdColReflect>,
+                linearColumnFilter_caller<5 , T, D, BrdColReflect>,
+                linearColumnFilter_caller<6 , T, D, BrdColReflect>,
+                linearColumnFilter_caller<7 , T, D, BrdColReflect>,
+                linearColumnFilter_caller<8 , T, D, BrdColReflect>,
+                linearColumnFilter_caller<9 , T, D, BrdColReflect>,
+                linearColumnFilter_caller<10, T, D, BrdColReflect>,
+                linearColumnFilter_caller<11, T, D, BrdColReflect>,
+                linearColumnFilter_caller<12, T, D, BrdColReflect>,
+                linearColumnFilter_caller<13, T, D, BrdColReflect>,
+                linearColumnFilter_caller<14, T, D, BrdColReflect>,
+                linearColumnFilter_caller<15, T, D, BrdColReflect>,
+                linearColumnFilter_caller<16, T, D, BrdColReflect>
+            },
+            {
+                0,
+                linearColumnFilter_caller<1 , T, D, BrdColWrap>,
+                linearColumnFilter_caller<2 , T, D, BrdColWrap>,
+                linearColumnFilter_caller<3 , T, D, BrdColWrap>,
+                linearColumnFilter_caller<4 , T, D, BrdColWrap>,
+                linearColumnFilter_caller<5 , T, D, BrdColWrap>,
+                linearColumnFilter_caller<6 , T, D, BrdColWrap>,
+                linearColumnFilter_caller<7 , T, D, BrdColWrap>,
+                linearColumnFilter_caller<8 , T, D, BrdColWrap>,
+                linearColumnFilter_caller<9 , T, D, BrdColWrap>,
+                linearColumnFilter_caller<10, T, D, BrdColWrap>,
+                linearColumnFilter_caller<11, T, D, BrdColWrap>,
+                linearColumnFilter_caller<12, T, D, BrdColWrap>,
+                linearColumnFilter_caller<13, T, D, BrdColWrap>,
+                linearColumnFilter_caller<14, T, D, BrdColWrap>,
+                linearColumnFilter_caller<15, T, D, BrdColWrap>,
+                linearColumnFilter_caller<16, T, D, BrdColWrap>,
+            }
+        };
+
+        filter_krnls_column::loadLinearKernel(kernel, ksize);
+
+        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
+    }
+
+    template void linearColumnFilter_gpu<float , uchar >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+    template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+    //template void linearColumnFilter_gpu<float , short >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+    //template void linearColumnFilter_gpu<float2, short2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+    template void linearColumnFilter_gpu<float3, short3>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+    template void linearColumnFilter_gpu<float , int   >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+    template void linearColumnFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+}}}
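The 5x17 `callers` array above maps a runtime (border mode, kernel size) pair straight to a fully specialized launcher; slot 0 of each row is a null sentinel because kernel sizes start at 1, and the five rows follow the table's own order (reflect-101, replicate, constant, reflect, wrap). A tiny standalone rendering of this dispatch pattern, with hypothetical names invented for the illustration:

    #include <cstdio>

    typedef void (*op_t)(int);                 // one signature for every specialization

    template <int K> void runK(int x) { std::printf("K=%d, x=%d\n", K, x); }

    static const op_t table[2][4] =
    {
        {0, runK<1>, runK<2>, runK<3>},        // e.g. "mode 0"
        {0, runK<1>, runK<2>, runK<3>},        // e.g. "mode 1"
    };

    int main()
    {
        int mode = 1, k = 2;
        table[mode][k](42);                    // runtime values pick a compile-time specialization
        return 0;
    }

The payoff is that `ksize` becomes a compile-time constant inside the kernel, so the `#pragma unroll` loop over the taps can actually unroll.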
diff --git a/modules/gpu/src/cuda/filters.cu b/modules/gpu/src/cuda/filters.cu
deleted file mode 100644
index 41f403a281..0000000000
--- a/modules/gpu/src/cuda/filters.cu
+++ /dev/null
@@ -1,633 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "internal_shared.hpp"
-#include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/vec_math.hpp"
-#include "opencv2/gpu/device/limits.hpp"
-#include "opencv2/gpu/device/border_interpolate.hpp"
-
-using namespace cv::gpu;
-using namespace cv::gpu::device;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Linear filters
-
-#define MAX_KERNEL_SIZE 16
-#define BLOCK_DIM_X 16
-#define BLOCK_DIM_Y 16
-
-namespace filter_krnls
-{
-    __constant__ float cLinearKernel[MAX_KERNEL_SIZE];
-}
-
-namespace cv { namespace gpu { namespace filters
-{
-    void loadLinearKernel(const float kernel[], int ksize)
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(filter_krnls::cLinearKernel, kernel, ksize * sizeof(float)) );
-    }
-}}}
-
-namespace filter_krnls
-{
-    template <typename T, size_t size> struct SmemType_
-    {
-        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;
-    };
-    template <typename T> struct SmemType_<T, 4>
-    {
-        typedef T smem_t;
-    };
-    template <typename T> struct SmemType
-    {
-        typedef typename SmemType_<T, sizeof(T)>::smem_t smem_t;
-    };
-
-    template <int ksize, typename T, typename D, typename B>
-    __global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
-    {
-        typedef typename SmemType<T>::smem_t smem_t;
-
-        __shared__ smem_t smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];
-
-        const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
-        const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;
-
-        smem_t* sDataRow = smem + threadIdx.y * BLOCK_DIM_X * 3;
-
-        if (y < src.rows)
-        {
-            const T* rowSrc = src.ptr(y);
-
-            sDataRow[threadIdx.x                  ] = b.at_low(x - BLOCK_DIM_X, rowSrc);
-            sDataRow[threadIdx.x + BLOCK_DIM_X    ] = b.at_high(x, rowSrc);
-            sDataRow[threadIdx.x + BLOCK_DIM_X * 2] = b.at_high(x + BLOCK_DIM_X, rowSrc);
-
-            __syncthreads();
-
-            if (x < src.cols)
-            {
-                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
-                sum_t sum = VecTraits<sum_t>::all(0);
-
-                sDataRow += threadIdx.x + BLOCK_DIM_X - anchor;
-
-                #pragma unroll
-                for(int i = 0; i < ksize; ++i)
-                    sum = sum + sDataRow[i] * cLinearKernel[i];
-
-                dst.ptr(y)[x] = saturate_cast<D>(sum);
-            }
-        }
-    }
-}
-
-namespace cv { namespace gpu { namespace filters
-{
-    template <int ksize, typename T, typename D, template<typename> class B>
-    void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
-    {
-        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
-        dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
-
-        typedef typename filter_krnls::SmemType<T>::smem_t smem_t;
-        B<smem_t> b(src.cols);
-
-        if (!b.is_range_safe(-BLOCK_DIM_X, (grid.x + 1) * BLOCK_DIM_X - 1))
-        {
-            cv::gpu::error("linearRowFilter: can't use specified border extrapolation, image is too small, "
-                           "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
-        }
-
-        filter_krnls::linearRowFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    template <typename T, typename D>
-    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
-    {
-        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
-        static const caller_t callers[5][17] =
-        {
-            {
-                0,
-                linearRowFilter_caller<1 , T, D, BrdRowReflect101>,
-                linearRowFilter_caller<2 , T, D, BrdRowReflect101>,
-                linearRowFilter_caller<3 , T, D, BrdRowReflect101>,
-                linearRowFilter_caller<4 , T, D, BrdRowReflect101>,
-                linearRowFilter_caller<5 , T, D, BrdRowReflect101>,
-                linearRowFilter_caller<6 , T, D, BrdRowReflect101>,
-                linearRowFilter_caller<7 , T, D, BrdRowReflect101>,
-                linearRowFilter_caller<8 , T, D, BrdRowReflect101>,
-                linearRowFilter_caller<9 , T, D, BrdRowReflect101>,
-                linearRowFilter_caller<10, T, D, BrdRowReflect101>,
-                linearRowFilter_caller<11, T, D, BrdRowReflect101>,
-                linearRowFilter_caller<12, T, D, BrdRowReflect101>,
-                linearRowFilter_caller<13, T, D, BrdRowReflect101>,
-                linearRowFilter_caller<14, T, D, BrdRowReflect101>,
-                linearRowFilter_caller<15, T, D, BrdRowReflect101>,
-                linearRowFilter_caller<16, T, D, BrdRowReflect101>
-            },
-            {
-                0,
-                linearRowFilter_caller<1 , T, D, BrdRowReplicate>,
-                linearRowFilter_caller<2 , T, D, BrdRowReplicate>,
-                linearRowFilter_caller<3 , T, D, BrdRowReplicate>,
-                linearRowFilter_caller<4 , T, D, BrdRowReplicate>,
-                linearRowFilter_caller<5 , T, D, BrdRowReplicate>,
-                linearRowFilter_caller<6 , T, D, BrdRowReplicate>,
-                linearRowFilter_caller<7 , T, D, BrdRowReplicate>,
-                linearRowFilter_caller<8 , T, D, BrdRowReplicate>,
-                linearRowFilter_caller<9 , T, D, BrdRowReplicate>,
-                linearRowFilter_caller<10, T, D, BrdRowReplicate>,
-                linearRowFilter_caller<11, T, D, BrdRowReplicate>,
-                linearRowFilter_caller<12, T, D, BrdRowReplicate>,
-                linearRowFilter_caller<13, T, D, BrdRowReplicate>,
-                linearRowFilter_caller<14, T, D, BrdRowReplicate>,
-                linearRowFilter_caller<15, T, D, BrdRowReplicate>,
-                linearRowFilter_caller<16, T, D, BrdRowReplicate>
-            },
-            {
-                0,
-                linearRowFilter_caller<1 , T, D, BrdRowConstant>,
-                linearRowFilter_caller<2 , T, D, BrdRowConstant>,
-                linearRowFilter_caller<3 , T, D, BrdRowConstant>,
-                linearRowFilter_caller<4 , T, D, BrdRowConstant>,
-                linearRowFilter_caller<5 , T, D, BrdRowConstant>,
-                linearRowFilter_caller<6 , T, D, BrdRowConstant>,
-                linearRowFilter_caller<7 , T, D, BrdRowConstant>,
-                linearRowFilter_caller<8 , T, D, BrdRowConstant>,
-                linearRowFilter_caller<9 , T, D, BrdRowConstant>,
-                linearRowFilter_caller<10, T, D, BrdRowConstant>,
-                linearRowFilter_caller<11, T, D, BrdRowConstant>,
-                linearRowFilter_caller<12, T, D, BrdRowConstant>,
-                linearRowFilter_caller<13, T, D, BrdRowConstant>,
-                linearRowFilter_caller<14, T, D, BrdRowConstant>,
-                linearRowFilter_caller<15, T, D, BrdRowConstant>,
-                linearRowFilter_caller<16, T, D, BrdRowConstant>
-            },
-            {
-                0,
-                linearRowFilter_caller<1 , T, D, BrdRowReflect>,
-                linearRowFilter_caller<2 , T, D, BrdRowReflect>,
-                linearRowFilter_caller<3 , T, D, BrdRowReflect>,
-                linearRowFilter_caller<4 , T, D, BrdRowReflect>,
-                linearRowFilter_caller<5 , T, D, BrdRowReflect>,
-                linearRowFilter_caller<6 , T, D, BrdRowReflect>,
-                linearRowFilter_caller<7 , T, D, BrdRowReflect>,
-                linearRowFilter_caller<8 , T, D, BrdRowReflect>,
-                linearRowFilter_caller<9 , T, D, BrdRowReflect>,
-                linearRowFilter_caller<10, T, D, BrdRowReflect>,
-                linearRowFilter_caller<11, T, D, BrdRowReflect>,
-                linearRowFilter_caller<12, T, D, BrdRowReflect>,
-                linearRowFilter_caller<13, T, D, BrdRowReflect>,
-                linearRowFilter_caller<14, T, D, BrdRowReflect>,
-                linearRowFilter_caller<15, T, D, BrdRowReflect>,
-                linearRowFilter_caller<16, T, D, BrdRowReflect>
-            },
-            {
-                0,
-                linearRowFilter_caller<1 , T, D, BrdRowWrap>,
-                linearRowFilter_caller<2 , T, D, BrdRowWrap>,
-                linearRowFilter_caller<3 , T, D, BrdRowWrap>,
-                linearRowFilter_caller<4 , T, D, BrdRowWrap>,
-                linearRowFilter_caller<5 , T, D, BrdRowWrap>,
-                linearRowFilter_caller<6 , T, D, BrdRowWrap>,
-                linearRowFilter_caller<7 , T, D, BrdRowWrap>,
-                linearRowFilter_caller<8 , T, D, BrdRowWrap>,
-                linearRowFilter_caller<9 , T, D, BrdRowWrap>,
-                linearRowFilter_caller<10, T, D, BrdRowWrap>,
-                linearRowFilter_caller<11, T, D, BrdRowWrap>,
-                linearRowFilter_caller<12, T, D, BrdRowWrap>,
-                linearRowFilter_caller<13, T, D, BrdRowWrap>,
-                linearRowFilter_caller<14, T, D, BrdRowWrap>,
-                linearRowFilter_caller<15, T, D, BrdRowWrap>,
-                linearRowFilter_caller<16, T, D, BrdRowWrap>
-            }
-        };
-
-        loadLinearKernel(kernel, ksize);
-
-        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
-    }
-
-    template void linearRowFilter_gpu<uchar , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearRowFilter_gpu<uchar4, float4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearRowFilter_gpu<short , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearRowFilter_gpu<short2, float2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearRowFilter_gpu<short3, float3>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearRowFilter_gpu<int   , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearRowFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-}}}
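The SmemType_ trait in the deleted row filter above chooses the shared-memory element type: 4-byte source types are staged as-is, while narrower or wider types are promoted to the float vector used for accumulation. A compact standalone rendering of the same sizeof-based dispatch, with names invented for this sketch:

    #include <cstdio>
    #include <cstddef>
    #include <type_traits>

    // Primary template: anything that is not 4 bytes wide gets staged as float.
    template <typename T, size_t size> struct Promote       { typedef float type; };
    // Partial specialization: 4-byte element types are staged unchanged.
    template <typename T>              struct Promote<T, 4> { typedef T type; };

    template <typename T> struct Smem { typedef typename Promote<T, sizeof(T)>::type type; };

    int main()
    {
        std::printf("uchar staged as float: %d\n", (int)std::is_same<Smem<unsigned char>::type, float>::value); // 1
        std::printf("int   staged as int  : %d\n", (int)std::is_same<Smem<int>::type, int>::value);             // 1
        return 0;
    }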
-
-namespace filter_krnls
-{
-    template <int ksize, typename T, typename D, typename B>
-    __global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
-    {
-        __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];
-
-        const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
-        const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;
-
-        T* sDataColumn = smem + threadIdx.x;
-
-        if (x < src.cols)
-        {
-            const T* srcCol = src.ptr() + x;
-
-            sDataColumn[ threadIdx.y                    * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol, src.step);
-            sDataColumn[(threadIdx.y + BLOCK_DIM_Y)     * BLOCK_DIM_X] = b.at_high(y, srcCol, src.step);
-            sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol, src.step);
-
-            __syncthreads();
-
-            if (y < src.rows)
-            {
-                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
-                sum_t sum = VecTraits<sum_t>::all(0);
-
-                sDataColumn += (threadIdx.y + BLOCK_DIM_Y - anchor) * BLOCK_DIM_X;
-
-                #pragma unroll
-                for(int i = 0; i < ksize; ++i)
-                    sum = sum + sDataColumn[i * BLOCK_DIM_X] * cLinearKernel[i];
-
-                dst.ptr(y)[x] = saturate_cast<D>(sum);
-            }
-        }
-    }
-}
-
-namespace cv { namespace gpu { namespace filters
-{
-    template <int ksize, typename T, typename D, template<typename> class B>
-    void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
-    {
-        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
-        dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
-
-        B<T> b(src.rows);
-
-        if (!b.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1))
-        {
-            cv::gpu::error("linearColumnFilter: can't use specified border extrapolation, image is too small, "
-                           "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
-        }
-
-        filter_krnls::linearColumnFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    template <typename T, typename D>
-    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
-    {
-        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
-        static const caller_t callers[5][17] =
-        {
-            {
-                0,
-                linearColumnFilter_caller<1 , T, D, BrdColReflect101>,
-                linearColumnFilter_caller<2 , T, D, BrdColReflect101>,
-                linearColumnFilter_caller<3 , T, D, BrdColReflect101>,
-                linearColumnFilter_caller<4 , T, D, BrdColReflect101>,
-                linearColumnFilter_caller<5 , T, D, BrdColReflect101>,
-                linearColumnFilter_caller<6 , T, D, BrdColReflect101>,
-                linearColumnFilter_caller<7 , T, D, BrdColReflect101>,
-                linearColumnFilter_caller<8 , T, D, BrdColReflect101>,
-                linearColumnFilter_caller<9 , T, D, BrdColReflect101>,
-                linearColumnFilter_caller<10, T, D, BrdColReflect101>,
-                linearColumnFilter_caller<11, T, D, BrdColReflect101>,
-                linearColumnFilter_caller<12, T, D, BrdColReflect101>,
-                linearColumnFilter_caller<13, T, D, BrdColReflect101>,
-                linearColumnFilter_caller<14, T, D, BrdColReflect101>,
-                linearColumnFilter_caller<15, T, D, BrdColReflect101>,
-                linearColumnFilter_caller<16, T, D, BrdColReflect101>
-            },
-            {
-                0,
-                linearColumnFilter_caller<1 , T, D, BrdColReplicate>,
-                linearColumnFilter_caller<2 , T, D, BrdColReplicate>,
-                linearColumnFilter_caller<3 , T, D, BrdColReplicate>,
-                linearColumnFilter_caller<4 , T, D, BrdColReplicate>,
-                linearColumnFilter_caller<5 , T, D, BrdColReplicate>,
-                linearColumnFilter_caller<6 , T, D, BrdColReplicate>,
-                linearColumnFilter_caller<7 , T, D, BrdColReplicate>,
-                linearColumnFilter_caller<8 , T, D, BrdColReplicate>,
-                linearColumnFilter_caller<9 , T, D, BrdColReplicate>,
-                linearColumnFilter_caller<10, T, D, BrdColReplicate>,
-                linearColumnFilter_caller<11, T, D, BrdColReplicate>,
-                linearColumnFilter_caller<12, T, D, BrdColReplicate>,
-                linearColumnFilter_caller<13, T, D, BrdColReplicate>,
-                linearColumnFilter_caller<14, T, D, BrdColReplicate>,
-                linearColumnFilter_caller<15, T, D, BrdColReplicate>,
-                linearColumnFilter_caller<16, T, D, BrdColReplicate>
-            },
-            {
-                0,
-                linearColumnFilter_caller<1 , T, D, BrdColConstant>,
-                linearColumnFilter_caller<2 , T, D, BrdColConstant>,
-                linearColumnFilter_caller<3 , T, D, BrdColConstant>,
-                linearColumnFilter_caller<4 , T, D, BrdColConstant>,
-                linearColumnFilter_caller<5 , T, D, BrdColConstant>,
-                linearColumnFilter_caller<6 , T, D, BrdColConstant>,
-                linearColumnFilter_caller<7 , T, D, BrdColConstant>,
-                linearColumnFilter_caller<8 , T, D, BrdColConstant>,
-                linearColumnFilter_caller<9 , T, D, BrdColConstant>,
-                linearColumnFilter_caller<10, T, D, BrdColConstant>,
-                linearColumnFilter_caller<11, T, D, BrdColConstant>,
-                linearColumnFilter_caller<12, T, D, BrdColConstant>,
-                linearColumnFilter_caller<13, T, D, BrdColConstant>,
-                linearColumnFilter_caller<14, T, D, BrdColConstant>,
-                linearColumnFilter_caller<15, T, D, BrdColConstant>,
-                linearColumnFilter_caller<16, T, D, BrdColConstant>
-            },
-            {
-                0,
-                linearColumnFilter_caller<1 , T, D, BrdColReflect>,
-                linearColumnFilter_caller<2 , T, D, BrdColReflect>,
-                linearColumnFilter_caller<3 , T, D, BrdColReflect>,
-                linearColumnFilter_caller<4 , T, D, BrdColReflect>,
-                linearColumnFilter_caller<5 , T, D, BrdColReflect>,
-                linearColumnFilter_caller<6 , T, D, BrdColReflect>,
-                linearColumnFilter_caller<7 , T, D, BrdColReflect>,
-                linearColumnFilter_caller<8 , T, D, BrdColReflect>,
-                linearColumnFilter_caller<9 , T, D, BrdColReflect>,
-                linearColumnFilter_caller<10, T, D, BrdColReflect>,
-                linearColumnFilter_caller<11, T, D, BrdColReflect>,
-                linearColumnFilter_caller<12, T, D, BrdColReflect>,
-                linearColumnFilter_caller<13, T, D, BrdColReflect>,
-                linearColumnFilter_caller<14, T, D, BrdColReflect>,
-                linearColumnFilter_caller<15, T, D, BrdColReflect>,
-                linearColumnFilter_caller<16, T, D, BrdColReflect>
-            },
-            {
-                0,
-                linearColumnFilter_caller<1 , T, D, BrdColWrap>,
-                linearColumnFilter_caller<2 , T, D, BrdColWrap>,
-                linearColumnFilter_caller<3 , T, D, BrdColWrap>,
-                linearColumnFilter_caller<4 , T, D, BrdColWrap>,
-                linearColumnFilter_caller<5 , T, D, BrdColWrap>,
-                linearColumnFilter_caller<6 , T, D, BrdColWrap>,
-                linearColumnFilter_caller<7 , T, D, BrdColWrap>,
-                linearColumnFilter_caller<8 , T, D, BrdColWrap>,
-                linearColumnFilter_caller<9 , T, D, BrdColWrap>,
-                linearColumnFilter_caller<10, T, D, BrdColWrap>,
-                linearColumnFilter_caller<11, T, D, BrdColWrap>,
-                linearColumnFilter_caller<12, T, D, BrdColWrap>,
-                linearColumnFilter_caller<13, T, D, BrdColWrap>,
-                linearColumnFilter_caller<14, T, D, BrdColWrap>,
-                linearColumnFilter_caller<15, T, D, BrdColWrap>,
-                linearColumnFilter_caller<16, T, D, BrdColWrap>,
-            }
-        };
-
-        loadLinearKernel(kernel, ksize);
-
-        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
-    }
-
-    template void linearColumnFilter_gpu<float , uchar >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearColumnFilter_gpu<float , short >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearColumnFilter_gpu<float2, short2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearColumnFilter_gpu<float3, short3>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearColumnFilter_gpu<float , int   >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearColumnFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-}}}
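Blocks of explicit instantiations like the one above are what force the compiler to emit object code for each supported (source, destination) type pair; nothing else in the translation unit names concrete types. A minimal illustration of the idiom, using a hypothetical function invented for this sketch:

    #include <cstdio>

    template <typename T>
    T triple(T x) { return x * 3; }

    // Explicit instantiation: code for these two specializations is emitted
    // into this object file, available for other files to link against.
    template int   triple<int>(int);
    template float triple<float>(float);

    int main()
    {
        std::printf("%d %g\n", triple(14), triple(0.5f));
        return 0;
    }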
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Bilateral filters
-
-namespace bf_krnls
-{
-    __constant__ float* ctable_color;
-    __constant__ float* ctable_space;
-    __constant__ size_t ctable_space_step;
-
-    __constant__ int cndisp;
-    __constant__ int cradius;
-
-    __constant__ short cedge_disc;
-    __constant__ short cmax_disc;
-}
-
-namespace cv { namespace gpu { namespace bf
-{
-    void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc)
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) );
-        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) );
-        size_t table_space_step = table_space.step / sizeof(float);
-        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) );
-
-        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) );
-        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) );
-
-        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) );
-        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) );
-    }
-}}}
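Note that ctable_color and ctable_space are device pointers stored in constant memory: load_constants copies the pointer value itself to the symbol, not the table contents, so the lookup tables stay in global memory and can be rebuilt without touching the kernels. A standalone sketch of that pattern; `c_lut` and `readLut` are names made up for the illustration:

    #include <cstdio>
    #include <cuda_runtime.h>

    __constant__ float* c_lut;   // the pointer lives in constant memory...

    __global__ void readLut(float* out)
    {
        *out = c_lut[3];         // ...the data it points at stays in global memory
    }

    int main()
    {
        float h_lut[8] = {0, 1, 2, 3, 4, 5, 6, 7};
        float *d_lut, *d_out, h_out;
        cudaMalloc(&d_lut, sizeof(h_lut));
        cudaMalloc(&d_out, sizeof(float));
        cudaMemcpy(d_lut, h_lut, sizeof(h_lut), cudaMemcpyHostToDevice);
        cudaMemcpyToSymbol(c_lut, &d_lut, sizeof(d_lut)); // copy the pointer, as load_constants does
        readLut<<<1, 1>>>(d_out);
        cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
        std::printf("%g\n", h_out);                       // prints 3
        cudaFree(d_lut); cudaFree(d_out);
        return 0;
    }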
-
-namespace bf_krnls
-{
-    template <int channels>
-    struct DistRgbMax
-    {
-        static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
-        {
-            uchar x = abs(a[0] - b[0]);
-            uchar y = abs(a[1] - b[1]);
-            uchar z = abs(a[2] - b[2]);
-            return (max(max(x, y), z));
-        }
-    };
-
-    template <>
-    struct DistRgbMax<1>
-    {
-        static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
-        {
-            return abs(a[0] - b[0]);
-        }
-    };
-
-    template <int channels, typename T>
-    __global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
-    {
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-        const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
-
-        T dp[5];
-
-        if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
-        {
-            dp[0] = *(disp + (y  ) * disp_step + x + 0);
-            dp[1] = *(disp + (y-1) * disp_step + x + 0);
-            dp[2] = *(disp + (y  ) * disp_step + x - 1);
-            dp[3] = *(disp + (y+1) * disp_step + x + 0);
-            dp[4] = *(disp + (y  ) * disp_step + x + 1);
-
-            if(abs(dp[1] - dp[0]) >= cedge_disc || abs(dp[2] - dp[0]) >= cedge_disc || abs(dp[3] - dp[0]) >= cedge_disc || abs(dp[4] - dp[0]) >= cedge_disc)
-            {
-                const int ymin = max(0, y - cradius);
-                const int xmin = max(0, x - cradius);
-                const int ymax = min(h - 1, y + cradius);
-                const int xmax = min(w - 1, x + cradius);
-
-                float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
-
-                const uchar* ic = img + y * img_step + channels * x;
-
-                for(int yi = ymin; yi <= ymax; yi++)
-                {
-                    const T* disp_y = disp + yi * disp_step;
-
-                    for(int xi = xmin; xi <= xmax; xi++)
-                    {
-                        const uchar* in = img + yi * img_step + channels * xi;
-
-                        uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);
-
-                        const float weight = ctable_color[dist_rgb] * (ctable_space + abs(y-yi)* ctable_space_step)[abs(x-xi)];
-
-                        const T disp_reg = disp_y[xi];
-
-                        cost[0] += min(cmax_disc, abs(disp_reg - dp[0])) * weight;
-                        cost[1] += min(cmax_disc, abs(disp_reg - dp[1])) * weight;
-                        cost[2] += min(cmax_disc, abs(disp_reg - dp[2])) * weight;
-                        cost[3] += min(cmax_disc, abs(disp_reg - dp[3])) * weight;
-                        cost[4] += min(cmax_disc, abs(disp_reg - dp[4])) * weight;
-                    }
-                }
-
-                float minimum = numeric_limits<float>::max();
-                int id = 0;
-
-                if (cost[0] < minimum)
-                {
-                    minimum = cost[0];
-                    id = 0;
-                }
-                if (cost[1] < minimum)
-                {
-                    minimum = cost[1];
-                    id = 1;
-                }
-                if (cost[2] < minimum)
-                {
-                    minimum = cost[2];
-                    id = 2;
-                }
-                if (cost[3] < minimum)
-                {
-                    minimum = cost[3];
-                    id = 3;
-                }
-                if (cost[4] < minimum)
-                {
-                    minimum = cost[4];
-                    id = 4;
-                }
-
-                *(disp + y * disp_step + x) = dp[id];
-            }
-        }
-    }
-}
-
-namespace cv { namespace gpu { namespace bf
-{
-    template <typename T>
-    void bilateral_filter_caller(const DevMem2D_<T>& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
-    {
-        dim3 threads(32, 8, 1);
-        dim3 grid(1, 1, 1);
-        grid.x = divUp(disp.cols, threads.x << 1);
-        grid.y = divUp(disp.rows, threads.y);
-
-        switch (channels)
-        {
-        case 1:
-            for (int i = 0; i < iters; ++i)
-            {
-                bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
-                cudaSafeCall( cudaGetLastError() );
-                bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
-                cudaSafeCall( cudaGetLastError() );
-            }
-            break;
-        case 3:
-            for (int i = 0; i < iters; ++i)
-            {
-                bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
-                cudaSafeCall( cudaGetLastError() );
-                bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
-                cudaSafeCall( cudaGetLastError() );
-            }
-            break;
-        default:
-            cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
-        }
-
-        if (stream != 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    void bilateral_filter_gpu(const DevMem2D& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
-    {
-        bilateral_filter_caller(disp, img, channels, iters, stream);
-    }
-
-    void bilateral_filter_gpu(const DevMem2D_<short>& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
-    {
-        bilateral_filter_caller(disp, img, channels, iters, stream);
-    }
-}}}
diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu
index d76f93b9a5..67cfa59b42 100644
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -41,433 +41,16 @@
 //M*/
 
 #include "internal_shared.hpp"
-#include "opencv2/gpu/device/border_interpolate.hpp"
 #include "opencv2/gpu/device/vec_traits.hpp"
 #include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/filters.hpp"
+#include "opencv2/gpu/device/border_interpolate.hpp"
 
 using namespace cv::gpu;
 using namespace cv::gpu::device;
 
-/////////////////////////////////// Remap ///////////////////////////////////////////////
 namespace cv { namespace gpu { namespace imgproc
 {
-    template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)
-    {
-        const int x = blockDim.x * blockIdx.x + threadIdx.x;
-        const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-        if (x < dst.cols && y < dst.rows)
-        {
-            const float xcoo = mapx.ptr(y)[x];
-            const float ycoo = mapy.ptr(y)[x];
-
-            dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
-        }
-    }
-
-    template