diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index d4c7fd4562..ee7aae9962 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -1221,26 +1221,24 @@ namespace cv explicit BruteForceMatcher_GPU_base(DistType distType = L2Dist); - // Add descriptors to train descriptor collection. + // Add descriptors to train descriptor collection void add(const std::vector& descCollection); - // Get train descriptors collection. + // Get train descriptors collection const std::vector& getTrainDescriptors() const; - // Clear train descriptors collection. + // Clear train descriptors collection void clear(); - // Return true if there are not train descriptors in collection. + // Return true if there are no train descriptors in the collection bool empty() const; - // Return true if the matcher supports mask in match methods. + // Return true if the matcher supports mask in match methods bool isMaskSupported() const; - // Find one best match for each query descriptor. - // trainIdx.at(0, queryIdx) will contain best train index for queryIdx - // distance.at(0, queryIdx) will contain distance - void matchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs, - GpuMat& trainIdx, GpuMat& distance, + // Find one best match for each query descriptor + void matchSingle(const GpuMat& query, const GpuMat& train, + GpuMat& trainIdx, GpuMat& distance, const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null()); // Download trainIdx and distance and convert it to CPU vector with DMatch @@ -1248,21 +1246,16 @@ namespace cv // Convert trainIdx and distance to vector with DMatch static void matchConvert(const Mat& trainIdx, const Mat& distance, std::vector& matches); - // Find one best match for each query descriptor. - void match(const GpuMat& queryDescs, const GpuMat& trainDescs, std::vector& matches, - const GpuMat& mask = GpuMat()); + // Find one best match for each query descriptor + void match(const GpuMat& query, const GpuMat& train, std::vector& matches, const GpuMat& mask = GpuMat()); // Make gpu collection of trains and masks in suitable format for matchCollection function - void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, - const vector& masks = std::vector()); + void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, const std::vector& masks = std::vector()); - // Find one best match from train collection for each query descriptor. - // trainIdx.at(0, queryIdx) will contain best train index for queryIdx - // imgIdx.at(0, queryIdx) will contain best image index for queryIdx - // distance.at(0, queryIdx) will contain distance - void matchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection, + // Find one best match from train collection for each query descriptor + void matchCollection(const GpuMat& query, const GpuMat& trainCollection, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, - const GpuMat& maskCollection, Stream& stream = Stream::Null()); + const GpuMat& masks = GpuMat(), Stream& stream = Stream::Null()); // Download trainIdx, imgIdx and distance and convert it to vector with DMatch static void matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, std::vector& matches); @@ -1270,17 +1263,12 @@ namespace cv static void matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector& matches); // Find one best match from train collection for each query descriptor.
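Before the collection overload below, a minimal usage sketch of the renamed single-train workflow (GPU-side matchSingle, then matchDownload for DMatch results); the input names queryDescs/trainDescs and the L2<float> distance are assumptions for illustration, not part of the patch:

#include <vector>
#include <opencv2/gpu/gpu.hpp>

// Hedged sketch: one best match per query descriptor with the post-patch API.
void matchOneToOne(const cv::gpu::GpuMat& queryDescs, const cv::gpu::GpuMat& trainDescs,
                   std::vector<cv::DMatch>& matches)
{
    cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher;

    // matchSingle fills 1 x nQuery GPU buffers: best train index and distance per query row.
    cv::gpu::GpuMat trainIdx, distance;
    matcher.matchSingle(queryDescs, trainDescs, trainIdx, distance);

    // matchDownload copies both buffers to the host and converts them to DMatch form.
    matcher.matchDownload(trainIdx, distance, matches);
}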
- void match(const GpuMat& queryDescs, std::vector& matches, const std::vector& masks = std::vector()); + void match(const GpuMat& query, std::vector& matches, const std::vector& masks = std::vector()); - // Find k best matches for each query descriptor (in increasing order of distances). - // trainIdx.at(queryIdx, i) will contain index of i'th best trains (i < k). - // distance.at(queryIdx, i) will contain distance. - // allDist is a buffer to store all distance between query descriptors and train descriptors - // it have size (nQuery,nTrain) and CV_32F type - // allDist.at(queryIdx, trainIdx) will contain FLT_MAX, if trainIdx is one from k best, - // otherwise it will contain distance between queryIdx and trainIdx descriptors - void knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, - GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null()); + // Find k best matches for each query descriptor (in increasing order of distances) + void knnMatchSingle(const GpuMat& query, const GpuMat& train, + GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, + const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null()); // Download trainIdx and distance and convert it to vector with DMatch // compactResult is used when mask is not empty. If compactResult is false matches @@ -1296,27 +1284,40 @@ namespace cv // compactResult is used when mask is not empty. If compactResult is false matches // vector will have the same size as queryDescriptors rows. If compactResult is true // matches vector will not contain matches for fully masked out query descriptors. - void knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, + void knnMatch(const GpuMat& query, const GpuMat& train, std::vector< std::vector >& matches, int k, const GpuMat& mask = GpuMat(), bool compactResult = false); + // Find k best matches from train collection for each query descriptor (in increasing order of distances) + void knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection, + GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, + const GpuMat& maskCollection = GpuMat(), Stream& stream = Stream::Null()); + + // Download trainIdx and distance and convert it to vector with DMatch + // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. + static void knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, + std::vector< std::vector >& matches, bool compactResult = false); + // Convert trainIdx and distance to vector with DMatch + static void knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, + std::vector< std::vector >& matches, bool compactResult = false); + // Find k best matches for each query descriptor (in increasing order of distances). // compactResult is used when mask is not empty. If compactResult is false matches // vector will have the same size as queryDescriptors rows. If compactResult is true // matches vector will not contain matches for fully masked out query descriptors. 
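The k = 2 path above is what makes the standard ratio test cheap; before the collection overload below, a minimal sketch of that use (the input names and the 0.8 threshold are illustrative assumptions):

#include <vector>
#include <opencv2/gpu/gpu.hpp>

// Hedged sketch: 2-NN matching plus a ratio filter on the downloaded results.
void ratioTest(const cv::gpu::GpuMat& queryDescs, const cv::gpu::GpuMat& trainDescs,
               std::vector<cv::DMatch>& good)
{
    cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher;

    // Two nearest neighbours per query descriptor.
    std::vector< std::vector<cv::DMatch> > knn;
    matcher.knnMatch(queryDescs, trainDescs, knn, 2);

    // Keep a match only when it is clearly better than the runner-up.
    good.clear();
    for (size_t i = 0; i < knn.size(); ++i)
        if (knn[i].size() == 2 && knn[i][0].distance < 0.8f * knn[i][1].distance)
            good.push_back(knn[i][0]);
}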
- void knnMatch(const GpuMat& queryDescs, std::vector< std::vector >& matches, int knn, - const std::vector& masks = std::vector(), bool compactResult = false ); + void knnMatch(const GpuMat& query, std::vector< std::vector >& matches, int k, + const std::vector& masks = std::vector(), bool compactResult = false); // Find best matches for each query descriptor which have distance less than maxDistance. // nMatches.at(0, queryIdx) will contain matches count for queryIdx. // note that nMatches can be greater than trainIdx.cols - it means that the matcher didn't find all matches, // because it didn't have enough memory. - // trainIdx.at(queruIdx, i) will contain ith train index (i < min(nMatches.at(0, queruIdx), trainIdx.cols)) - // distance.at(queruIdx, i) will contain ith distance (i < min(nMatches.at(0, queruIdx), trainIdx.cols)) - // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x (nTrain / 2), + // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10), // otherwise the user can pass their own allocated trainIdx and distance with size nQuery x nMaxMatches // Matches are not sorted. - void radiusMatchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs, + void radiusMatchSingle(const GpuMat& query, const GpuMat& train, GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null()); @@ -1333,15 +1334,16 @@ namespace cv // Find best matches for each query descriptor which have distance less than maxDistance // (in increasing order of distances). - void radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, + void radiusMatch(const GpuMat& query, const GpuMat& train, std::vector< std::vector >& matches, float maxDistance, const GpuMat& mask = GpuMat(), bool compactResult = false); // Find best matches for each query descriptor which have distance less than maxDistance. + // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10), + // otherwise the user can pass their own allocated trainIdx and distance with size nQuery x nMaxMatches // Matches are not sorted. - void radiusMatchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection, - GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, - const GpuMat& maskCollection, Stream& stream = Stream::Null()); + void radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, + const std::vector& masks = std::vector(), Stream& stream = Stream::Null()); // Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch. // matches will be sorted in increasing order of distances. @@ -1356,7 +1358,7 @@ namespace cv // Find best matches from train collection for each query descriptor which have distance less than // maxDistance (in increasing order of distances).
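The changed default buffer size in the comments above matters in practice: when more than max(nTrain / 100, 10) neighbours per query can fall inside maxDistance, the default buffers fill up and matches are dropped, so the caller should preallocate. A minimal sketch under that assumption (nMaxMatches and the input names are hypothetical; per the asserts in the implementation, radius matching also requires a device built with global atomics):

#include <vector>
#include <opencv2/gpu/gpu.hpp>

// Hedged sketch: radius matching with caller-allocated result buffers.
void radiusWithPrealloc(const cv::gpu::GpuMat& queryDescs, const cv::gpu::GpuMat& trainDescs,
                        float maxDistance, int nMaxMatches,
                        std::vector< std::vector<cv::DMatch> >& matches)
{
    cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher;

    // Pass nQuery x nMaxMatches buffers so the matcher does not fall back to
    // the default nQuery x max(nTrain / 100, 10) allocation.
    cv::gpu::GpuMat trainIdx(queryDescs.rows, nMaxMatches, CV_32SC1);
    cv::gpu::GpuMat distance(queryDescs.rows, nMaxMatches, CV_32FC1);
    cv::gpu::GpuMat nMatches;

    matcher.radiusMatchSingle(queryDescs, trainDescs, trainIdx, distance, nMatches, maxDistance);

    // Convert the raw GPU buffers (unsorted within each row) to per-query DMatch vectors.
    matcher.radiusMatchDownload(trainIdx, distance, nMatches, matches);
}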
- void radiusMatch(const GpuMat& queryDescs, std::vector< std::vector >& matches, float maxDistance, + void radiusMatch(const GpuMat& query, std::vector< std::vector >& matches, float maxDistance, const std::vector& masks = std::vector(), bool compactResult = false); DistType distType; diff --git a/modules/gpu/perf/perf_features2d.cpp b/modules/gpu/perf/perf_features2d.cpp index 68e3a7b60f..894c7520b9 100644 --- a/modules/gpu/perf/perf_features2d.cpp +++ b/modules/gpu/perf/perf_features2d.cpp @@ -1,7 +1,7 @@ #include "perf_precomp.hpp" PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_match, testing::Combine(testing::ValuesIn(devices()), - testing::Values(64, 128))) + testing::Values(64, 128, 256))) { DeviceInfo devInfo = std::tr1::get<0>(GetParam()); int desc_size = std::tr1::get<1>(GetParam()); @@ -19,7 +19,7 @@ PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_match, testing::Combine(testing: BruteForceMatcher_GPU< L2 > matcher; - declare.time(0.5).iterations(100); + declare.time(3.0); SIMPLE_TEST_CYCLE() { @@ -35,7 +35,7 @@ PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_match, testing::Combine(testing: PERF_TEST_P(DevInfo_K_DescSize, BruteForceMatcher_knnMatch, testing::Combine(testing::ValuesIn(devices()), testing::Values(2, 3), - testing::Values(64, 128))) + testing::Values(64, 128, 256))) { DeviceInfo devInfo = std::tr1::get<0>(GetParam()); int k = std::tr1::get<1>(GetParam()); @@ -54,11 +54,11 @@ PERF_TEST_P(DevInfo_K_DescSize, BruteForceMatcher_knnMatch, testing::Combine(tes BruteForceMatcher_GPU< L2 > matcher; - declare.time(0.5).iterations(100); + declare.time(3.0); SIMPLE_TEST_CYCLE() { - matcher.knnMatch(query, train, trainIdx, distance, allDist, k); + matcher.knnMatchSingle(query, train, trainIdx, distance, allDist, k); } Mat trainIdx_host(trainIdx); @@ -69,7 +69,7 @@ PERF_TEST_P(DevInfo_K_DescSize, BruteForceMatcher_knnMatch, testing::Combine(tes } PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_radiusMatch, testing::Combine(testing::ValuesIn(devices(SHARED_ATOMICS)), - testing::Values(64, 128))) + testing::Values(64, 128, 256))) { DeviceInfo devInfo = std::tr1::get<0>(GetParam()); int desc_size = std::tr1::get<1>(GetParam()); @@ -85,7 +85,7 @@ PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_radiusMatch, testing::Combine(te BruteForceMatcher_GPU< L2 > matcher; - declare.time(0.5).iterations(100); + declare.time(3.0); SIMPLE_TEST_CYCLE() { diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp index 24d20d63b4..b601521950 100644 --- a/modules/gpu/src/brute_force_matcher.cpp +++ b/modules/gpu/src/brute_force_matcher.cpp @@ -56,86 +56,101 @@ bool cv::gpu::BruteForceMatcher_GPU_base::empty() const { throw_nogpu(); return bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const { throw_nogpu(); return true; } void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, vector&) { throw_nogpu(); } -void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat&, const Mat&, std::vector&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat&, const Mat&, vector&) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, const GpuMat&, vector&, const GpuMat&) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat&, GpuMat&, const vector&) { throw_nogpu(); } void 
cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); } -void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector&) { throw_nogpu(); } -void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat&, const Mat&, const Mat&, std::vector&) { throw_nogpu(); } -void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, std::vector&, const std::vector&) { throw_nogpu(); } -void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&, Stream&) { throw_nogpu(); } -void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat&, const GpuMat&, std::vector< std::vector >&, bool) { throw_nogpu(); } -void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat&, const Mat&, std::vector< std::vector >&, bool) { throw_nogpu(); } -void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, std::vector< std::vector >&, int, const GpuMat&, bool) { throw_nogpu(); } -void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, std::vector< std::vector >&, int, const std::vector&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, const GpuMat&, vector&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat&, const Mat&, const Mat&, vector&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, vector&, const vector&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat&, const GpuMat&, vector< vector >&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat&, const Mat&, vector< vector >&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, vector< vector >&, int, const GpuMat&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat&, const GpuMat&, const GpuMat&, vector< vector >&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Convert(const Mat&, const Mat&, const Mat&, vector< vector >&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, vector< vector >&, int, const vector&, bool) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&, Stream&) { throw_nogpu(); } -void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector >&, bool) { throw_nogpu(); } -void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat&, const Mat&, const Mat&, std::vector< std::vector >&, bool) { throw_nogpu(); } -void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, std::vector< std::vector >&, float, const GpuMat&, bool) { throw_nogpu(); } -void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, 
float, const GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, vector< vector >&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat&, const Mat&, const Mat&, vector< vector >&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, vector< vector >&, float, const GpuMat&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const vector&, Stream&) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, vector< vector >&, bool) { throw_nogpu(); } void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat&, const Mat&, const Mat&, const Mat&, vector< vector >&, bool) { throw_nogpu(); } -void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, std::vector< std::vector >&, float, const std::vector&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, vector< vector >&, float, const vector&, bool) { throw_nogpu(); } #else /* !defined (HAVE_CUDA) */ namespace cv { namespace gpu { namespace bf_match { - template void matchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, + template void matchL1_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, + template void matchL2_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, + template void matchHamming_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, - const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + template void matchL1_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, - const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + template void matchL2_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, - const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + template void matchHamming_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, + const DevMem2Di& trainIdx, const 
DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); }}} namespace cv { namespace gpu { namespace bf_knnmatch { - template void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, + template void matchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, + template void matchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - template void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, + template void matchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, + int cc, cudaStream_t stream); + + template void match2L1_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream); + template void match2L2_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream); + template void match2Hamming_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); }}} namespace cv { namespace gpu { namespace bf_radius_match { - template void radiusMatchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, - cudaStream_t stream); - template void radiusMatchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, - cudaStream_t stream); - template void radiusMatchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, - cudaStream_t stream); + template void matchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream); + template void matchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream); - template void radiusMatchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, 
const DevMem2D_& maskCollection, - const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, - cudaStream_t stream); - template void radiusMatchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, - const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, - cudaStream_t stream); - template void radiusMatchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, - const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, - cudaStream_t stream); + template void matchL1_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream); + + template void matchL2_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream); + + template void matchHamming_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream); }}} cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_) @@ -173,52 +188,53 @@ bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const //////////////////////////////////////////////////////////////////// // Match -void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs, - GpuMat& trainIdx, GpuMat& distance, const GpuMat& mask, Stream& stream) +void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const GpuMat& train, + GpuMat& trainIdx, GpuMat& distance, + const GpuMat& mask, Stream& stream) { - if (queryDescs.empty() || trainDescs.empty()) + if (query.empty() || train.empty()) return; using namespace cv::gpu::bf_match; - typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, - int cc, cudaStream_t stream); + typedef void (*caller_t)(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream); - static const match_caller_t match_callers[3][8] = + static const caller_t callers[3][6] = { { - matchSingleL1_gpu, 0/*matchSingleL1_gpu*/, - matchSingleL1_gpu, matchSingleL1_gpu, - matchSingleL1_gpu, matchSingleL1_gpu, 0, 0 + matchL1_gpu, 0/*matchL1_gpu*/, + matchL1_gpu, matchL1_gpu, + matchL1_gpu, matchL1_gpu }, { - 0/*matchSingleL2_gpu*/, 0/*matchSingleL2_gpu*/, - 0/*matchSingleL2_gpu*/, 0/*matchSingleL2_gpu*/, - 0/*matchSingleL2_gpu*/, matchSingleL2_gpu, 0, 0 + 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, + 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, + 0/*matchL2_gpu*/, matchL2_gpu }, { - matchSingleHamming_gpu, 0/*matchSingleHamming_gpu*/, - matchSingleHamming_gpu, 0/*matchSingleHamming_gpu*/, - matchSingleHamming_gpu, 0, 0, 0 + matchHamming_gpu, 0/*matchHamming_gpu*/, + matchHamming_gpu, 0/*matchHamming_gpu*/, + matchHamming_gpu, 0/*matchHamming_gpu*/ } }; - CV_Assert(queryDescs.channels() == 
1 && queryDescs.depth() < CV_64F); - CV_Assert(trainDescs.cols == queryDescs.cols && trainDescs.type() == queryDescs.type()); + CV_Assert(query.channels() == 1 && query.depth() < CV_64F); + CV_Assert(train.cols == query.cols && train.type() == query.type()); - const int nQuery = queryDescs.rows; + const int nQuery = query.rows; ensureSizeIsEnough(1, nQuery, CV_32S, trainIdx); ensureSizeIsEnough(1, nQuery, CV_32F, distance); - match_caller_t func = match_callers[distType][queryDescs.depth()]; + caller_t func = callers[distType][query.depth()]; CV_Assert(func != 0); DeviceInfo info; int cc = info.majorVersion() * 10 + info.minorVersion(); - func(queryDescs, trainDescs, mask, trainIdx, distance, cc, StreamAccessor::getStream(stream)); + func(query, train, mask, trainIdx, distance, cc, StreamAccessor::getStream(stream)); } void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, vector& matches) @@ -232,13 +248,13 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, matchConvert(trainIdxCPU, distanceCPU, matches); } -void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, const Mat& distance, std::vector& matches) +void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, const Mat& distance, vector& matches) { if (trainIdx.empty() || distance.empty()) return; - CV_Assert(trainIdx.type() == CV_32SC1 && trainIdx.isContinuous()); - CV_Assert(distance.type() == CV_32FC1 && distance.isContinuous() && distance.cols == trainIdx.cols); + CV_Assert(trainIdx.type() == CV_32SC1); + CV_Assert(distance.type() == CV_32FC1 && distance.cols == trainIdx.cols); const int nQuery = trainIdx.cols; @@ -250,6 +266,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++distance_ptr) { int trainIdx = *trainIdx_ptr; + if (trainIdx == -1) continue; @@ -261,11 +278,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons } } -void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, const GpuMat& trainDescs, +void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& query, const GpuMat& train, vector& matches, const GpuMat& mask) { GpuMat trainIdx, distance; - matchSingle(queryDescs, trainDescs, trainIdx, distance, mask); + matchSingle(query, train, trainIdx, distance, mask); matchDownload(trainIdx, distance, matches); } @@ -279,14 +296,13 @@ void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat& trainCollect { Mat trainCollectionCPU(1, static_cast(trainDescCollection.size()), CV_8UC(sizeof(DevMem2D))); - for (size_t i = 0; i < trainDescCollection.size(); ++i) - { - const GpuMat& trainDescs = trainDescCollection[i]; + DevMem2D* trainCollectionCPU_ptr = trainCollectionCPU.ptr(); - trainCollectionCPU.ptr(0)[i] = trainDescs; - } + for (size_t i = 0, size = trainDescCollection.size(); i < size; ++i, ++trainCollectionCPU_ptr) + *trainCollectionCPU_ptr = trainDescCollection[i]; trainCollection.upload(trainCollectionCPU); + maskCollection.release(); } else { @@ -295,16 +311,18 @@ void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat& trainCollect Mat trainCollectionCPU(1, static_cast(trainDescCollection.size()), CV_8UC(sizeof(DevMem2D))); Mat maskCollectionCPU(1, static_cast(trainDescCollection.size()), CV_8UC(sizeof(PtrStep))); - for (size_t i = 0; i < trainDescCollection.size(); ++i) + DevMem2D* trainCollectionCPU_ptr = 
trainCollectionCPU.ptr(); + PtrStep* maskCollectionCPU_ptr = maskCollectionCPU.ptr(); + + for (size_t i = 0, size = trainDescCollection.size(); i < size; ++i, ++trainCollectionCPU_ptr, ++maskCollectionCPU_ptr) { - const GpuMat& trainDescs = trainDescCollection[i]; + const GpuMat& train = trainDescCollection[i]; const GpuMat& mask = masks[i]; - CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.cols == trainDescs.rows)); + CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.cols == train.rows)); - trainCollectionCPU.ptr(0)[i] = trainDescs; - - maskCollectionCPU.ptr(0)[i] = mask; + *trainCollectionCPU_ptr = train; + *maskCollectionCPU_ptr = mask; } trainCollection.upload(trainCollectionCPU); @@ -312,52 +330,53 @@ void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat& trainCollect } } -void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection, - GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, const GpuMat& maskCollection, Stream& stream) +void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, const GpuMat& trainCollection, + GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, + const GpuMat& masks, Stream& stream) { - if (queryDescs.empty() || trainCollection.empty()) + if (query.empty() || trainCollection.empty()) return; using namespace cv::gpu::bf_match; - typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, - const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, - int cc, cudaStream_t stream); + typedef void (*caller_t)(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream); - static const match_caller_t match_callers[3][8] = + static const caller_t callers[3][6] = { { - matchCollectionL1_gpu, 0/*matchCollectionL1_gpu*/, - matchCollectionL1_gpu, matchCollectionL1_gpu, - matchCollectionL1_gpu, matchCollectionL1_gpu, 0, 0 + matchL1_gpu, 0/*matchL1_gpu*/, + matchL1_gpu, matchL1_gpu, + matchL1_gpu, matchL1_gpu }, { - 0/*matchCollectionL2_gpu*/, 0/*matchCollectionL2_gpu*/, - 0/*matchCollectionL2_gpu*/, 0/*matchCollectionL2_gpu*/, - 0/*matchCollectionL2_gpu*/, matchCollectionL2_gpu, 0, 0 + 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, + 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, + 0/*matchL2_gpu*/, matchL2_gpu }, { - matchCollectionHamming_gpu, 0/*matchCollectionHamming_gpu*/, - matchCollectionHamming_gpu, 0/*matchCollectionHamming_gpu*/, - matchCollectionHamming_gpu, 0, 0, 0 + matchHamming_gpu, 0/*matchHamming_gpu*/, + matchHamming_gpu, 0/*matchHamming_gpu*/, + matchHamming_gpu, 0/*matchHamming_gpu*/ } }; - CV_Assert(queryDescs.channels() == 1 && queryDescs.depth() < CV_64F); + CV_Assert(query.channels() == 1 && query.depth() < CV_64F); - const int nQuery = queryDescs.rows; + const int nQuery = query.rows; ensureSizeIsEnough(1, nQuery, CV_32S, trainIdx); ensureSizeIsEnough(1, nQuery, CV_32S, imgIdx); ensureSizeIsEnough(1, nQuery, CV_32F, distance); - match_caller_t func = match_callers[distType][queryDescs.depth()]; + caller_t func = callers[distType][query.depth()]; CV_Assert(func != 0); DeviceInfo info; int cc = info.majorVersion() * 10 + info.minorVersion(); - func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream)); + func(query, trainCollection, masks, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream)); } void 
cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector& matches) @@ -377,9 +396,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons if (trainIdx.empty() || imgIdx.empty() || distance.empty()) return; - CV_Assert(trainIdx.type() == CV_32SC1 && trainIdx.isContinuous()); - CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.isContinuous() && imgIdx.cols == trainIdx.cols); - CV_Assert(distance.type() == CV_32FC1 && distance.isContinuous() && imgIdx.cols == trainIdx.cols); + CV_Assert(trainIdx.type() == CV_32SC1); + CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.cols == trainIdx.cols); + CV_Assert(distance.type() == CV_32FC1 && distance.cols == trainIdx.cols); const int nQuery = trainIdx.cols; @@ -392,6 +411,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr) { int trainIdx = *trainIdx_ptr; + if (trainIdx == -1) continue; @@ -405,7 +425,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons } } -void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, vector& matches, const vector& masks) +void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& query, vector& matches, const vector& masks) { GpuMat trainCollection; GpuMat maskCollection; @@ -414,46 +434,50 @@ void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, vector GpuMat trainIdx, imgIdx, distance; - matchCollection(queryDescs, trainCollection, trainIdx, imgIdx, distance, maskCollection); + matchCollection(query, trainCollection, trainIdx, imgIdx, distance, maskCollection); matchDownload(trainIdx, imgIdx, distance, matches); } //////////////////////////////////////////////////////////////////// // KnnMatch -void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, - GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const GpuMat& mask, Stream& stream) +void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, const GpuMat& train, + GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, + const GpuMat& mask, Stream& stream) { - if (queryDescs.empty() || trainDescs.empty()) + if (query.empty() || train.empty()) return; using namespace cv::gpu::bf_knnmatch; - typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, - int cc, cudaStream_t stream); + typedef void (*caller_t)(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, + int cc, cudaStream_t stream); - static const match_caller_t match_callers[3][8] = + static const caller_t callers[3][6] = { { - knnMatchL1_gpu, 0/*knnMatchL1_gpu*/, knnMatchL1_gpu, - knnMatchL1_gpu, knnMatchL1_gpu, knnMatchL1_gpu, 0, 0 + matchL1_gpu, 0/*matchL1_gpu*/, + matchL1_gpu, matchL1_gpu, + matchL1_gpu, matchL1_gpu }, { - 0/*knnMatchL2_gpu*/, 0/*knnMatchL2_gpu*/, 0/*knnMatchL2_gpu*/, - 0/*knnMatchL2_gpu*/, 0/*knnMatchL2_gpu*/, knnMatchL2_gpu, 0, 0 + 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, + 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, + 0/*matchL2_gpu*/, matchL2_gpu }, { - knnMatchHamming_gpu, 0/*knnMatchHamming_gpu*/, knnMatchHamming_gpu, - 0/*knnMatchHamming_gpu*/, knnMatchHamming_gpu, 0, 0, 0 + matchHamming_gpu, 
0/*matchHamming_gpu*/, + matchHamming_gpu, 0/*matchHamming_gpu*/, + matchHamming_gpu, 0/*matchHamming_gpu*/ } }; - CV_Assert(queryDescs.channels() == 1 && queryDescs.depth() < CV_64F); - CV_Assert(trainDescs.type() == queryDescs.type() && trainDescs.cols == queryDescs.cols); + CV_Assert(query.channels() == 1 && query.depth() < CV_64F); + CV_Assert(train.type() == query.type() && train.cols == query.cols); - const int nQuery = queryDescs.rows; - const int nTrain = trainDescs.rows; + const int nQuery = query.rows; + const int nTrain = train.rows; if (k == 2) { @@ -468,25 +492,17 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con } if (stream) - { stream.enqueueMemSet(trainIdx, Scalar::all(-1)); - if (k != 2) - stream.enqueueMemSet(allDist, Scalar::all(numeric_limits::max())); - } else - { trainIdx.setTo(Scalar::all(-1)); - if (k != 2) - allDist.setTo(Scalar::all(numeric_limits::max())); - } - match_caller_t func = match_callers[distType][queryDescs.depth()]; + caller_t func = callers[distType][query.depth()]; CV_Assert(func != 0); DeviceInfo info; int cc = info.majorVersion() * 10 + info.minorVersion(); - func(queryDescs, trainDescs, k, mask, trainIdx, distance, allDist, cc, StreamAccessor::getStream(stream)); + func(query, train, k, mask, trainIdx, distance, allDist, cc, StreamAccessor::getStream(stream)); } void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, @@ -502,7 +518,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainId } void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat& trainIdx, const Mat& distance, - std::vector< std::vector >& matches, bool compactResult) + vector< vector >& matches, bool compactResult) { if (trainIdx.empty() || distance.empty()) return; @@ -546,14 +562,127 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat& trainIdx, c } } -void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& query, const GpuMat& train, vector< vector >& matches, int k, const GpuMat& mask, bool compactResult) { GpuMat trainIdx, distance, allDist; - knnMatch(queryDescs, trainDescs, trainIdx, distance, allDist, k, mask); + knnMatchSingle(query, train, trainIdx, distance, allDist, k, mask); knnMatchDownload(trainIdx, distance, matches, compactResult); } +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection, + GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, + const GpuMat& maskCollection, Stream& stream) +{ + if (query.empty() || trainCollection.empty()) + return; + + using namespace cv::gpu::bf_knnmatch; + + typedef void (*caller_t)(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream); + + static const caller_t callers[3][6] = + { + { + match2L1_gpu, 0/*match2L1_gpu*/, + match2L1_gpu, match2L1_gpu, + match2L1_gpu, match2L1_gpu + }, + { + 0/*match2L2_gpu*/, 0/*match2L2_gpu*/, + 0/*match2L2_gpu*/, 0/*match2L2_gpu*/, + 0/*match2L2_gpu*/, match2L2_gpu + }, + { + match2Hamming_gpu, 0/*match2Hamming_gpu*/, + match2Hamming_gpu, 0/*match2Hamming_gpu*/, + match2Hamming_gpu, 0/*match2Hamming_gpu*/ + } + }; + + CV_Assert(query.channels() == 1 && query.depth() < CV_64F); + + const int nQuery = query.rows; + + ensureSizeIsEnough(1, nQuery, CV_32SC2, 
trainIdx); + ensureSizeIsEnough(1, nQuery, CV_32SC2, imgIdx); + ensureSizeIsEnough(1, nQuery, CV_32FC2, distance); + + if (stream) + stream.enqueueMemSet(trainIdx, Scalar::all(-1)); + else + trainIdx.setTo(Scalar::all(-1)); + + caller_t func = callers[distType][query.depth()]; + CV_Assert(func != 0); + + DeviceInfo info; + int cc = info.majorVersion() * 10 + info.minorVersion(); + + func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream)); +} + +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, + vector< vector >& matches, bool compactResult) +{ + if (trainIdx.empty() || imgIdx.empty() || distance.empty()) + return; + + Mat trainIdxCPU = trainIdx; + Mat imgIdxCPU = imgIdx; + Mat distanceCPU = distance; + + knnMatch2Convert(trainIdxCPU, imgIdxCPU, distanceCPU, matches, compactResult); +} + +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, + vector< vector >& matches, bool compactResult) +{ + if (trainIdx.empty() || imgIdx.empty() || distance.empty()) + return; + + CV_Assert(trainIdx.type() == CV_32SC2); + CV_Assert(imgIdx.type() == CV_32SC2 && imgIdx.cols == trainIdx.cols); + CV_Assert(distance.type() == CV_32FC2 && distance.cols == trainIdx.cols); + + const int nQuery = trainIdx.cols; + + matches.clear(); + matches.reserve(nQuery); + + const int* trainIdx_ptr = trainIdx.ptr(); + const int* imgIdx_ptr = imgIdx.ptr(); + const float* distance_ptr = distance.ptr(); + + for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx) + { + matches.push_back(vector()); + vector& curMatches = matches.back(); + curMatches.reserve(2); + + for (int i = 0; i < 2; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr) + { + int trainIdx = *trainIdx_ptr; + + if (trainIdx != -1) + { + int imgIdx = *imgIdx_ptr; + + float distance = *distance_ptr; + + DMatch m(queryIdx, trainIdx, imgIdx, distance); + + curMatches.push_back(m); + } + } + + if (compactResult && curMatches.empty()) + matches.pop_back(); + } +} + namespace { struct ImgIdxSetter @@ -564,103 +693,123 @@ namespace }; } -void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, - vector< vector >& matches, int knn, const vector& masks, bool compactResult) +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& query, vector< vector >& matches, int k, + const vector& masks, bool compactResult) { - if (queryDescs.empty() || empty()) - return; - - vector< vector > curMatches; - vector temp; - temp.reserve(2 * knn); - - matches.resize(queryDescs.rows); - for_each(matches.begin(), matches.end(), bind2nd(mem_fun_ref(&vector::reserve), knn)); - - for (size_t imgIdx = 0; imgIdx < trainDescCollection.size(); ++imgIdx) + if (k == 2) { - knnMatch(queryDescs, trainDescCollection[imgIdx], curMatches, knn, - masks.empty() ? 
GpuMat() : masks[imgIdx]); + GpuMat trainCollection; + GpuMat maskCollection; - for (int queryIdx = 0; queryIdx < queryDescs.rows; ++queryIdx) - { - vector& localMatch = curMatches[queryIdx]; - vector& globalMatch = matches[queryIdx]; + makeGpuCollection(trainCollection, maskCollection, masks); - for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(static_cast(imgIdx))); + GpuMat trainIdx, imgIdx, distance; - temp.clear(); - merge(globalMatch.begin(), globalMatch.end(), localMatch.begin(), localMatch.end(), back_inserter(temp)); - - globalMatch.clear(); - const size_t count = std::min((size_t)knn, temp.size()); - copy(temp.begin(), temp.begin() + count, back_inserter(globalMatch)); - } + knnMatch2Collection(query, trainCollection, trainIdx, imgIdx, distance, maskCollection); + knnMatch2Download(trainIdx, imgIdx, distance, matches); } - - if (compactResult) + else { - vector< vector >::iterator new_end = remove_if(matches.begin(), matches.end(), - mem_fun_ref(&vector::empty)); - matches.erase(new_end, matches.end()); + if (query.empty() || empty()) + return; + + vector< vector > curMatches; + vector temp; + temp.reserve(2 * k); + + matches.resize(query.rows); + for_each(matches.begin(), matches.end(), bind2nd(mem_fun_ref(&vector::reserve), k)); + + for (size_t imgIdx = 0, size = trainDescCollection.size(); imgIdx < size; ++imgIdx) + { + knnMatch(query, trainDescCollection[imgIdx], curMatches, k, masks.empty() ? GpuMat() : masks[imgIdx]); + + for (int queryIdx = 0; queryIdx < query.rows; ++queryIdx) + { + vector& localMatch = curMatches[queryIdx]; + vector& globalMatch = matches[queryIdx]; + + for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(static_cast(imgIdx))); + + temp.clear(); + merge(globalMatch.begin(), globalMatch.end(), localMatch.begin(), localMatch.end(), back_inserter(temp)); + + globalMatch.clear(); + const size_t count = std::min((size_t)k, temp.size()); + copy(temp.begin(), temp.begin() + count, back_inserter(globalMatch)); + } + } + + if (compactResult) + { + vector< vector >::iterator new_end = remove_if(matches.begin(), matches.end(), mem_fun_ref(&vector::empty)); + matches.erase(new_end, matches.end()); + } } } //////////////////////////////////////////////////////////////////// // RadiusMatch -void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs, - GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, const GpuMat& mask, Stream& stream) +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query, const GpuMat& train, + GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, + const GpuMat& mask, Stream& stream) { - if (queryDescs.empty() || trainDescs.empty()) + if (query.empty() || train.empty()) return; using namespace cv::gpu::bf_radius_match; - typedef void (*radiusMatch_caller_t)(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, - cudaStream_t stream); + typedef void (*caller_t)(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream); - static const radiusMatch_caller_t radiusMatch_callers[3][8] = + static const caller_t callers[3][6] = { { - radiusMatchSingleL1_gpu, 0/*radiusMatchSingleL1_gpu*/, radiusMatchSingleL1_gpu, - radiusMatchSingleL1_gpu, radiusMatchSingleL1_gpu, 
radiusMatchSingleL1_gpu, 0, 0 + matchL1_gpu, 0/*matchL1_gpu*/, + matchL1_gpu, matchL1_gpu, + matchL1_gpu, matchL1_gpu }, { - 0/*radiusMatchSingleL2_gpu*/, 0/*radiusMatchSingleL2_gpu*/, 0/*radiusMatchSingleL2_gpu*/, - 0/*radiusMatchSingleL2_gpu*/, 0/*radiusMatchSingleL2_gpu*/, radiusMatchSingleL2_gpu, 0, 0 + 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, + 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, + 0/*matchL2_gpu*/, matchL2_gpu }, { - radiusMatchSingleHamming_gpu, 0/*radiusMatchSingleHamming_gpu*/, radiusMatchSingleHamming_gpu, - 0/*radiusMatchSingleHamming_gpu*/, radiusMatchSingleHamming_gpu, 0, 0, 0 + matchHamming_gpu, 0/*matchHamming_gpu*/, + matchHamming_gpu, 0/*matchHamming_gpu*/, + matchHamming_gpu, 0/*matchHamming_gpu*/ } }; - CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS)); + DeviceInfo info; + int cc = info.majorVersion() * 10 + info.minorVersion(); - const int nQuery = queryDescs.rows; - const int nTrain = trainDescs.rows; + CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && info.supports(GLOBAL_ATOMICS)); - CV_Assert(queryDescs.channels() == 1 && queryDescs.depth() < CV_64F); - CV_Assert(trainDescs.type() == queryDescs.type() && trainDescs.cols == queryDescs.cols); + const int nQuery = query.rows; + const int nTrain = train.rows; + + CV_Assert(query.channels() == 1 && query.depth() < CV_64F); + CV_Assert(train.type() == query.type() && train.cols == query.cols); CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size())); ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches); if (trainIdx.empty()) { - ensureSizeIsEnough(nQuery, nTrain / 2, CV_32SC1, trainIdx); - ensureSizeIsEnough(nQuery, nTrain / 2, CV_32FC1, distance); + ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32SC1, trainIdx); + ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32FC1, distance); } - radiusMatch_caller_t func = radiusMatch_callers[distType][queryDescs.depth()]; - CV_Assert(func != 0); + caller_t func = callers[distType][query.depth()]; + CV_Assert(func != 0); - func(queryDescs, trainDescs, maxDistance, mask, trainIdx, distance, nMatches, StreamAccessor::getStream(stream)); + func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream)); } void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches, - vector< vector >& matches, bool compactResult) + vector< vector >& matches, bool compactResult) { if (trainIdx.empty() || distance.empty() || nMatches.empty()) return; @@ -673,14 +822,14 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trai } void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches, - vector< vector >& matches, bool compactResult) + vector< vector >& matches, bool compactResult) { if (trainIdx.empty() || distance.empty() || nMatches.empty()) return; CV_Assert(trainIdx.type() == CV_32SC1); - CV_Assert(nMatches.type() == CV_32SC1 && nMatches.isContinuous() && nMatches.cols >= trainIdx.rows); CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size()); + CV_Assert(nMatches.type() == CV_32SC1 && nMatches.cols == trainIdx.rows); const int nQuery = trainIdx.rows; @@ -688,6 +837,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx matches.reserve(nQuery); const int* nMatches_ptr = nMatches.ptr(); + for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx) { const 
int* trainIdx_ptr = trainIdx.ptr(queryIdx); @@ -720,66 +870,75 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx } } -void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& query, const GpuMat& train, vector< vector >& matches, float maxDistance, const GpuMat& mask, bool compactResult) { GpuMat trainIdx, distance, nMatches; - radiusMatchSingle(queryDescs, trainDescs, trainIdx, distance, nMatches, maxDistance, mask); + radiusMatchSingle(query, train, trainIdx, distance, nMatches, maxDistance, mask); radiusMatchDownload(trainIdx, distance, nMatches, matches, compactResult); } -void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection, - GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, - const GpuMat& maskCollection, Stream& stream) +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, + float maxDistance, const vector& masks, Stream& stream) { - if (queryDescs.empty() || trainCollection.empty()) + if (query.empty() || empty()) return; using namespace cv::gpu::bf_radius_match; - typedef void (*radiusMatch_caller_t)(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, - const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, - cudaStream_t stream); + typedef void (*caller_t)(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream); - static const radiusMatch_caller_t radiusMatch_callers[3][8] = + static const caller_t callers[3][6] = { { - radiusMatchCollectionL1_gpu, 0/*radiusMatchCollectionL1_gpu*/, radiusMatchCollectionL1_gpu, - radiusMatchCollectionL1_gpu, radiusMatchCollectionL1_gpu, radiusMatchCollectionL1_gpu, 0, 0 + matchL1_gpu, 0/*matchL1_gpu*/, + matchL1_gpu, matchL1_gpu, + matchL1_gpu, matchL1_gpu }, { - 0/*radiusMatchCollectionL2_gpu*/, 0/*radiusMatchCollectionL2_gpu*/, 0/*radiusMatchCollectionL2_gpu*/, - 0/*radiusMatchCollectionL2_gpu*/, 0/*radiusMatchCollectionL2_gpu*/, radiusMatchCollectionL2_gpu, 0, 0 + 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, + 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, + 0/*matchL2_gpu*/, matchL2_gpu }, { - radiusMatchCollectionHamming_gpu, 0/*radiusMatchCollectionHamming_gpu*/, radiusMatchCollectionHamming_gpu, - 0/*radiusMatchCollectionHamming_gpu*/, radiusMatchCollectionHamming_gpu, 0, 0, 0 + matchHamming_gpu, 0/*matchHamming_gpu*/, + matchHamming_gpu, 0/*matchHamming_gpu*/, + matchHamming_gpu, 0/*matchHamming_gpu*/ } }; - CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS)); + DeviceInfo info; + int cc = info.majorVersion() * 10 + info.minorVersion(); - const int nQuery = queryDescs.rows; + CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && info.supports(GLOBAL_ATOMICS)); - CV_Assert(queryDescs.channels() == 1 && queryDescs.depth() < CV_64F); + const int nQuery = query.rows; + + CV_Assert(query.channels() == 1 && query.depth() < CV_64F); CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size() && trainIdx.size() == imgIdx.size())); ensureSizeIsEnough(1, nQuery, 
CV_32SC1, nMatches); if (trainIdx.empty()) { - ensureSizeIsEnough(nQuery, nQuery / 2, CV_32SC1, trainIdx); - ensureSizeIsEnough(nQuery, nQuery / 2, CV_32SC1, imgIdx); - ensureSizeIsEnough(nQuery, nQuery / 2, CV_32FC1, distance); + ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32SC1, trainIdx); + ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32SC1, imgIdx); + ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32FC1, distance); } - radiusMatch_caller_t func = radiusMatch_callers[distType][queryDescs.depth()]; + caller_t func = callers[distType][query.depth()]; CV_Assert(func != 0); - func(queryDescs, trainCollection, maxDistance, maskCollection, trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream)); + vector trains_(trainDescCollection.begin(), trainDescCollection.end()); + vector masks_(masks.begin(), masks.end()); + + func(query, &trains_[0], static_cast(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0], + trainIdx, imgIdx, distance, nMatches, cc, StreamAccessor::getStream(stream)); } void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches, - vector< vector >& matches, bool compactResult) + vector< vector >& matches, bool compactResult) { if (trainIdx.empty() || imgIdx.empty() || distance.empty() || nMatches.empty()) return; @@ -801,7 +960,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx CV_Assert(trainIdx.type() == CV_32SC1); CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.size() == trainIdx.size()); CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size()); - CV_Assert(nMatches.type() == CV_32SC1 && nMatches.isContinuous() && nMatches.cols >= trainIdx.rows); + CV_Assert(nMatches.type() == CV_32SC1 && nMatches.cols == trainIdx.rows); const int nQuery = trainIdx.rows; @@ -809,6 +968,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx matches.reserve(nQuery); const int* nMatches_ptr = nMatches.ptr(); + for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx) { const int* trainIdx_ptr = trainIdx.ptr(queryIdx); @@ -843,18 +1003,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx } } -void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, vector< vector >& matches, +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& query, vector< vector >& matches, float maxDistance, const vector& masks, bool compactResult) { - GpuMat trainCollection; - GpuMat maskCollection; - - makeGpuCollection(trainCollection, maskCollection, masks); - GpuMat trainIdx, imgIdx, distance, nMatches; - - radiusMatchCollection(queryDescs, trainCollection, trainIdx, imgIdx, distance, nMatches, maxDistance, maskCollection); - + radiusMatchCollection(query, trainIdx, imgIdx, distance, nMatches, maxDistance, masks); radiusMatchDownload(trainIdx, imgIdx, distance, nMatches, matches, compactResult); } diff --git a/modules/gpu/src/cuda/bf_knnmatch.cu b/modules/gpu/src/cuda/bf_knnmatch.cu index d67c92aa13..c808509979 100644 --- a/modules/gpu/src/cuda/bf_knnmatch.cu +++ b/modules/gpu/src/cuda/bf_knnmatch.cu @@ -49,153 +49,334 @@ using namespace cv::gpu::device; namespace cv { namespace gpu { namespace bf_knnmatch { - template - __device__ void distanceCalcLoop(const PtrStep_& query, const DevMem2D_& train, const Mask& m, int queryIdx, - typename Dist::result_type& distMin1, typename Dist::result_type& 
distMin2, int& bestTrainIdx1, int& bestTrainIdx2, - typename Dist::result_type* smem) + /////////////////////////////////////////////////////////////////////////////// + // Reduction + + template + __device__ void findBestMatch(float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, + float* s_distance, int* s_trainIdx) { - const VecDiff vecDiff(query.ptr(queryIdx), train.cols, (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x); - - typename Dist::result_type* sdiffRow = smem + blockDim.x * threadIdx.y; - - distMin1 = numeric_limits::max(); - distMin2 = numeric_limits::max(); + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; - bestTrainIdx1 = -1; - bestTrainIdx2 = -1; + s_distance += threadIdx.y * BLOCK_SIZE; + s_trainIdx += threadIdx.y * BLOCK_SIZE; - for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y) + s_distance[threadIdx.x] = bestDistance1; + s_trainIdx[threadIdx.x] = bestTrainIdx1; + + __syncthreads(); + + if (threadIdx.x == 0) { - if (m(queryIdx, trainIdx)) + #pragma unroll + for (int i = 0; i < BLOCK_SIZE; ++i) { - Dist dist; + float val = s_distance[i]; - const T* trainRow = train.ptr(trainIdx); - - vecDiff.calc(trainRow, train.cols, dist, sdiffRow, threadIdx.x); - - const typename Dist::result_type val = dist; - - if (val < distMin1) + if (val < myBestDistance1) { - distMin1 = val; + myBestDistance2 = myBestDistance1; + myBestTrainIdx2 = myBestTrainIdx1; + + myBestDistance1 = val; + myBestTrainIdx1 = s_trainIdx[i]; + } + else if (val < myBestDistance2) + { + myBestDistance2 = val; + myBestTrainIdx2 = s_trainIdx[i]; + } + } + } + + __syncthreads(); + + s_distance[threadIdx.x] = bestDistance2; + s_trainIdx[threadIdx.x] = bestTrainIdx2; + + __syncthreads(); + + if (threadIdx.x == 0) + { + #pragma unroll + for (int i = 0; i < BLOCK_SIZE; ++i) + { + float val = s_distance[i]; + + if (val < myBestDistance2) + { + myBestDistance2 = val; + myBestTrainIdx2 = s_trainIdx[i]; + } + } + } + + bestDistance1 = myBestDistance1; + bestDistance2 = myBestDistance2; + + bestTrainIdx1 = myBestTrainIdx1; + bestTrainIdx2 = myBestTrainIdx2; + } + + template + __device__ void findBestMatch(float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, + int& bestImgIdx1, int& bestImgIdx2, + float* s_distance, int* s_trainIdx, int* s_imgIdx) + { + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + int myBestImgIdx1 = -1; + int myBestImgIdx2 = -1; + + s_distance += threadIdx.y * BLOCK_SIZE; + s_trainIdx += threadIdx.y * BLOCK_SIZE; + s_imgIdx += threadIdx.y * BLOCK_SIZE; + + s_distance[threadIdx.x] = bestDistance1; + s_trainIdx[threadIdx.x] = bestTrainIdx1; + s_imgIdx[threadIdx.x] = bestImgIdx1; + + __syncthreads(); + + if (threadIdx.x == 0) + { + #pragma unroll + for (int i = 0; i < BLOCK_SIZE; ++i) + { + float val = s_distance[i]; + + if (val < myBestDistance1) + { + myBestDistance2 = myBestDistance1; + myBestTrainIdx2 = myBestTrainIdx1; + myBestImgIdx2 = myBestImgIdx1; + + myBestDistance1 = val; + myBestTrainIdx1 = s_trainIdx[i]; + myBestImgIdx1 = s_imgIdx[i]; + } + else if (val < myBestDistance2) + { + myBestDistance2 = val; + myBestTrainIdx2 = s_trainIdx[i]; + myBestImgIdx2 = s_imgIdx[i]; + } + } + } + + __syncthreads(); + + s_distance[threadIdx.x] = bestDistance2; + 
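// findBestMatch above keeps the two smallest distances per query by
// running the shared-memory reduction twice: once seeded with each
// thread's best candidate and once with its second best, since the
// global runner-up can live in either pool. A compact standalone kernel
// showing the same two-pass idea (top2_demo and DEMO_BLOCK are
// illustrative; launch as top2_demo<<<1, DEMO_BLOCK>>>(d_dist, n, d_out)):

#include <cfloat>

#define DEMO_BLOCK 32

__global__ void top2_demo(const float* dist, int n, float* out)
{
    __shared__ float s_val[DEMO_BLOCK];

    // Per-thread best and second best over a strided slice of dist.
    float best1 = FLT_MAX, best2 = FLT_MAX;
    for (int i = threadIdx.x; i < n; i += DEMO_BLOCK)
    {
        const float v = dist[i];
        if (v < best1) { best2 = best1; best1 = v; }
        else if (v < best2) { best2 = v; }
    }

    float g1 = FLT_MAX, g2 = FLT_MAX;

    // Pass 1: reduce the per-thread minima.
    s_val[threadIdx.x] = best1;
    __syncthreads();
    if (threadIdx.x == 0)
    {
        for (int i = 0; i < DEMO_BLOCK; ++i)
        {
            const float v = s_val[i];
            if (v < g1) { g2 = g1; g1 = v; }
            else if (v < g2) { g2 = v; }
        }
    }
    __syncthreads();

    // Pass 2: the global second best may be some thread's second best.
    s_val[threadIdx.x] = best2;
    __syncthreads();
    if (threadIdx.x == 0)
    {
        for (int i = 0; i < DEMO_BLOCK; ++i)
            if (s_val[i] < g2) g2 = s_val[i];
        out[0] = g1;
        out[1] = g2;
    }
}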
s_trainIdx[threadIdx.x] = bestTrainIdx2; + s_imgIdx[threadIdx.x] = bestImgIdx2; + + __syncthreads(); + + if (threadIdx.x == 0) + { + #pragma unroll + for (int i = 0; i < BLOCK_SIZE; ++i) + { + float val = s_distance[i]; + + if (val < myBestDistance2) + { + myBestDistance2 = val; + myBestTrainIdx2 = s_trainIdx[i]; + myBestImgIdx2 = s_imgIdx[i]; + } + } + } + + bestDistance1 = myBestDistance1; + bestDistance2 = myBestDistance2; + + bestTrainIdx1 = myBestTrainIdx1; + bestTrainIdx2 = myBestTrainIdx2; + + bestImgIdx1 = myBestImgIdx1; + bestImgIdx2 = myBestImgIdx2; + } + + /////////////////////////////////////////////////////////////////////////////// + // Match Unrolled Cached + + template + __device__ void loadQueryToSmem(int queryIdx, const DevMem2D_& query, U* s_query) + { + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; + s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(min(queryIdx, query.rows - 1))[loadX] : 0; + } + } + + template + __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, + int& bestImgIdx1, int& bestImgIdx2) + { + for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) + { + Dist dist; + + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; + + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = loadX < train.cols ? train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX] : 0; + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + + __syncthreads(); + } + + typename Dist::result_type distVal = dist; + + const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + + if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx)) + { + if (distVal < bestDistance1) + { + bestImgIdx2 = bestImgIdx1; + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + + bestImgIdx1 = imgIdx; + bestDistance1 = distVal; bestTrainIdx1 = trainIdx; } - else if (val < distMin2) + else if (distVal < bestDistance2) { - distMin2 = val; + bestImgIdx2 = imgIdx; + bestDistance2 = distVal; bestTrainIdx2 = trainIdx; } } } } - template - __global__ void knnMatch2(const PtrStep_ query, const DevMem2D_ train, const Mask m, int2* trainIdx, float2* distance) + template + __global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) { - typedef typename Dist::result_type result_type; - typedef typename Dist::value_type value_type; + extern __shared__ int smem[]; - __shared__ result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - const int queryIdx = blockIdx.x; + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); - result_type distMin1; - result_type distMin2; + loadQueryToSmem(queryIdx, query, s_query); - int bestTrainIdx1; - int bestTrainIdx2; + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 
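// loadQueryToSmem above makes the cooperative load safe at the ragged
// edges of the problem: out-of-range rows are clamped with
// min(queryIdx, query.rows - 1) so every thread still issues a valid
// read, and columns past query.cols are zero-filled so the padding
// cannot perturb the accumulated distance. A minimal device-side sketch
// of that clamp-and-pad load (loadRowPadded, DEMO_BS and DEMO_LEN are
// illustrative):

#define DEMO_BS  16
#define DEMO_LEN 64

// Stages one descriptor row into shared memory; 'rows'/'cols' are the
// real matrix extent and 'stride' the row pitch in elements.
__device__ void loadRowPadded(const float* base, int stride, int rows, int cols,
                              int rowIdx, float* s_row)
{
    const float* row = base + min(rowIdx, rows - 1) * stride;

    #pragma unroll
    for (int i = 0; i < DEMO_LEN / DEMO_BS; ++i)
    {
        const int x = threadIdx.x + i * DEMO_BS;
        s_row[x] = x < cols ? row[x] : 0.0f; // zero padding past the row end
    }
}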
= -1; + int myBestTrainIdx2 = -1; + + loopUnrolledCached(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2); - distanceCalcLoop(query, train, m, queryIdx, distMin1, distMin2, bestTrainIdx1, bestTrainIdx2, smem); __syncthreads(); - volatile result_type* sdistMinRow = smem; - volatile int* sbestTrainIdxRow = (int*)(sdistMinRow + 2 * BLOCK_DIM_Y); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - if (threadIdx.x == 0) + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx); + + if (queryIdx < query.rows && threadIdx.x == 0) { - sdistMinRow[threadIdx.y] = distMin1; - sdistMinRow[threadIdx.y + BLOCK_DIM_Y] = distMin2; - - sbestTrainIdxRow[threadIdx.y] = bestTrainIdx1; - sbestTrainIdxRow[threadIdx.y + BLOCK_DIM_Y] = bestTrainIdx2; - } - __syncthreads(); - - if (threadIdx.x == 0 && threadIdx.y == 0) - { - distMin1 = numeric_limits::max(); - distMin2 = numeric_limits::max(); - - bestTrainIdx1 = -1; - bestTrainIdx2 = -1; - - #pragma unroll - for (int i = 0; i < BLOCK_DIM_Y; ++i) - { - result_type val = sdistMinRow[i]; - - if (val < distMin1) - { - distMin1 = val; - bestTrainIdx1 = sbestTrainIdxRow[i]; - } - else if (val < distMin2) - { - distMin2 = val; - bestTrainIdx2 = sbestTrainIdxRow[i]; - } - } - - #pragma unroll - for (int i = BLOCK_DIM_Y; i < 2 * BLOCK_DIM_Y; ++i) - { - result_type val = sdistMinRow[i]; - - if (val < distMin2) - { - distMin2 = val; - bestTrainIdx2 = sbestTrainIdxRow[i]; - } - } - - trainIdx[queryIdx] = make_int2(bestTrainIdx1, bestTrainIdx2); - distance[queryIdx] = make_float2(distMin1, distMin2); + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); } } - /////////////////////////////////////////////////////////////////////////////// - // Knn 2 Match kernel caller - - template - void knnMatch2Simple_caller(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& distance, - cudaStream_t stream) + template + void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, + cudaStream_t stream) { - const dim3 grid(query.rows, 1, 1); - const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - knnMatch2, Dist, T> - <<>>(query, train, mask, trainIdx, distance); + const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? 
MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + matchUnrolledCached<<>>(query, train, mask, trainIdx.data, distance.data); cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } - template - void knnMatch2Cached_caller(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& distance, - cudaStream_t stream) + template + __global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) { - StaticAssert= MAX_LEN>::check(); // block size must be greter than descriptors length - StaticAssert::check(); // max descriptors length must divide to blockDimX + extern __shared__ int smem[]; - const dim3 grid(query.rows, 1, 1); - const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - knnMatch2, Dist, T> - <<>>(query, train, mask, trainIdx.data, distance.data); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); + + loadQueryToSmem(queryIdx, query, s_query); + + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + int myBestImgIdx1 = -1; + int myBestImgIdx2 = -1; + + Mask m = mask; + + for (int imgIdx = 0; imgIdx < n; ++imgIdx) + { + const DevMem2D_ train = trains[imgIdx]; + m.next(); + loopUnrolledCached(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2); + } + + __syncthreads(); + + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); + + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx); + + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); + } + } + + template + void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + + const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? 
MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + matchUnrolledCached<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); cudaSafeCall( cudaGetLastError() ); if (stream == 0) @@ -203,142 +384,597 @@ namespace cv { namespace gpu { namespace bf_knnmatch } /////////////////////////////////////////////////////////////////////////////// - // Knn 2 Match Dispatcher - - template - void knnMatch2Dispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, - int cc, cudaStream_t stream) + // Match Unrolled + + template + __device__ void loopUnrolled(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, + int& bestImgIdx1, int& bestImgIdx2) { - if (query.cols < 64) + for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) { - knnMatch2Cached_caller<16, 16, 64, false, Dist>( - query, train, mask, - static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), - stream); + Dist dist; + + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; + + if (loadX < query.cols) + { + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX]; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX]; + } + else + { + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + + __syncthreads(); + } + + typename Dist::result_type distVal = dist; + + const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + + if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx)) + { + if (distVal < bestDistance1) + { + bestImgIdx2 = bestImgIdx1; + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + + bestImgIdx1 = imgIdx; + bestDistance1 = distVal; + bestTrainIdx1 = trainIdx; + } + else if (distVal < bestDistance2) + { + bestImgIdx2 = imgIdx; + bestDistance2 = distVal; + bestTrainIdx2 = trainIdx; + } + } } - else if (query.cols == 64) + } + + template + __global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) + { + extern __shared__ int smem[]; + + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + + loopUnrolled(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2); + + __syncthreads(); + + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx); + + if (queryIdx < query.rows && threadIdx.x == 0) 
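// The collection kernels above walk the image list inside one launch,
// copying the mask functor and advancing it with m.next() before every
// image so that mask(queryIdx, trainIdx) always consults the current
// image's mask. A hedged sketch of that cursor idea (MaskCursor is
// illustrative and much simpler than the real mask functors):

struct MaskCursor
{
    const unsigned char* const* perImage; // one mask base pointer per image, may be 0
    int step;                             // mask row pitch in bytes
    int current;                          // index of the image being matched

    // Called once per image, exactly like m.next() in the kernels above.
    __host__ __device__ void next() { ++current; }

    __host__ __device__ bool operator()(int queryIdx, int trainIdx) const
    {
        const unsigned char* m = perImage[current];
        return m == 0 || m[queryIdx * step + trainIdx] != 0;
    }
};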
{ - knnMatch2Cached_caller<16, 16, 64, true, Dist>( - query, train, mask, - static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), - stream); + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); } - else if (query.cols < 128) + } + + template + void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + matchUnrolled<<>>(query, train, mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + template + __global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) + { + extern __shared__ int smem[]; + + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + int myBestImgIdx1 = -1; + int myBestImgIdx2 = -1; + + Mask m = mask; + + for (int imgIdx = 0; imgIdx < n; ++imgIdx) { - knnMatch2Cached_caller<16, 16, 128, false, Dist>( - query, train, mask, - static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), - stream); + const DevMem2D_ train = trains[imgIdx]; + m.next(); + loopUnrolled(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2); } - else if (query.cols == 128 && cc >= 12) + + __syncthreads(); + + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); + + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx); + + if (queryIdx < query.rows && threadIdx.x == 0) { - knnMatch2Cached_caller<16, 16, 128, true, Dist>( - query, train, mask, - static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), - stream); + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); } - else if (query.cols < 256 && cc >= 12) + } + + template + void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + + const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + matchUnrolled<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + /////////////////////////////////////////////////////////////////////////////// + // Match + + template + __device__ void loop(int queryIdx, 
const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, + int& bestImgIdx1, int& bestImgIdx2) + { + for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) { - knnMatch2Cached_caller<16, 16, 256, false, Dist>( - query, train, mask, - static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), - stream); + Dist dist; + + for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; + + if (loadX < query.cols) + { + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX]; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX]; + } + else + { + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + + __syncthreads(); + } + + typename Dist::result_type distVal = dist; + + const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + + if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx)) + { + if (distVal < bestDistance1) + { + bestImgIdx2 = bestImgIdx1; + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + + bestImgIdx1 = imgIdx; + bestDistance1 = distVal; + bestTrainIdx1 = trainIdx; + } + else if (distVal < bestDistance2) + { + bestImgIdx2 = imgIdx; + bestDistance2 = distVal; + bestTrainIdx2 = trainIdx; + } + } } - else if (query.cols == 256 && cc >= 12) + } + + template + __global__ void match(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) + { + extern __shared__ int smem[]; + + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + + loop(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2); + + __syncthreads(); + + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx); + + if (queryIdx < query.rows && threadIdx.x == 0) { - knnMatch2Cached_caller<16, 16, 256, true, Dist>( - query, train, mask, - static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), - stream); + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); + } + } + + template + void match(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + match<<>>(query, train, 
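// The generic loop above is the tiled core of the matcher: query and
// train blocks are staged in BLOCK_SIZE x BLOCK_SIZE shared-memory tiles
// (the train tile stored transposed), each thread accumulates the
// distance of one (query, train) pair across tiles, and __syncthreads()
// fences every tile swap. A self-contained L2 instance of the same
// pattern (tiledL2 and TILE are illustrative; launch with
// block(TILE, TILE) and grid(divUp(nTrain, TILE), divUp(nQuery, TILE))):

#define TILE 16

__global__ void tiledL2(const float* q, const float* t, float* d,
                        int nQuery, int nTrain, int len)
{
    __shared__ float s_q[TILE][TILE];
    __shared__ float s_t[TILE][TILE];

    const int queryIdx = blockIdx.y * TILE + threadIdx.y;
    const int trainIdx = blockIdx.x * TILE + threadIdx.x;

    float acc = 0.0f;

    for (int i = 0; i < (len + TILE - 1) / TILE; ++i)
    {
        const int x  = i * TILE + threadIdx.x;
        const int qr = min(queryIdx, nQuery - 1);
        const int tr = min(blockIdx.x * TILE + threadIdx.y, nTrain - 1);

        // Clamp rows and zero-pad columns, as in the kernels above.
        s_q[threadIdx.y][threadIdx.x] = x < len ? q[qr * len + x] : 0.0f;
        s_t[threadIdx.x][threadIdx.y] = x < len ? t[tr * len + x] : 0.0f; // transposed store

        __syncthreads();

        #pragma unroll
        for (int j = 0; j < TILE; ++j)
        {
            const float diff = s_q[threadIdx.y][j] - s_t[j][threadIdx.x];
            acc += diff * diff; // what Dist::reduceIter does for L2
        }

        __syncthreads();
    }

    if (queryIdx < nQuery && trainIdx < nTrain)
        d[queryIdx * nTrain + trainIdx] = acc;
}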
mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + template + __global__ void match(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) + { + extern __shared__ int smem[]; + + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + int myBestImgIdx1 = -1; + int myBestImgIdx2 = -1; + + Mask m = mask; + + for (int imgIdx = 0; imgIdx < n; ++imgIdx) + { + const DevMem2D_ train = trains[imgIdx]; + m.next(); + loop(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2); + } + + __syncthreads(); + + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); + + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx); + + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); + } + } + + template + void match(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + + const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + match<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + /////////////////////////////////////////////////////////////////////////////// + // knnMatch 2 dispatcher + + template + void match2Dispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) + { + if (query.cols <= 64) + { + matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 128) + { + matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 256) + { + matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 512) + { + matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 1024) + { + matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); } else { - knnMatch2Simple_caller<16, 16, Dist>( - query, train, mask, - static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ 
>(distance), - stream); + match<16, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); } } - + + template + void match2Dispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) + { + if (query.cols <= 64) + { + matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 128) + { + matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 256) + { + matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 512) + { + matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 1024) + { + matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + } + else + { + match<16, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + } + } + /////////////////////////////////////////////////////////////////////////////// // Calc distance kernel - template - __global__ void calcDistance(const PtrStep_ query, const DevMem2D_ train, const Mask mask, PtrStepf distance) + template + __global__ void calcDistanceUnrolled(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, PtrStepf allDist) { - __shared__ typename Dist::result_type sdiff[BLOCK_DIM_X * BLOCK_DIM_Y]; + extern __shared__ int smem[]; - typename Dist::result_type* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y; - - const int queryIdx = blockIdx.x; - const T* queryDescs = query.ptr(queryIdx); + const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; + const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; - const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - if (trainIdx < train.rows) + Dist dist; + + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) { - const T* trainDescs = train.ptr(trainIdx); + const int loadX = threadIdx.x + i * BLOCK_SIZE; - typename Dist::result_type myDist = numeric_limits::max(); + if (loadX < query.cols) + { + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX]; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX]; + } + else + { + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + + __syncthreads(); + } + + if (queryIdx < query.rows && trainIdx < train.rows) + { + float distVal = 
numeric_limits::max(); if (mask(queryIdx, trainIdx)) - { - Dist dist; + distVal = (typename Dist::result_type)dist; - calcVecDiffGlobal(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x); - - myDist = dist; - } - - if (threadIdx.x == 0) - distance.ptr(queryIdx)[trainIdx] = myDist; + allDist.ptr(queryIdx)[trainIdx] = distVal; } } - /////////////////////////////////////////////////////////////////////////////// - // Calc distance kernel caller - - template - void calcDistance_caller(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2Df& distance, cudaStream_t stream) + template + void calcDistanceUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream) { - const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); - const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1); + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); - calcDistance<<>>(query, train, mask, distance); + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + calcDistanceUnrolled<<>>(query, train, mask, allDist); cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } - template - void calcDistanceDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2D& allDist, cudaStream_t stream) + template + __global__ void calcDistance(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, PtrStepf allDist) { - calcDistance_caller<16, 16, Dist>(query, train, mask, static_cast(allDist), stream); + extern __shared__ int smem[]; + + const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; + const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + + Dist dist; + + for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; + + if (loadX < query.cols) + { + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX]; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX]; + } + else + { + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + + __syncthreads(); + } + + if (queryIdx < query.rows && trainIdx < train.rows) + { + float distVal = numeric_limits::max(); + + if (mask(queryIdx, trainIdx)) + distVal = (typename Dist::result_type)dist; + + allDist.ptr(queryIdx)[trainIdx] = distVal; + } + } + + template + void calcDistance(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + calcDistance<<>>(query, train, mask, allDist); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + 
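// Every dispatcher in this patch, including calcDistanceDispatcher just
// below, picks a compile-time descriptor width: each supported width up
// to 1024 gets its own unrolled instantiation (MAX_DESC_LEN is chosen so
// the inner loops run MAX_DESC_LEN / BLOCK_SIZE whole tiles under
// #pragma unroll), and anything wider falls back to the fully generic
// kernel. A condensed sketch of that selection (widthDispatch and
// runUnrolled are illustrative names):

#include <cstdio>

template <int MAX_DESC_LEN>
void runUnrolled(int cols) { std::printf("unrolled %d (cols=%d)\n", MAX_DESC_LEN, cols); }

void runGeneric(int cols) { std::printf("generic (cols=%d)\n", cols); }

void widthDispatch(int cols)
{
    if (cols <= 64)        runUnrolled<64>(cols);
    else if (cols <= 128)  runUnrolled<128>(cols);
    else if (cols <= 256)  runUnrolled<256>(cols);
    else if (cols <= 512)  runUnrolled<512>(cols);
    else if (cols <= 1024) runUnrolled<1024>(cols);
    else                   runGeneric(cols); // no unrolled variant past 1024 columns
}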
/////////////////////////////////////////////////////////////////////////////// + // Calc Distance dispatcher + + template + void calcDistanceDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Df& allDist, + int cc, cudaStream_t stream) + { + if (query.cols <= 64) + { + calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream); + } + else if (query.cols <= 128) + { + calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream); + } + else if (query.cols <= 256) + { + calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream); + } + else if (query.cols <= 512) + { + calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream); + } + else if (query.cols <= 1024) + { + calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream); + } + else + { + calcDistance<16, Dist>(query, train, mask, allDist, stream); + } } /////////////////////////////////////////////////////////////////////////////// // find knn match kernel - template __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_) + template + __global__ void findBestMatch(DevMem2Df allDist, int i, PtrStepi trainIdx, PtrStepf distance) { const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64; - __shared__ float sdist[SMEM_SIZE]; - __shared__ int strainIdx[SMEM_SIZE]; + __shared__ float s_dist[SMEM_SIZE]; + __shared__ int s_trainIdx[SMEM_SIZE]; const int queryIdx = blockIdx.x; - float* allDist = allDist_.ptr(queryIdx); - int* trainIdx = trainIdx_.ptr(queryIdx); - float* distance = distance_.ptr(queryIdx); + float* allDistRow = allDist.ptr(queryIdx); float dist = numeric_limits::max(); int bestIdx = -1; - for (int i = threadIdx.x; i < allDist_.cols; i += BLOCK_SIZE) + for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE) { - float reg = allDist[i]; + float reg = allDistRow[i]; if (reg < dist) { dist = reg; @@ -346,34 +982,32 @@ namespace cv { namespace gpu { namespace bf_knnmatch } } - sdist[threadIdx.x] = dist; - strainIdx[threadIdx.x] = bestIdx; + s_dist[threadIdx.x] = dist; + s_trainIdx[threadIdx.x] = bestIdx; __syncthreads(); - reducePredVal(sdist, dist, strainIdx, bestIdx, threadIdx.x, less()); + reducePredVal(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less()); if (threadIdx.x == 0) { if (dist < numeric_limits::max()) { - allDist[bestIdx] = numeric_limits::max(); - trainIdx[i] = bestIdx; - distance[i] = dist; + allDistRow[bestIdx] = numeric_limits::max(); + trainIdx.ptr(queryIdx)[i] = bestIdx; + distance.ptr(queryIdx)[i] = dist; } } } - - /////////////////////////////////////////////////////////////////////////////// - // find knn match kernel caller - template void findKnnMatch_caller(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) + template + void findKnnMatch(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) { - const dim3 threads(BLOCK_SIZE, 1, 1); + const dim3 block(BLOCK_SIZE, 1, 1); const dim3 grid(trainIdx.rows, 1, 1); for (int i = 0; i < k; ++i) { - findBestMatch<<>>(allDist, i, trainIdx, distance); + findBestMatch<<>>(allDist, i, trainIdx, distance); cudaSafeCall( cudaGetLastError() ); } @@ -381,84 +1015,130 @@ namespace cv { namespace gpu { namespace bf_knnmatch cudaSafeCall( cudaDeviceSynchronize() ); } - void findKnnMatchDispatcher(int k, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, cudaStream_t stream) + void 
findKnnMatchDispatcher(int k, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream) { - findKnnMatch_caller<256>(k, static_cast(trainIdx), static_cast(distance), static_cast(allDist), stream); + findKnnMatch<256>(k, static_cast(trainIdx), static_cast(distance), allDist, stream); } - + /////////////////////////////////////////////////////////////////////////////// // knn match Dispatcher - template - void knnMatchDispatcher(const DevMem2D_& query, const DevMem2D_& train, int k, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, + template + void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, int k, const Mask& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream) { - if (mask.data) + if (k == 2) { - if (k == 2) - { - knnMatch2Dispatcher(query, train, SingleMask(mask), trainIdx, distance, cc, stream); - return; - } - - calcDistanceDispatcher(query, train, SingleMask(mask), allDist, stream); + match2Dispatcher(query, train, mask, trainIdx, distance, cc, stream); } else { - if (k == 2) - { - knnMatch2Dispatcher(query, train, WithOutMask(), trainIdx, distance, cc, stream); - return; - } - - calcDistanceDispatcher(query, train, WithOutMask(), allDist, stream); + calcDistanceDispatcher(query, train, mask, allDist, cc, stream); + findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream); } - - findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream); - } + } /////////////////////////////////////////////////////////////////////////////// // knn match caller - template void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, + template void matchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream) { - knnMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, mask, trainIdx, distance, allDist, cc, stream); + if (mask.data) + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); + else + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); } - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - //template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const 
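// For k > 2, matchDispatcher above first materializes the full
// query x train distance matrix (allDist) and then launches findBestMatch
// k times: each pass takes the row minimum, records it as the i-th
// neighbour, and overwrites that slot with FLT_MAX so the next pass finds
// the next-best candidate. A host-side sketch of that
// select-and-invalidate loop for a single query row (knnRowDemo is
// illustrative):

#include <cfloat>
#include <vector>

void knnRowDemo(std::vector<float>& allDist, int k,
                std::vector<int>& trainIdx, std::vector<float>& distance)
{
    for (int i = 0; i < k; ++i)
    {
        int bestIdx = -1;
        float best = FLT_MAX;

        for (int j = 0; j < (int)allDist.size(); ++j)
            if (allDist[j] < best) { best = allDist[j]; bestIdx = j; }

        if (bestIdx < 0)
            break; // fewer than k candidates survived the mask

        trainIdx.push_back(bestIdx);
        distance.push_back(best);
        allDist[bestIdx] = FLT_MAX; // poison the slot, as the kernel does
    }
}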
DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + //template void matchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, + template void matchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream) { - knnMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, mask, trainIdx, distance, allDist, cc, stream); + if (mask.data) + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); + else + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); } - //template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - //template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - //template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - //template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - //template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const 
DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + template void matchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - template void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, + template void matchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream) { - knnMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, mask, trainIdx, distance, allDist, cc, stream); + if (mask.data) + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); + else + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); } - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - //template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - //template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& 
mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + + template void match2L1_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) + { + if (masks.data) + match2Dispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); + else + match2Dispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + } + + template void match2L1_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void match2L1_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void match2L1_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void match2L1_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void match2L1_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void match2L1_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + + template void match2L2_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) + { + if (masks.data) + match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); + else + match2Dispatcher(static_cast< DevMem2D_ >(query), (const 
DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + } + + //template void match2L2_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void match2L2_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void match2L2_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void match2L2_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void match2L2_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, const DevMem2D& trainIdx, const DevMem2Di& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void match2L2_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + + template void match2Hamming_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) + { + if (masks.data) + match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); + else + match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + } + + template void match2Hamming_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void match2Hamming_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void match2Hamming_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + //template void match2Hamming_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void match2Hamming_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); }}} diff --git a/modules/gpu/src/cuda/bf_match.cu b/modules/gpu/src/cuda/bf_match.cu index d2bb120a48..a2e1923f47 100644 --- a/modules/gpu/src/cuda/bf_match.cu +++ b/modules/gpu/src/cuda/bf_match.cu @@ -49,355 +49,715 @@ using namespace cv::gpu::device; namespace cv { namespace gpu { namespace bf_match { - template - __device__ void findBestMatch(T& myDist, int2& myIdx, T* smin, int2* sIdx) + /////////////////////////////////////////////////////////////////////////////// + // Reduction + + template + __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, 
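// The match2*_gpu entry points above receive the whole train collection
// as one device row of descriptor records and recover it with
// (const DevMem2D_*)trains.ptr(), using trains.cols as the image count.
// A hedged sketch of packing such a record array for the GPU (DeviceMat2D
// and uploadCollection are illustrative stand-ins; error checking
// omitted):

#include <cuda_runtime.h>
#include <vector>

struct DeviceMat2D // stand-in for a DevMem2D-style record: pointer + extent + pitch
{
    const unsigned char* data;
    int rows, cols;
    size_t step;
};

// Uploads one record per train image; a collection kernel can then index
// trains[imgIdx] on the device exactly as the kernels above do.
DeviceMat2D* uploadCollection(const std::vector<DeviceMat2D>& host)
{
    DeviceMat2D* dev = 0;
    cudaMalloc((void**)&dev, host.size() * sizeof(DeviceMat2D));
    cudaMemcpy(dev, &host[0], host.size() * sizeof(DeviceMat2D),
               cudaMemcpyHostToDevice);
    return dev;
}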
diff --git a/modules/gpu/src/cuda/bf_match.cu b/modules/gpu/src/cuda/bf_match.cu
index d2bb120a48..a2e1923f47 100644
--- a/modules/gpu/src/cuda/bf_match.cu
+++ b/modules/gpu/src/cuda/bf_match.cu
@@ -49,355 +49,715 @@ using namespace cv::gpu::device;
 namespace cv { namespace gpu { namespace bf_match
 {
-    template <int BLOCK_DIM_Y, typename T>
-    __device__ void findBestMatch(T& myDist, int2& myIdx, T* smin, int2* sIdx)
+    ///////////////////////////////////////////////////////////////////////////////
+    // Reduction
+
+    template <int BLOCK_SIZE>
+    __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx)
     {
-        if (threadIdx.x == 0)
-        {
-            smin[threadIdx.y] = myDist;
-            sIdx[threadIdx.y] = myIdx;
-        }
+        s_distance += threadIdx.y * BLOCK_SIZE;
+        s_trainIdx += threadIdx.y * BLOCK_SIZE;
+
+        s_distance[threadIdx.x] = bestDistance;
+        s_trainIdx[threadIdx.x] = bestTrainIdx;
+
         __syncthreads();
 
-        reducePredVal<BLOCK_DIM_Y>(smin, myDist, sIdx, myIdx, threadIdx.y * blockDim.x + threadIdx.x, less<volatile T, volatile T>());
+        reducePredVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<volatile float, volatile float>());
     }
 
-    template <typename Dist, typename VecDiff, typename T, typename Mask>
-    __device__ void matchDescs(int queryIdx, int imgIdx, const DevMem2D_<T>& train, const Mask& m, const VecDiff& vecDiff,
-        typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row)
+    template <int BLOCK_SIZE>
+    __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx)
     {
-        for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y)
-        {
-            if (m(queryIdx, trainIdx))
-            {
-                const T* trainDescs = train.ptr(trainIdx);
-
-                Dist dist;
-
-                vecDiff.calc(trainDescs, train.cols, dist, sdiff_row, threadIdx.x);
-
-                const typename Dist::result_type res = dist;
-
-                if (res < myDist)
-                {
-                    myDist = res;
-                    myIdx.x = trainIdx;
-                    myIdx.y = imgIdx;
-                }
-            }
-        }
+        s_distance += threadIdx.y * BLOCK_SIZE;
+        s_trainIdx += threadIdx.y * BLOCK_SIZE;
+        s_imgIdx   += threadIdx.y * BLOCK_SIZE;
+
+        s_distance[threadIdx.x] = bestDistance;
+        s_trainIdx[threadIdx.x] = bestTrainIdx;
+        s_imgIdx  [threadIdx.x] = bestImgIdx;
+
+        __syncthreads();
+
+        reducePredVal2<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less<volatile float, volatile float>());
+    }
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Match Unrolled Cached
+
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
+    __device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query)
+    {
+        #pragma unroll
+        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
+        {
+            const int loadX = threadIdx.x + i * BLOCK_SIZE;
+            s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(min(queryIdx, query.rows - 1))[loadX] : 0;
+        }
+    }
+
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+    __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
+                                       typename Dist::value_type* s_query, typename Dist::value_type* s_train,
+                                       float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
+    {
+        for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
+        {
+            Dist dist;
+
+            #pragma unroll
+            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
+            {
+                const int loadX = threadIdx.x + i * BLOCK_SIZE;
+
+                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = loadX < train.cols ? train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX] : 0;
+
+                __syncthreads();
+
+                #pragma unroll
+                for (int j = 0; j < BLOCK_SIZE; ++j)
+                    dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
+
+                __syncthreads();
+            }
+
+            typename Dist::result_type distVal = dist;
+
+            const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
+
+            if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
+            {
+                bestImgIdx = imgIdx;
+                bestDistance = distVal;
+                bestTrainIdx = trainIdx;
+            }
+        }
     }
 
-    template <typename T> struct SingleTrain
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+    __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
     {
-        explicit SingleTrain(const DevMem2D_<T>& train_) : train(train_)
-        {
-        }
-
-        template <typename Dist, typename VecDiff, typename Mask>
-        __device__ __forceinline__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff,
-            typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const
-        {
-            matchDescs<Dist>(queryIdx, 0, train, m, vecDiff, myDist, myIdx, sdiff_row);
-        }
-
-        __device__ __forceinline__ int desc_len() const
-        {
-            return train.cols;
-        }
-
-        static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx,
-            float myDist, const int2& myIdx, int queryIdx)
-        {
-            trainIdx[queryIdx] = myIdx.x;
-            distance[queryIdx] = myDist;
-        }
-
-        const DevMem2D_<T> train;
-    };
+        extern __shared__ int smem[];
+
+        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
+
+        loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
+
+        float myBestDistance = numeric_limits<float>::max();
+        int myBestTrainIdx = -1;
+
+        loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);
+
+        __syncthreads();
+
+        float* s_distance = (float*)(smem);
+        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+        findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);
+
+        if (queryIdx < query.rows && threadIdx.x == 0)
+        {
+            bestTrainIdx[queryIdx] = myBestTrainIdx;
+            bestDistance[queryIdx] = myBestDistance;
+        }
+    }
 
-    template <typename T> struct TrainCollection
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+    void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
+                             const DevMem2Di& trainIdx, const DevMem2Df& distance,
+                             cudaStream_t stream)
     {
-        TrainCollection(const DevMem2D_<T>* trainCollection_, int nImg_, int desclen_) :
-            trainCollection(trainCollection_), nImg(nImg_), desclen(desclen_)
-        {
-        }
-
-        template <typename Dist, typename VecDiff, typename Mask>
-        __device__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff,
-            typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const
-        {
-            for (int imgIdx = 0; imgIdx < nImg; ++imgIdx)
-            {
-                const DevMem2D_<T> train = trainCollection[imgIdx];
-                m.next();
-                matchDescs<Dist>(queryIdx, imgIdx, train, m, vecDiff, myDist, myIdx, sdiff_row);
-            }
-        }
-
-        __device__ __forceinline__ int desc_len() const
-        {
-            return desclen;
-        }
-
-        static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx,
-            float myDist, const int2& myIdx, int queryIdx)
-        {
-            trainIdx[queryIdx] = myIdx.x;
-            imgIdx[queryIdx] = myIdx.y;
-            distance[queryIdx] = myDist;
-        }
-
-        const DevMem2D_<T>* trainCollection;
-        const int nImg;
-        const int desclen;
-    };
+        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+        const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+        const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+        matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
 
-    template <typename VecDiff, typename Dist, typename T, typename Train, typename Mask>
-    __device__ void distanceCalcLoop(const PtrStep_<T>& query, const Train& train, const Mask& mask, int queryIdx,
-        typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* smem)
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+    __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask,
+                                        int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
     {
-        const VecDiff vecDiff(query.ptr(queryIdx), train.desc_len(), (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x);
-
-        typename Dist::result_type* sdiff_row = smem + blockDim.x * threadIdx.y;
+        extern __shared__ int smem[];
+
+        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
+
+        loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
+
+        float myBestDistance = numeric_limits<float>::max();
+        int myBestTrainIdx = -1;
+        int myBestImgIdx = -1;
 
         Mask m = mask;
 
-        myIdx.x = -1;
-        myIdx.y = -1;
-        myDist = numeric_limits<typename Dist::result_type>::max();
-
-        train.template loop<Dist>(queryIdx, m, vecDiff, myDist, myIdx, sdiff_row);
-    }
-
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename VecDiff, typename Dist, typename T, typename Train, typename Mask>
-    __global__ void match(const PtrStep_<T> query, const Train train, const Mask mask, int* trainIdx, int* imgIdx, float* distance)
-    {
-        __shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];
-
-        const int queryIdx = blockIdx.x;
-
-        int2 myIdx;
-        typename Dist::result_type myDist;
-
-        distanceCalcLoop<VecDiff, Dist>(query, train, mask, queryIdx, myDist, myIdx, smem);
+        for (int imgIdx = 0; imgIdx < n; ++imgIdx)
+        {
+            const DevMem2D_<T> train = trains[imgIdx];
+            m.next();
+            loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
+        }
+
         __syncthreads();
 
-        typename Dist::result_type* smin = smem;
-        int2* sIdx = (int2*)(smin + BLOCK_DIM_Y);
-
-        findBestMatch<BLOCK_DIM_Y>(myDist, myIdx, smin, sIdx);
-
-        if (threadIdx.x == 0 && threadIdx.y == 0)
-            Train::storeResult(distance, trainIdx, imgIdx, myDist, myIdx, queryIdx);
+        float* s_distance = (float*)(smem);
+        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+        int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
+
+        findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);
+
+        if (queryIdx < query.rows && threadIdx.x == 0)
+        {
+            bestTrainIdx[queryIdx] = myBestTrainIdx;
+            bestImgIdx[queryIdx] = myBestImgIdx;
+            bestDistance[queryIdx] = myBestDistance;
+        }
     }
 
-    ///////////////////////////////////////////////////////////////////////////////
-    // Match kernel caller
-
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Train, typename Mask>
-    void matchSimple_caller(const DevMem2D_<T>& query, const Train& train, const Mask& mask,
-        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
-        cudaStream_t stream)
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+    void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
+                             const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+                             cudaStream_t stream)
     {
-        StaticAssert<BLOCK_DIM_Y <= 64>::check(); // blockDimY vals must reduce by warp
-
-        const dim3 grid(query.rows, 1, 1);
-        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
-
-        match<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffGlobal<BLOCK_DIM_X, T>, Dist, T>
-            <<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data);
+        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+        const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+        const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+        matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
 
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T, typename Train, typename Mask>
-    void matchCached_caller(const DevMem2D_<T>& query, const Train& train, const Mask& mask,
-        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
-        cudaStream_t stream)
-    {
-        StaticAssert<BLOCK_DIM_Y <= 64>::check();                    // blockDimY vals must reduce by warp
-        StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_LEN>::check(); // block size must be greter than descriptors length
-        StaticAssert<MAX_LEN % BLOCK_DIM_X == 0>::check();           // max descriptors length must divide to blockDimX
-
-        const dim3 grid(query.rows, 1, 1);
-        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
-
-        match<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffCachedRegister<BLOCK_DIM_X, MAX_LEN, LEN_EQ_MAX_LEN, typename Dist::value_type>, Dist, T>
-            <<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
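The loops above never name a concrete metric; they only rely on the Dist functor contract: default-construct an accumulator, fold coordinate pairs with reduceIter(), then read the total back through a conversion to Dist::result_type. A minimal standalone model of that contract (toy L1 functor with hypothetical names, not the gpu module's L1Dist):

    // toy_dist.cu - the functor shape assumed by loopUnrolledCached and friends
    struct L1DistToy
    {
        typedef float result_type;

        float acc;

        __device__ L1DistToy() : acc(0.f) {}
        __device__ void reduceIter(float a, float b) { acc += ::fabsf(a - b); } // fold one coordinate pair
        __device__ operator float() const { return acc; }                       // final conversion yields the total
    };

    __global__ void demoDist(const float* a, const float* b, int len, float* out)
    {
        L1DistToy dist;                  // accumulator starts at zero
        for (int i = 0; i < len; ++i)
            dist.reduceIter(a[i], b[i]);
        *out = dist;                     // implicit conversion
    }

Swapping the metric (L1, L2, Hamming) then only changes the functor, never the tiling code.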
+    ///////////////////////////////////////////////////////////////////////////////
+    // Match Unrolled
+
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+    __device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
+                                 typename Dist::value_type* s_query, typename Dist::value_type* s_train,
+                                 float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
+    {
+        for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
+        {
+            Dist dist;
+
+            #pragma unroll
+            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
+            {
+                const int loadX = threadIdx.x + i * BLOCK_SIZE;
+
+                if (loadX < query.cols)
+                {
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
+                }
+                else
+                {
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
+                }
+
+                __syncthreads();
+
+                #pragma unroll
+                for (int j = 0; j < BLOCK_SIZE; ++j)
+                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
+
+                __syncthreads();
+            }
+
+            typename Dist::result_type distVal = dist;
+
+            const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
+
+            if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
+            {
+                bestImgIdx = imgIdx;
+                bestDistance = distVal;
+                bestTrainIdx = trainIdx;
+            }
+        }
+    }
+
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+    __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
+    {
+        extern __shared__ int smem[];
+
+        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+        float myBestDistance = numeric_limits<float>::max();
+        int myBestTrainIdx = -1;
+
+        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+        loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);
+
+        __syncthreads();
+
+        float* s_distance = (float*)(smem);
+        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+        findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);
+
+        if (queryIdx < query.rows && threadIdx.x == 0)
+        {
+            bestTrainIdx[queryIdx] = myBestTrainIdx;
+            bestDistance[queryIdx] = myBestDistance;
+        }
+    }
+
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+    void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
+                       const DevMem2Di& trainIdx, const DevMem2Df& distance,
+                       cudaStream_t stream)
+    {
+        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+        const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+        matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
 
-    ///////////////////////////////////////////////////////////////////////////////
-    // Match Dispatcher
-
-    template <typename Dist, typename T, typename Train, typename Mask>
-    void matchDispatcher(const DevMem2D_<T>& query, const Train& train, const Mask& mask,
-        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
-        int cc, cudaStream_t stream)
-    {
-        if (query.cols < 64)
-        {
-            matchCached_caller<16, 16, 64, false, Dist>(
-                query, train, mask,
-                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
-                stream);
-        }
-        else if (query.cols == 64)
-        {
-            matchCached_caller<16, 16, 64, true, Dist>(
-                query, train, mask,
-                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
-                stream);
-        }
-        else if (query.cols < 128)
-        {
-            matchCached_caller<16, 16, 128, false, Dist>(
-                query, train, mask,
-                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
-                stream);
-        }
-        else if (query.cols == 128 && cc >= 12)
-        {
-            matchCached_caller<16, 16, 128, true, Dist>(
-                query, train, mask,
-                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
-                stream);
-        }
-        else if (query.cols < 256 && cc >= 12)
-        {
-            matchCached_caller<16, 16, 256, false, Dist>(
-                query, train, mask,
-                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
-                stream);
-        }
-        else if (query.cols == 256 && cc >= 12)
-        {
-            matchCached_caller<16, 16, 256, true, Dist>(
-                query, train, mask,
-                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
-                stream);
-        }
-        else
-        {
-            matchSimple_caller<16, 16, Dist>(
-                query, train, mask,
-                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
-                stream);
-        }
-    }
-
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+    __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask,
+                                  int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
+    {
+        extern __shared__ int smem[];
+
+        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+        float myBestDistance = numeric_limits<float>::max();
+        int myBestTrainIdx = -1;
+        int myBestImgIdx = -1;
+
+        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+        Mask m = mask;
+
+        for (int imgIdx = 0; imgIdx < n; ++imgIdx)
+        {
+            const DevMem2D_<T> train = trains[imgIdx];
+            m.next();
+            loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
+        }
+
+        __syncthreads();
+
+        float* s_distance = (float*)(smem);
+        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+        int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
+
+        findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);
+
+        if (queryIdx < query.rows && threadIdx.x == 0)
+        {
+            bestTrainIdx[queryIdx] = myBestTrainIdx;
+            bestImgIdx[queryIdx] = myBestImgIdx;
+            bestDistance[queryIdx] = myBestDistance;
+        }
+    }
+
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+    void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
+                       const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+                       cudaStream_t stream)
+    {
+        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+        const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+        const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+        matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Match
+
+    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+    __device__ void loop(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
+                         typename Dist::value_type* s_query, typename Dist::value_type* s_train,
+                         float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
+    {
+        for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
+        {
+            Dist dist;
+
+            for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
+            {
+                const int loadX = threadIdx.x + i * BLOCK_SIZE;
+
+                if (loadX < query.cols)
+                {
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
+                }
+                else
+                {
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
+                }
+
+                __syncthreads();
+
+                #pragma unroll
+                for (int j = 0; j < BLOCK_SIZE; ++j)
+                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
+
+                __syncthreads();
+            }
+
+            typename Dist::result_type distVal = dist;
+
+            const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
+
+            if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
+            {
+                bestImgIdx = imgIdx;
+                bestDistance = distVal;
+                bestTrainIdx = trainIdx;
+            }
+        }
+    }
+
+    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+    __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
+    {
+        extern __shared__ int smem[];
+
+        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+        float myBestDistance = numeric_limits<float>::max();
+        int myBestTrainIdx = -1;
+
+        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+        loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);
+
+        __syncthreads();
+
+        float* s_distance = (float*)(smem);
+        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+        findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);
+
+        if (queryIdx < query.rows && threadIdx.x == 0)
+        {
+            bestTrainIdx[queryIdx] = myBestTrainIdx;
+            bestDistance[queryIdx] = myBestDistance;
+        }
+    }
+
+    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+    void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
+               const DevMem2Di& trainIdx, const DevMem2Df& distance,
+               cudaStream_t stream)
+    {
+        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+        const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+        match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+    __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask,
+                          int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
+    {
+        extern __shared__ int smem[];
+
+        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+        float myBestDistance = numeric_limits<float>::max();
+        int myBestTrainIdx = -1;
+        int myBestImgIdx = -1;
+
+        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+        Mask m = mask;
+        for (int imgIdx = 0; imgIdx < n; ++imgIdx)
+        {
+            const DevMem2D_<T> train = trains[imgIdx];
+            m.next();
+            loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
+        }
+
+        __syncthreads();
+
+        float* s_distance = (float*)(smem);
+        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+        int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
+
+        findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);
+
+        if (queryIdx < query.rows && threadIdx.x == 0)
+        {
+            bestTrainIdx[queryIdx] = myBestTrainIdx;
+            bestImgIdx[queryIdx] = myBestImgIdx;
+            bestDistance[queryIdx] = myBestDistance;
+        }
+    }
+
+    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+    void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
+               const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+               cudaStream_t stream)
+    {
+        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+        const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+        const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+        match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Match dispatcher
+
+    template <typename Dist, typename T, typename Mask>
+    void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
+                         const DevMem2Di& trainIdx, const DevMem2Df& distance,
+                         int cc, cudaStream_t stream)
+    {
+        if (query.cols <= 64)
+        {
+            matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);
+        }
+        else if (query.cols <= 128)
+        {
+            matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream);
+        }
+        else if (query.cols <= 256)
+        {
+            matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream);
+        }
+        else if (query.cols <= 512)
+        {
+            matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream);
+        }
+        else if (query.cols <= 1024)
+        {
+            matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream);
+        }
+        else
+        {
+            match<16, Dist>(query, train, mask, trainIdx, distance, stream);
+        }
+    }
+
+    template <typename Dist, typename T, typename Mask>
+    void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
+                         const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+                         int cc, cudaStream_t stream)
+    {
+        if (query.cols <= 64)
+        {
+            matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
+        }
+        else if (query.cols <= 128)
+        {
+            matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
+        }
+        else if (query.cols <= 256)
+        {
+            matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
+        }
+        else if (query.cols <= 512)
+        {
+            matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
+        }
+        else if (query.cols <= 1024)
+        {
+            matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
+        }
+        else
+        {
+            match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
+        }
+    }
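Both dispatchers above map the runtime descriptor width query.cols onto the nearest compile-time MAX_DESC_LEN bucket, so the kernels' #pragma unroll loops get constant trip counts; widths past 1024 fall through to the non-unrolled kernel. The same pattern in a standalone, compilable sketch (hypothetical names, not the patch's functions):

    // dispatch_sketch.cpp
    #include <cstdio>

    template <int MAX_LEN> void kernelBucket(int actualLen)
    {
        std::printf("len %d handled by the <= %d bucket\n", actualLen, MAX_LEN);
    }

    void genericKernel(int actualLen)
    {
        std::printf("len %d handled by the generic path\n", actualLen);
    }

    void dispatch(int len)
    {
        if      (len <=   64) kernelBucket<  64>(len);
        else if (len <=  128) kernelBucket< 128>(len);
        else if (len <=  256) kernelBucket< 256>(len);
        else if (len <=  512) kernelBucket< 512>(len);
        else if (len <= 1024) kernelBucket<1024>(len);
        else                  genericKernel(len);
    }

    int main() { dispatch(61); dispatch(500); dispatch(5000); return 0; }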
     ///////////////////////////////////////////////////////////////////////////////
     // Match caller
 
-    template <typename T> void matchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance,
-        int cc, cudaStream_t stream)
+    template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
+                                           const DevMem2Di& trainIdx, const DevMem2Df& distance,
+                                           int cc, cudaStream_t stream)
     {
-        SingleTrain<T> train(static_cast< DevMem2D_<T> >(train_));
-
         if (mask.data)
-            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream);
+        {
+            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask),
+                trainIdx, distance,
+                cc, stream);
+        }
         else
-            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream);
+        {
+            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(),
+                trainIdx, distance,
+                cc, stream);
+        }
     }
 
-    template void matchSingleL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    //template void matchSingleL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    template void matchSingleL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    template void matchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    template void matchSingleL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    template void matchSingleL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    //template void matchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    template void matchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    template void matchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    template void matchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    template void matchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
 
-    template <typename T> void matchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance,
-        int cc, cudaStream_t stream)
+    template <typename T> void matchL2_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
+                                           const DevMem2Di& trainIdx, const DevMem2Df& distance,
+                                           int cc, cudaStream_t stream)
     {
-        SingleTrain<T> train(static_cast< DevMem2D_<T> >(train_));
-
         if (mask.data)
-            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream);
+        {
+            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask),
+                trainIdx, distance,
+                cc, stream);
+        }
         else
-            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream);
+        {
+            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(),
+                trainIdx, distance,
+                cc, stream);
+        }
     }
 
-    //template void matchSingleL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    //template void matchSingleL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    //template void matchSingleL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    //template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    //template void matchSingleL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    //template void matchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    //template void matchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    //template void matchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    //template void matchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    //template void matchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    template void matchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
 
-    template <typename T> void matchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance,
-        int cc, cudaStream_t stream)
+    template <typename T> void matchHamming_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
+                                                const DevMem2Di& trainIdx, const DevMem2Df& distance,
+                                                int cc, cudaStream_t stream)
     {
-        SingleTrain<T> train(static_cast< DevMem2D_<T> >(train_));
-
         if (mask.data)
-            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream);
+        {
+            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask),
+                trainIdx, distance,
+                cc, stream);
+        }
         else
-            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream);
+        {
+            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(),
+                trainIdx, distance,
+                cc, stream);
+        }
     }
 
-    template void matchSingleHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    //template void matchSingleHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    template void matchSingleHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    //template void matchSingleHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    template void matchSingleHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    //template void matchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    template void matchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    //template void matchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    template void matchHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
 
-    template <typename T> void matchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
-        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
-        int cc, cudaStream_t stream)
+    template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
+                                           const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+                                           int cc, cudaStream_t stream)
     {
-        TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, query.cols);
-
-        if (maskCollection.data)
-            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream);
+        if (masks.data)
+        {
+            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
+                trainIdx, imgIdx, distance,
+                cc, stream);
+        }
         else
-            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
+        {
+            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(),
+                trainIdx, imgIdx, distance,
+                cc, stream);
+        }
     }
 
-    template void matchCollectionL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    //template void matchCollectionL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    template void matchCollectionL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    template void matchCollectionL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    template void matchCollectionL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    template void matchCollectionL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchL1_gpu<uchar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    //template void matchL1_gpu<schar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    template void matchL1_gpu<ushort>(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    template void matchL1_gpu<short >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    template void matchL1_gpu<int   >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    template void matchL1_gpu<float >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
 
-    template <typename T> void matchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
-        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
-        int cc, cudaStream_t stream)
+    template <typename T> void matchL2_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
+                                           const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+                                           int cc, cudaStream_t stream)
     {
-        TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, query.cols);
-
-        if (maskCollection.data)
-            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream);
+        if (masks.data)
+        {
+            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
+                trainIdx, imgIdx, distance,
+                cc, stream);
+        }
         else
-            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
+        {
+            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(),
+                trainIdx, imgIdx, distance,
+                cc, stream);
+        }
     }
 
-    //template void matchCollectionL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    //template void matchCollectionL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    //template void matchCollectionL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    //template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    //template void matchCollectionL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    //template void matchL2_gpu<uchar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    //template void matchL2_gpu<schar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    //template void matchL2_gpu<ushort>(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    //template void matchL2_gpu<short >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    //template void matchL2_gpu<int   >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    template void matchL2_gpu<float >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
 
-    template <typename T> void matchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
-        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
-        int cc, cudaStream_t stream)
+    template <typename T> void matchHamming_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
+                                                const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+                                                int cc, cudaStream_t stream)
     {
-        TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, query.cols);
-
-        if (maskCollection.data)
-            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream);
+        if (masks.data)
+        {
+            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
+                trainIdx, imgIdx, distance,
+                cc, stream);
+        }
         else
-            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
+        {
+            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(),
+                trainIdx, imgIdx, distance,
+                cc, stream);
+        }
     }
 
-    template void matchCollectionHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    //template void matchCollectionHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    template void matchCollectionHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    //template void matchCollectionHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
-    template void matchCollectionHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
+    template void matchHamming_gpu<uchar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    //template void matchHamming_gpu<schar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    template void matchHamming_gpu<ushort>(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    //template void matchHamming_gpu<short >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+    template void matchHamming_gpu<int   >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
 }}}
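The radius matcher in the next file returns a variable number of neighbours per query, so instead of one best slot per row it lets every (query, train) pair under the distance threshold claim a slot with atomicInc, counting overflow in nMatches. A minimal standalone model of that output scheme (hypothetical flat buffer layout, not the patch's DevMem2D types):

    // radius_sketch.cu
    #include <cuda_runtime.h>

    __global__ void collectWithinRadius(const float* dist, int nQuery, int nTrain, float maxDistance,
                                        int* resultIdx, unsigned int* nMatches, int maxCount)
    {
        const int q = blockIdx.y * blockDim.y + threadIdx.y;
        const int t = blockIdx.x * blockDim.x + threadIdx.x;

        if (q < nQuery && t < nTrain && dist[q * nTrain + t] < maxDistance)
        {
            // atomicInc returns the previous value, so each passing pair gets a
            // unique slot; pairs past maxCount are dropped, but nMatches still
            // counts them, letting the host detect a too-small result buffer.
            const unsigned int ind = atomicInc(nMatches + q, (unsigned int)-1);
            if (ind < (unsigned int)maxCount)
                resultIdx[q * maxCount + ind] = t;
        }
    }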
diff --git a/modules/gpu/src/cuda/bf_radius_match.cu b/modules/gpu/src/cuda/bf_radius_match.cu
index 1c1dace75b..58b2c8a984 100644
--- a/modules/gpu/src/cuda/bf_radius_match.cu
+++ b/modules/gpu/src/cuda/bf_radius_match.cu
@@ -49,466 +49,410 @@ using namespace cv::gpu::device;
 namespace cv { namespace gpu { namespace bf_radius_match
 {
-    template <typename T> struct SingleTrain
-    {
-        enum {USE_IMG_IDX = 0};
-
-        explicit SingleTrain(const DevMem2D_<T>& train_) : train(train_)
-        {
-        }
-
-        static __device__ __forceinline__ void store(const int* s_trainIdx, const int* s_imgIdx, const float* s_dist, unsigned int& s_count, int& s_globInd,
-            int* trainIdx, int* imgIdx, float* distance, int maxCount)
-        {
-            const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
-            if (tid < s_count && s_globInd + tid < maxCount)
-            {
-                trainIdx[s_globInd + tid] = s_trainIdx[tid];
-                distance[s_globInd + tid] = s_dist[tid];
-            }
-
-            if (tid == 0)
-            {
-                s_globInd += s_count;
-                s_count = 0;
-            }
-        }
-
-        template <int BLOCK_STACK, typename Dist, typename VecDiff, typename Mask>
-        __device__ __forceinline__ void loop(float maxDistance, Mask& mask, const VecDiff& vecDiff,
-            int* s_trainIdx, int* s_imgIdx, float* s_dist, unsigned int& s_count, int& s_globInd,
-            int* trainIdxRow, int* imgIdxRow, float* distanceRow, int maxCount,
-            typename Dist::result_type* s_diffRow) const
-        {
-            #if __CUDA_ARCH__ >= 120
-
-            for (int i = 0; i < train.rows; i += blockDim.y)
-            {
-                int trainIdx = i + threadIdx.y;
-
-                if (trainIdx < train.rows && mask(blockIdx.x, trainIdx))
-                {
-                    Dist dist;
-
-                    vecDiff.calc(train.ptr(trainIdx), train.cols, dist, s_diffRow, threadIdx.x);
-
-                    const typename Dist::result_type val = dist;
-
-                    if (threadIdx.x == 0 && val < maxDistance)
-                    {
-                        unsigned int ind = atomicInc(&s_count, (unsigned int) -1);
-                        s_trainIdx[ind] = trainIdx;
-                        s_dist[ind] = val;
-                    }
-                }
-
-                __syncthreads();
-
-                if (s_count >= BLOCK_STACK - blockDim.y)
-                    store(s_trainIdx, s_imgIdx, s_dist, s_count, s_globInd, trainIdxRow, imgIdxRow, distanceRow, maxCount);
-
-                __syncthreads();
-            }
-
-            store(s_trainIdx, s_imgIdx, s_dist, s_count, s_globInd, trainIdxRow, imgIdxRow, distanceRow, maxCount);
-
-            #endif
-        }
-
-        __device__ __forceinline__ int descLen() const
-        {
-            return train.cols;
-        }
-
-        const DevMem2D_<T> train;
-    };
-
-    template <typename T> struct TrainCollection
-    {
-        enum {USE_IMG_IDX = 1};
-
-        TrainCollection(const DevMem2D_<T>* trainCollection_, int nImg_, int desclen_) :
-            trainCollection(trainCollection_), nImg(nImg_), desclen(desclen_)
-        {
-        }
-
-        static __device__ __forceinline__ void store(const int* s_trainIdx, const int* s_imgIdx, const float* s_dist, unsigned int& s_count, int& s_globInd,
-            int* trainIdx, int* imgIdx, float* distance, int maxCount)
-        {
-            const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
-            if (tid < s_count && s_globInd + tid < maxCount)
-            {
-                trainIdx[s_globInd + tid] = s_trainIdx[tid];
-                imgIdx[s_globInd + tid] = s_imgIdx[tid];
-                distance[s_globInd + tid] = s_dist[tid];
-            }
-
-            if (tid == 0)
-            {
-                s_globInd += s_count;
-                s_count = 0;
-            }
-        }
-
-        template <int BLOCK_STACK, typename Dist, typename VecDiff, typename Mask>
-        __device__ void loop(float maxDistance, Mask& mask, const VecDiff& vecDiff,
-            int* s_trainIdx, int* s_imgIdx, float* s_dist, unsigned int& s_count, int& s_globInd,
-            int* trainIdxRow, int* imgIdxRow, float* distanceRow, int maxCount,
-            typename Dist::result_type* s_diffRow) const
-        {
-            #if __CUDA_ARCH__ >= 120
-
-            for (int imgIdx = 0; imgIdx < nImg; ++imgIdx)
-            {
-                const DevMem2D_<T> train = trainCollection[imgIdx];
-
-                mask.next();
-
-                for (int i = 0; i < train.rows; i += blockDim.y)
-                {
-                    int trainIdx = i + threadIdx.y;
-
-                    if (trainIdx < train.rows && mask(blockIdx.x, trainIdx))
-                    {
-                        Dist dist;
-
-                        vecDiff.calc(train.ptr(trainIdx), desclen, dist, s_diffRow, threadIdx.x);
-
-                        const typename Dist::result_type val = dist;
-
-                        if (threadIdx.x == 0 && val < maxDistance)
-                        {
-                            unsigned int ind = atomicInc(&s_count, (unsigned int) -1);
-                            s_trainIdx[ind] = trainIdx;
-                            s_imgIdx[ind] = imgIdx;
-                            s_dist[ind] = val;
-                        }
-                    }
-
-                    __syncthreads();
-
-                    if (s_count >= BLOCK_STACK - blockDim.y)
-                        store(s_trainIdx, s_imgIdx, s_dist, s_count, s_globInd, trainIdxRow, imgIdxRow, distanceRow, maxCount);
-
-                    __syncthreads();
-                }
-            }
-
-            store(s_trainIdx, s_imgIdx, s_dist, s_count, s_globInd, trainIdxRow, imgIdxRow, distanceRow, maxCount);
-
-            #endif
-        }
-
-        __device__ __forceinline__ int descLen() const
-        {
-            return desclen;
-        }
-
-        const DevMem2D_<T>* trainCollection;
-        const int nImg;
-        const int desclen;
-    };
-
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_STACK, typename VecDiff, typename Dist, typename T, typename Train, typename Mask>
-    __global__ void radiusMatch(const PtrStep_<T> query, const Train train, float maxDistance, const Mask mask,
-        PtrStepi trainIdx, PtrStepi imgIdx, PtrStepf distance, int* nMatches, int maxCount)
-    {
-        typedef typename Dist::result_type result_type;
-        typedef typename Dist::value_type value_type;
-
-        __shared__ result_type s_mem[BLOCK_DIM_X * BLOCK_DIM_Y];
-
-        __shared__ int s_trainIdx[BLOCK_STACK];
-        __shared__ int s_imgIdx[Train::USE_IMG_IDX ? BLOCK_STACK : 1];
-        __shared__ float s_dist[BLOCK_STACK];
-        __shared__ unsigned int s_count;
-
-        __shared__ int s_globInd;
-
-        if (threadIdx.x == 0 && threadIdx.y == 0)
-        {
-            s_count = 0;
-            s_globInd = 0;
-        }
-        __syncthreads();
-
-        const VecDiff vecDiff(query.ptr(blockIdx.x), train.descLen(), (typename Dist::value_type*)s_mem, threadIdx.y * BLOCK_DIM_X + threadIdx.x, threadIdx.x);
-
-        Mask m = mask;
-
-        train.template loop<BLOCK_STACK, Dist>(maxDistance, m, vecDiff,
-            s_trainIdx, s_imgIdx, s_dist, s_count, s_globInd,
-            trainIdx.ptr(blockIdx.x), imgIdx.ptr(blockIdx.x), distance.ptr(blockIdx.x), maxCount,
-            s_mem + BLOCK_DIM_X * threadIdx.y);
-
-        if (threadIdx.x == 0 && threadIdx.y == 0)
-            nMatches[blockIdx.x] = s_globInd;
-    }
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // Radius Match kernel caller
-
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_STACK, typename Dist, typename T, typename Train, typename Mask>
-    void radiusMatchSimple_caller(const DevMem2D_<T>& query, const Train& train, float maxDistance, const Mask& mask,
-        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int* nMatches,
-        cudaStream_t stream)
-    {
-        StaticAssert<BLOCK_STACK >= BLOCK_DIM_Y>::check();
-        StaticAssert<BLOCK_STACK <= BLOCK_DIM_X * BLOCK_DIM_Y>::check();
-
-        const dim3 grid(query.rows, 1, 1);
-        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
-
-        radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_STACK, VecDiffGlobal<BLOCK_DIM_X, T>, Dist, T>
-            <<<grid, threads, 0, stream>>>(query, train, maxDistance, mask, trainIdx, imgIdx, distance, nMatches, trainIdx.cols);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_STACK, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T, typename Train, typename Mask>
-    void radiusMatchCached_caller(const DevMem2D_<T>& query, const Train& train, float maxDistance, const Mask& mask,
-        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int* nMatches,
-        cudaStream_t stream)
-    {
-        StaticAssert<BLOCK_STACK >= BLOCK_DIM_Y>::check();
-        StaticAssert<BLOCK_STACK <= BLOCK_DIM_X * BLOCK_DIM_Y>::check();
-        StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_LEN>::check();
-        StaticAssert<MAX_LEN % BLOCK_DIM_X == 0>::check();
-
-        const dim3 grid(query.rows, 1, 1);
-        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
-
-        radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_STACK, VecDiffCachedRegister<BLOCK_DIM_X, MAX_LEN, LEN_EQ_MAX_LEN, typename Dist::value_type>, Dist, T>
-            <<<grid, threads, 0, stream>>>(query, train, maxDistance, mask, trainIdx, imgIdx, distance, nMatches, trainIdx.cols);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
+    ///////////////////////////////////////////////////////////////////////////////
+    // Match Unrolled
+
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
+    __global__ void matchUnrolled(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
+                                  PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
+    {
+        #if __CUDA_ARCH__ >= 110
+
+        extern __shared__ int smem[];
+
+        const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
+        const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
+
+        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+        Dist dist;
+
+        #pragma unroll
+        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
+        {
+            const int loadX = threadIdx.x + i * BLOCK_SIZE;
+
+            if (loadX < query.cols)
+            {
+                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
+                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
+            }
+            else
+            {
+                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
+                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
+            }
+
+            __syncthreads();
+
+            #pragma unroll
+            for (int j = 0; j < BLOCK_SIZE; ++j)
+                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
+
+            __syncthreads();
+        }
+
+        float distVal = (typename Dist::result_type)dist;
+
+        if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
+        {
+            unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
+            if (ind < maxCount)
+            {
+                bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
+                if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
+                bestDistance.ptr(queryIdx)[ind] = distVal;
+            }
+        }
+
+        #endif
+    }
+
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+    void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
+                       const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, cudaStream_t stream)
+    {
+        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+        const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
+
+        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+        matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
+            trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T>
+    void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2D* masks,
+                       const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+                       cudaStream_t stream)
+    {
+        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+
+        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+        for (int i = 0; i < n; ++i)
+        {
+            const DevMem2D_<T> train = trains[i];
+
+            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
+
+            if (masks != 0 && masks[i].data)
+            {
+                matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
+                    trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
+            }
+            else
+            {
+                matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
+                    trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
+            }
+            cudaSafeCall( cudaGetLastError() );
+        }
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
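The kernels in this file wrap their bodies in #if __CUDA_ARCH__ >= 110 because atomicInc on global memory only exists from compute capability 1.1 upward; built for an older target, the body compiles away and the kernel becomes a no-op, which is presumably also why the callers receive the cc argument. A stripped-down illustration of the guard (hedged sketch, hypothetical kernel):

    // guard_sketch.cu
    __global__ void guardedCount(unsigned int* counter)
    {
    #if __CUDA_ARCH__ >= 110
        atomicInc(counter, (unsigned int)-1);  // needs compute capability >= 1.1
    #endif
        // compiled for sm_10, the body vanishes and the launch does nothing
    }

The same guard recurs in the plain-tiled match kernel below.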
/////////////////////////////////////////////////////////////////////////////// - // Radius Match Dispatcher - - template - void radiusMatchDispatcher(const DevMem2D_& query, const Train& train, float maxDistance, const Mask& mask, - const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, + // Match + + template + __global__ void match(const DevMem2D_ query, int imgIdx, const DevMem2D_ train, float maxDistance, const Mask mask, + PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) + { + #if __CUDA_ARCH__ >= 110 + + extern __shared__ int smem[]; + + const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; + const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + + Dist dist; + + for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; + + if (loadX < query.cols) + { + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX]; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX]; + } + else + { + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + + __syncthreads(); + } + + float distVal = (typename Dist::result_type)dist; + + if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance) + { + unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1); + if (ind < maxCount) + { + bestTrainIdx.ptr(queryIdx)[ind] = trainIdx; + if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx; + bestDistance.ptr(queryIdx)[ind] = distVal; + } + } + + #endif + } + + template + void match(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, cudaStream_t stream) { - if (query.cols < 64) + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + match<<>>(query, 0, train, maxDistance, mask, + trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + template + void match(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2D* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + for (int i = 0; i < n; ++i) { - radiusMatchCached_caller<16, 16, 64, 64, false, Dist>( - query, train, maxDistance, mask, - static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), (int*)nMatches.data, - stream); + const DevMem2D_ train = trains[i]; + + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + + if (masks != 0 && 
masks[i].data) + { + match<<>>(query, i, train, maxDistance, SingleMask(masks[i]), + trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); + } + else + { + match<<>>(query, i, train, maxDistance, WithOutMask(), + trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); + } + cudaSafeCall( cudaGetLastError() ); } - else if (query.cols == 64) + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + /////////////////////////////////////////////////////////////////////////////// + // Match dispatcher + + template + void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) + { + if (query.cols <= 64) { - radiusMatchCached_caller<16, 16, 64, 64, true, Dist>( - query, train, maxDistance, mask, - static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), (int*)nMatches.data, - stream); + matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); } - else if (query.cols < 128) + else if (query.cols <= 128) { - radiusMatchCached_caller<16, 16, 64, 128, false, Dist>( - query, train, maxDistance, mask, - static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), (int*)nMatches.data, - stream); + matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); } - else if (query.cols == 128) + else if (query.cols <= 256) { - radiusMatchCached_caller<16, 16, 64, 128, true, Dist>( - query, train, maxDistance, mask, - static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), (int*)nMatches.data, - stream); + matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); } - else if (query.cols < 256) - { - radiusMatchCached_caller<16, 16, 64, 256, false, Dist>( - query, train, maxDistance, mask, - static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), (int*)nMatches.data, - stream); + else if (query.cols <= 512) + { + matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); } - else if (query.cols == 256) - { - radiusMatchCached_caller<16, 16, 64, 256, true, Dist>( - query, train, maxDistance, mask, - static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), (int*)nMatches.data, - stream); + else if (query.cols <= 1024) + { + matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); } else { - radiusMatchSimple_caller<16, 16, 64, Dist>( - query, train, maxDistance, mask, - static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), (int*)nMatches.data, - stream); + match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); } - } + } + + template + void matchDispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2D* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) + { + if (query.cols <= 64) + { + matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); + } + else if (query.cols <= 128) + { + matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); + } + else if (query.cols <= 256) + { + matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, 
+
+    template <typename Dist, typename T>
+    void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2D* masks,
+        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        int cc, cudaStream_t stream)
+    {
+        if (query.cols <= 64)
+        {
+            matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
+        }
+        else if (query.cols <= 128)
+        {
+            matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
+        }
+        else if (query.cols <= 256)
+        {
+            matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
+        }
+        else if (query.cols <= 512)
+        {
+            matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
+        }
+        else if (query.cols <= 1024)
+        {
+            matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
+        }
+        else
+        {
+            match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
+        }
+    }

     ///////////////////////////////////////////////////////////////////////////////
     // Radius Match caller

-    template <typename T> void radiusMatchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train_, float maxDistance, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches,
-        cudaStream_t stream)
+    template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
+        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        int cc, cudaStream_t stream)
     {
-        SingleTrain<T> train(static_cast< DevMem2D_<T> >(train_));
-
         if (mask.data)
         {
-            radiusMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, maxDistance, SingleMask(mask),
-                trainIdx, DevMem2D(), distance, nMatches,
-                stream);
+            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
+                trainIdx, distance, nMatches,
+                cc, stream);
         }
         else
         {
-            radiusMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, maxDistance, WithOutMask(),
-                trainIdx, DevMem2D(), distance, nMatches,
-                stream);
+            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
+                trainIdx, distance, nMatches,
+                cc, stream);
         }
     }

-    template void radiusMatchSingleL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream);
-    //template void radiusMatchSingleL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream);
-    template void radiusMatchSingleL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream);
-    template void radiusMatchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream);
-    template void radiusMatchSingleL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream);
-    template void radiusMatchSingleL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream);
+    template void matchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+    //template void matchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const 
DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void radiusMatchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train_, float maxDistance, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, - cudaStream_t stream) + template void matchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) { - SingleTrain train(static_cast< DevMem2D_ >(train_)); - if (mask.data) { - radiusMatchDispatcher(static_cast< DevMem2D_ >(query), train, maxDistance, SingleMask(mask), - trainIdx, DevMem2D(), distance, nMatches, - stream); + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, distance, nMatches, + cc, stream); } else { - radiusMatchDispatcher(static_cast< DevMem2D_ >(query), train, maxDistance, WithOutMask(), - trainIdx, DevMem2D(), distance, nMatches, - stream); + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), + trainIdx, distance, nMatches, + cc, stream); } } - //template void radiusMatchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - //template void radiusMatchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - //template void radiusMatchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - //template void radiusMatchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - //template void radiusMatchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - template void radiusMatchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const 
DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void radiusMatchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train_, float maxDistance, const DevMem2D& mask, - const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, - cudaStream_t stream) + template void matchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) { - SingleTrain train(static_cast< DevMem2D_ >(train_)); - if (mask.data) { - radiusMatchDispatcher(static_cast< DevMem2D_ >(query), train, maxDistance, SingleMask(mask), - trainIdx, DevMem2D(), distance, nMatches, - stream); + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, distance, nMatches, + cc, stream); } else { - radiusMatchDispatcher(static_cast< DevMem2D_ >(query), train, maxDistance, WithOutMask(), - trainIdx, DevMem2D(), distance, nMatches, - stream); + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), + trainIdx, distance, nMatches, + cc, stream); } } - template void radiusMatchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - //template void radiusMatchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - template void radiusMatchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - //template void radiusMatchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const 
DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - template void radiusMatchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void radiusMatchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, - const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, - cudaStream_t stream) + template void matchL1_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) { - TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, query.cols); - - if (maskCollection.data) - { - radiusMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, maxDistance, MaskCollection(maskCollection.data), - trainIdx, imgIdx, distance, nMatches, - stream); - } - else - { - radiusMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, maxDistance, WithOutMask(), - trainIdx, imgIdx, distance, nMatches, - stream); - } + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, + trainIdx, imgIdx, distance, nMatches, + cc, stream); } - template void radiusMatchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - //template void radiusMatchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - template void radiusMatchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& 
nMatches, cudaStream_t stream); - template void radiusMatchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - template void radiusMatchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - template void radiusMatchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + template void matchL1_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL1_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void radiusMatchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, - const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, - cudaStream_t stream) + template void matchL2_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) { - TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, query.cols); - - if (maskCollection.data) - { - radiusMatchDispatcher(static_cast< DevMem2D_ >(query), train, maxDistance, MaskCollection(maskCollection.data), - trainIdx, imgIdx, distance, nMatches, - stream); - } - else - { - radiusMatchDispatcher(static_cast< DevMem2D_ >(query), train, maxDistance, WithOutMask(), - trainIdx, imgIdx, distance, nMatches, - stream); - } + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, + trainIdx, imgIdx, distance, 
nMatches, + cc, stream); } - //template void radiusMatchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - //template void radiusMatchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - //template void radiusMatchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - //template void radiusMatchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - //template void radiusMatchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - template void radiusMatchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL2_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void radiusMatchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, - const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, - cudaStream_t stream) + 
template void matchHamming_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) { - TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, query.cols); - - if (maskCollection.data) - { - radiusMatchDispatcher(static_cast< DevMem2D_ >(query), train, maxDistance, MaskCollection(maskCollection.data), - trainIdx, imgIdx, distance, nMatches, - stream); - } - else - { - radiusMatchDispatcher(static_cast< DevMem2D_ >(query), train, maxDistance, WithOutMask(), - trainIdx, imgIdx, distance, nMatches, - stream); - } + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, + trainIdx, imgIdx, distance, nMatches, + cc, stream); } - template void radiusMatchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - //template void radiusMatchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - template void radiusMatchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - //template void radiusMatchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); - template void radiusMatchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& 
distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); }}} diff --git a/modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp b/modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp index 229fe04e50..e7666627f7 100644 --- a/modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp +++ b/modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp @@ -47,6 +47,9 @@ namespace cv { namespace gpu { namespace device { namespace detail { + /////////////////////////////////////////////////////////////////////////////// + // Reduction + template struct WarpReductor { template static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) @@ -209,6 +212,8 @@ namespace cv { namespace gpu { namespace device } }; + /////////////////////////////////////////////////////////////////////////////// + // PredValWarpReductor template struct PredValWarpReductor; template <> struct PredValWarpReductor<64> @@ -501,6 +506,335 @@ namespace cv { namespace gpu { namespace device } } }; + + /////////////////////////////////////////////////////////////////////////////// + // PredVal2WarpReductor + + template struct PredVal2WarpReductor; + template <> struct PredVal2WarpReductor<64> + { + template + static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) + { + if (tid < 32) + { + myData = sdata[tid]; + myVal1 = sval1[tid]; + myVal2 = sval2[tid]; + + T reg = sdata[tid + 32]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval1[tid] = myVal1 = sval1[tid + 32]; + sval2[tid] = myVal2 = sval2[tid + 32]; + } + + reg = sdata[tid + 16]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval1[tid] = myVal1 = sval1[tid + 16]; + sval2[tid] = myVal2 = sval2[tid + 16]; + } + + reg = sdata[tid + 8]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval1[tid] = myVal1 = sval1[tid + 8]; + sval2[tid] = myVal2 = sval2[tid + 8]; + } + + reg = sdata[tid + 4]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval1[tid] = myVal1 = sval1[tid + 4]; + sval2[tid] = myVal2 = sval2[tid + 4]; + } + + reg = sdata[tid + 2]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval1[tid] = myVal1 = sval1[tid + 2]; + sval2[tid] = myVal2 = sval2[tid + 2]; + } + + reg = sdata[tid + 1]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval1[tid] = myVal1 = sval1[tid + 1]; + sval2[tid] = myVal2 = sval2[tid + 1]; + } + } + } + }; + template <> struct PredVal2WarpReductor<32> + { + template + static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) + { + if (tid < 16) + { + myData = sdata[tid]; + myVal1 = sval1[tid]; + myVal2 = sval2[tid]; + + T reg = sdata[tid + 16]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval1[tid] = myVal1 = sval1[tid + 16]; + sval2[tid] = myVal2 = sval2[tid + 16]; + } + + reg = sdata[tid + 8]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval1[tid] = myVal1 = sval1[tid + 8]; + sval2[tid] = myVal2 = sval2[tid + 8]; + } + + reg = sdata[tid + 4]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval1[tid] = myVal1 = sval1[tid + 4]; + sval2[tid] = myVal2 = sval2[tid + 4]; + } + + reg = sdata[tid + 2]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval1[tid] = myVal1 = sval1[tid + 2]; + sval2[tid] = myVal2 = sval2[tid + 2]; + } + + reg = sdata[tid + 1]; + if (pred(reg, myData)) + { + sdata[tid] 
= myData = reg;
+                    sval1[tid] = myVal1 = sval1[tid + 1];
+                    sval2[tid] = myVal2 = sval2[tid + 1];
+                }
+            }
+        }
+    };
+
+    template <> struct PredVal2WarpReductor<16>
+    {
+        template <typename T, typename V1, typename V2, typename Pred>
+        static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
+        {
+            if (tid < 8)
+            {
+                myData = sdata[tid];
+                myVal1 = sval1[tid];
+                myVal2 = sval2[tid];
+
+                T reg = sdata[tid + 8];
+                if (pred(reg, myData))
+                {
+                    sdata[tid] = myData = reg;
+                    sval1[tid] = myVal1 = sval1[tid + 8];
+                    sval2[tid] = myVal2 = sval2[tid + 8];
+                }
+
+                reg = sdata[tid + 4];
+                if (pred(reg, myData))
+                {
+                    sdata[tid] = myData = reg;
+                    sval1[tid] = myVal1 = sval1[tid + 4];
+                    sval2[tid] = myVal2 = sval2[tid + 4];
+                }
+
+                reg = sdata[tid + 2];
+                if (pred(reg, myData))
+                {
+                    sdata[tid] = myData = reg;
+                    sval1[tid] = myVal1 = sval1[tid + 2];
+                    sval2[tid] = myVal2 = sval2[tid + 2];
+                }
+
+                reg = sdata[tid + 1];
+                if (pred(reg, myData))
+                {
+                    sdata[tid] = myData = reg;
+                    sval1[tid] = myVal1 = sval1[tid + 1];
+                    sval2[tid] = myVal2 = sval2[tid + 1];
+                }
+            }
+        }
+    };
+    template <> struct PredVal2WarpReductor<8>
+    {
+        template <typename T, typename V1, typename V2, typename Pred>
+        static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
+        {
+            if (tid < 4)
+            {
+                myData = sdata[tid];
+                myVal1 = sval1[tid];
+                myVal2 = sval2[tid];
+
+                T reg = sdata[tid + 4];
+                if (pred(reg, myData))
+                {
+                    sdata[tid] = myData = reg;
+                    sval1[tid] = myVal1 = sval1[tid + 4];
+                    sval2[tid] = myVal2 = sval2[tid + 4];
+                }
+
+                reg = sdata[tid + 2];
+                if (pred(reg, myData))
+                {
+                    sdata[tid] = myData = reg;
+                    sval1[tid] = myVal1 = sval1[tid + 2];
+                    sval2[tid] = myVal2 = sval2[tid + 2];
+                }
+
+                reg = sdata[tid + 1];
+                if (pred(reg, myData))
+                {
+                    sdata[tid] = myData = reg;
+                    sval1[tid] = myVal1 = sval1[tid + 1];
+                    sval2[tid] = myVal2 = sval2[tid + 1];
+                }
+            }
+        }
+    };
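/* Note: the PredVal2WarpReductor specializations above are warp-synchronous
   predicated reduction steps (an argmin when pred is "less") that drag two
   payload values along with the winning element -- for the two-image kNN path,
   presumably the train index and the image index -- for shared buffers of 64
   down to 8 elements. A condensed sketch of the 64-element case follows; the
   helper name is made up, and the real code unrolls the strides by hand and
   relies on pre-Volta implicit warp synchrony, hence no __syncthreads()
   between steps: */

template <typename Pred>
__device__ void warpReducePredVal2Sketch(volatile float* sdist, volatile int* sval1, volatile int* sval2,
                                         int tid, const Pred& pred)
{
    if (tid < 32) // one warp folds a 64-element buffer
    {
        #pragma unroll
        for (int offset = 32; offset >= 1; offset /= 2)
        {
            float reg = sdist[tid + offset];
            if (pred(reg, sdist[tid])) // e.g. "less": keep the smaller distance
            {
                sdist[tid] = reg;
                sval1[tid] = sval1[tid + offset]; // payload 1 follows the winner
                sval2[tid] = sval2[tid + offset]; // payload 2 follows the winner
            }
        }
    }
}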
+
+    template <bool warp> struct PredVal2ReductionDispatcher;
+    template <> struct PredVal2ReductionDispatcher<true>
+    {
+        template <int n, typename T, typename V1, typename V2, typename Pred>
+        static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
+        {
+            PredVal2WarpReductor<n>::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
+        }
+    };
+    template <> struct PredVal2ReductionDispatcher<false>
+    {
+        template <int n, typename T, typename V1, typename V2, typename Pred>
+        static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
+        {
+            myData = sdata[tid];
+            myVal1 = sval1[tid];
+            myVal2 = sval2[tid];
+
+            if (n >= 512 && tid < 256)
+            {
+                T reg = sdata[tid + 256];
+
+                if (pred(reg, myData))
+                {
+                    sdata[tid] = myData = reg;
+                    sval1[tid] = myVal1 = sval1[tid + 256];
+                    sval2[tid] = myVal2 = sval2[tid + 256];
+                }
+                __syncthreads();
+            }
+            if (n >= 256 && tid < 128)
+            {
+                T reg = sdata[tid + 128];
+
+                if (pred(reg, myData))
+                {
+                    sdata[tid] = myData = reg;
+                    sval1[tid] = myVal1 = sval1[tid + 128];
+                    sval2[tid] = myVal2 = sval2[tid + 128];
+                }
+                __syncthreads();
+            }
+            if (n >= 128 && tid < 64)
+            {
+                T reg = sdata[tid + 64];
+
+                if (pred(reg, myData))
+                {
+                    sdata[tid] = myData = reg;
+                    sval1[tid] = myVal1 = sval1[tid + 64];
+                    sval2[tid] = myVal2 = sval2[tid + 64];
+                }
+                __syncthreads();
+            }
+
+            if (tid < 32)
+            {
+                if (n >= 64)
+                {
+                    T reg = sdata[tid + 32];
+
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 32];
+                        sval2[tid] = myVal2 = sval2[tid + 32];
+                    }
+                }
+                if (n >= 32)
+                {
+                    T reg = sdata[tid + 16];
+
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 16];
+                        sval2[tid] = myVal2 = sval2[tid + 16];
+                    }
+                }
+                if (n >= 16)
+                {
+                    T reg = sdata[tid + 8];
+
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 8];
+                        sval2[tid] = myVal2 = sval2[tid + 8];
+                    }
+                }
+                if (n >= 8)
+                {
+                    T reg = sdata[tid + 4];
+
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 4];
+                        sval2[tid] = myVal2 = sval2[tid + 4];
+                    }
+                }
+                if (n >= 4)
+                {
+                    T reg = sdata[tid + 2];
+
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 2];
+                        sval2[tid] = myVal2 = sval2[tid + 2];
+                    }
+                }
+                if (n >= 2)
+                {
+                    T reg = sdata[tid + 1];
+
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 1];
+                        sval2[tid] = myVal2 = sval2[tid + 1];
+                    }
+                }
+            }
+        }
+    };
     }
 }}}
diff --git a/modules/gpu/src/opencv2/gpu/device/utility.hpp b/modules/gpu/src/opencv2/gpu/device/utility.hpp
index 990158e637..678470d299 100644
--- a/modules/gpu/src/opencv2/gpu/device/utility.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/utility.hpp
@@ -121,7 +121,6 @@ namespace cv { namespace gpu { namespace device
     ///////////////////////////////////////////////////////////////////////////////
     // Reduction

-    // reduction
     template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
     {
         StaticAssert<n >= 8 && n <= 512>::check();
@@ -134,6 +133,13 @@ namespace cv { namespace gpu { namespace device
         StaticAssert<n >= 8 && n <= 512>::check();
         detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
     }
+
+    template <int n, typename T, typename V1, typename V2, typename Pred>
+    __device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
+    {
+        StaticAssert<n >= 8 && n <= 512>::check();
+        detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
+    }

     ///////////////////////////////////////////////////////////////////////////////
     // Solve linear system
diff --git a/modules/stitching/src/matchers.cpp b/modules/stitching/src/matchers.cpp
index cbe8b4dae8..9cc841dbc6 100644
--- a/modules/stitching/src/matchers.cpp
+++ b/modules/stitching/src/matchers.cpp
@@ -198,7 +198,7 @@ void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat

     // Find 1->2 matches
     pair_matches.clear();
-    matcher.knnMatch(descriptors1_, descriptors2_, train_idx_, distance_, all_dist_, 2);
+    matcher.knnMatchSingle(descriptors1_, descriptors2_, train_idx_, distance_, all_dist_, 2);
     matcher.knnMatchDownload(train_idx_, distance_, pair_matches);
     for (size_t i = 0; i < pair_matches.size(); ++i)
     {
@@ -215,7 +215,7 @@ void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat

     // Find 2->1 matches
     pair_matches.clear();
-    matcher.knnMatch(descriptors2_, descriptors1_, train_idx_, distance_, all_dist_, 2);
+    matcher.knnMatchSingle(descriptors2_, descriptors1_, train_idx_, distance_, all_dist_, 2);
     matcher.knnMatchDownload(train_idx_, distance_, pair_matches);
     for (size_t i = 0; i < pair_matches.size(); ++i)
     {
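/* The stitching module's GpuMatcher above is the in-tree example of migrating
   to the renamed API: the old knnMatch overload that wrote into GpuMat result
   buffers is now knnMatchSingle, and the DMatch conversion remains a separate
   download step. A condensed usage sketch (assumes the device already holds
   descriptor matrices; names other than the matcher API are illustrative): */

#include <opencv2/gpu/gpu.hpp>
#include <vector>

void exampleKnn(const cv::gpu::GpuMat& d_query, const cv::gpu::GpuMat& d_train)
{
    cv::gpu::BruteForceMatcher_GPU_base matcher; // default distance: L2Dist
    cv::gpu::GpuMat trainIdx, distance, allDist;

    // was: matcher.knnMatch(d_query, d_train, trainIdx, distance, allDist, 2);
    matcher.knnMatchSingle(d_query, d_train, trainIdx, distance, allDist, 2);

    std::vector< std::vector<cv::DMatch> > matches;
    matcher.knnMatchDownload(trainIdx, distance, matches); // device -> host + DMatch conversion
}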
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index 512fc06ecb..132b76af4d 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -413,38 +413,55 @@ TEST(BruteForceMatcher)

     // Output
     vector< vector<DMatch> > matches(2);
-    vector< vector<DMatch> > d_matches(2);
+    gpu::GpuMat d_trainIdx, d_distance, d_allDist, d_nMatches;

     SUBTEST << "match";

+    matcher.match(query, train, matches[0]);
     CPU_ON;
     matcher.match(query, train, matches[0]);
     CPU_OFF;

+    d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
     GPU_ON;
-    d_matcher.match(d_query, d_train, d_matches[0]);
+    d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
     GPU_OFF;

-    SUBTEST << "knnMatch";
-    int knn = 2;
+    SUBTEST << "knnMatch, 2";

+    matcher.knnMatch(query, train, matches, 2);
     CPU_ON;
-    matcher.knnMatch(query, train, matches, knn);
+    matcher.knnMatch(query, train, matches, 2);
     CPU_OFF;

+    d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
     GPU_ON;
-    d_matcher.knnMatch(d_query, d_train, d_matches, knn);
+    d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
+    GPU_OFF;
+
+    SUBTEST << "knnMatch, 3";
+
+    matcher.knnMatch(query, train, matches, 3);
+    CPU_ON;
+    matcher.knnMatch(query, train, matches, 3);
+    CPU_OFF;
+
+    d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 3);
+    GPU_ON;
+    d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 3);
     GPU_OFF;

     SUBTEST << "radiusMatch";

     float max_distance = 2.0f;

+    matcher.radiusMatch(query, train, matches, max_distance);
     CPU_ON;
     matcher.radiusMatch(query, train, matches, max_distance);
     CPU_OFF;

+    d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
     GPU_ON;
-    d_matcher.radiusMatch(d_query, d_train, d_matches, max_distance);
+    d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
     GPU_OFF;
 }
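/* Two conventions in the updated benchmark are worth noting. Each timed region
   is now preceded by one identical untimed call, presumably so that one-time
   costs (buffer allocation, CUDA module loading) stay out of the measurement.
   And the GPU results are left on the device in GpuMats (d_trainIdx,
   d_distance, ...) rather than converted to vector<DMatch>, so GPU_ON..GPU_OFF
   brackets only the matching kernels; a caller that needs host-side matches
   would convert afterwards, outside the timers, e.g. (sketch):

       std::vector< std::vector<cv::DMatch> > gpu_matches;
       d_matcher.knnMatchDownload(d_trainIdx, d_distance, gpu_matches);
*/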