/*M/////////////////////////////////////////////////////////////////////////////////////// // // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. // // By downloading, copying, installing or using the software you agree to this license. // If you do not agree to this license, do not download, install, // copy or use the software. // // // License Agreement // For Open Source Computer Vision Library // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: // // * Redistribution's of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. // // This software is provided by the copyright holders and contributors "as is" and // any express or bpied warranties, including, but not limited to, the bpied // warranties of merchantability and fitness for a particular purpose are disclaimed. // In no event shall the Intel Corporation or contributors be liable for any direct, // indirect, incidental, special, exemplary, or consequential damages // (including, but not limited to, procurement of substitute goods or services; // loss of use, data, or profits; or business interruption) however caused // and on any theory of liability, whether in contract, strict liability, // or tort (including negligence or otherwise) arising in any way out of // the use of this software, even if advised of the possibility of such damage. // //M*/ #if !defined CUDA_DISABLER #include "internal_shared.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" namespace cv { namespace gpu { namespace device { namespace bf_match { /////////////////////////////////////////////////////////////////////////////// // Reduction template __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx) { s_distance += threadIdx.y * BLOCK_SIZE; s_trainIdx += threadIdx.y * BLOCK_SIZE; s_distance[threadIdx.x] = bestDistance; s_trainIdx[threadIdx.x] = bestTrainIdx; __syncthreads(); reducePredVal(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less()); } template __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx) { s_distance += threadIdx.y * BLOCK_SIZE; s_trainIdx += threadIdx.y * BLOCK_SIZE; s_imgIdx += threadIdx.y * BLOCK_SIZE; s_distance[threadIdx.x] = bestDistance; s_trainIdx[threadIdx.x] = bestTrainIdx; s_imgIdx [threadIdx.x] = bestImgIdx; __syncthreads(); reducePredVal2(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less()); } /////////////////////////////////////////////////////////////////////////////// // Match Unrolled Cached template __device__ void loadQueryToSmem(int queryIdx, const PtrStepSz& query, U* s_query) { #pragma unroll for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) { const int loadX = threadIdx.x + i * BLOCK_SIZE; s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0; } } template __device__ void loopUnrolledCached(int queryIdx, const PtrStepSz& query,volatile int imgIdx, const PtrStepSz& train, const Mask& mask, typename Dist::value_type* s_query, typename Dist::value_type* s_train, float& bestDistance, int& bestTrainIdx, int& bestImgIdx) { for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) { Dist dist; #pragma unroll for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) { const int loadX = threadIdx.x + i * BLOCK_SIZE; s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; if (loadX < train.cols) { T val; ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; } __syncthreads(); #pragma unroll for (int j = 0; j < BLOCK_SIZE; ++j) dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); __syncthreads(); } typename Dist::result_type distVal = dist; const int trainIdx = t * BLOCK_SIZE + threadIdx.x; if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx)) { bestImgIdx = imgIdx; bestDistance = distVal; bestTrainIdx = trainIdx; } } } template __global__ void matchUnrolledCached(const PtrStepSz query, const PtrStepSz train, const Mask mask, int* bestTrainIdx, float* bestDistance) { extern __shared__ int smem[]; const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); loadQueryToSmem(queryIdx, query, s_query); float myBestDistance = numeric_limits::max(); int myBestTrainIdx = -1; loopUnrolledCached(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); __syncthreads(); float* s_distance = (float*)(smem); int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); findBestMatch(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx); if (queryIdx < query.rows && threadIdx.x == 0) { bestTrainIdx[queryIdx] = myBestTrainIdx; bestDistance[queryIdx] = myBestDistance; } } template void matchUnrolledCached(const PtrStepSz& query, const PtrStepSz& train, const Mask& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); const dim3 grid(divUp(query.rows, BLOCK_SIZE)); const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); matchUnrolledCached<<>>(query, train, mask, trainIdx.data, distance.data); cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } template __global__ void matchUnrolledCached(const PtrStepSz query, const PtrStepSz* trains, int n, const Mask mask, int* bestTrainIdx, int* bestImgIdx, float* bestDistance) { extern __shared__ int smem[]; const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); loadQueryToSmem(queryIdx, query, s_query); float myBestDistance = numeric_limits::max(); int myBestTrainIdx = -1; int myBestImgIdx = -1; Mask m = mask; for (int imgIdx = 0; imgIdx < n; ++imgIdx) { const PtrStepSz train = trains[imgIdx]; m.next(); loopUnrolledCached(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx); } __syncthreads(); float* s_distance = (float*)(smem); int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); findBestMatch(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx); if (queryIdx < query.rows && threadIdx.x == 0) { bestTrainIdx[queryIdx] = myBestTrainIdx; bestImgIdx[queryIdx] = myBestImgIdx; bestDistance[queryIdx] = myBestDistance; } } template void matchUnrolledCached(const PtrStepSz& query, const PtrStepSz* trains, int n, const Mask& mask, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); const dim3 grid(divUp(query.rows, BLOCK_SIZE)); const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); matchUnrolledCached<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } /////////////////////////////////////////////////////////////////////////////// // Match Unrolled template __device__ void loopUnrolled(int queryIdx, const PtrStepSz& query,volatile int imgIdx, const PtrStepSz& train, const Mask& mask, typename Dist::value_type* s_query, typename Dist::value_type* s_train, float& bestDistance, int& bestTrainIdx, int& bestImgIdx) { for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) { Dist dist; #pragma unroll for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) { const int loadX = threadIdx.x + i * BLOCK_SIZE; s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; if (loadX < query.cols) { T val; ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; } __syncthreads(); #pragma unroll for (int j = 0; j < BLOCK_SIZE; ++j) dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); __syncthreads(); } typename Dist::result_type distVal = dist; const int trainIdx = t * BLOCK_SIZE + threadIdx.x; if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx)) { bestImgIdx = imgIdx; bestDistance = distVal; bestTrainIdx = trainIdx; } } } template __global__ void matchUnrolled(const PtrStepSz query, const PtrStepSz train, const Mask mask, int* bestTrainIdx, float* bestDistance) { extern __shared__ int smem[]; const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; float myBestDistance = numeric_limits::max(); int myBestTrainIdx = -1; typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); loopUnrolled(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); __syncthreads(); float* s_distance = (float*)(smem); int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); findBestMatch(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx); if (queryIdx < query.rows && threadIdx.x == 0) { bestTrainIdx[queryIdx] = myBestTrainIdx; bestDistance[queryIdx] = myBestDistance; } } template void matchUnrolled(const PtrStepSz& query, const PtrStepSz& train, const Mask& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); const dim3 grid(divUp(query.rows, BLOCK_SIZE)); const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); matchUnrolled<<>>(query, train, mask, trainIdx.data, distance.data); cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } template __global__ void matchUnrolled(const PtrStepSz query, const PtrStepSz* trains, int n, const Mask mask, int* bestTrainIdx, int* bestImgIdx, float* bestDistance) { extern __shared__ int smem[]; const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; float myBestDistance = numeric_limits::max(); int myBestTrainIdx = -1; int myBestImgIdx = -1; typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); Mask m = mask; for (int imgIdx = 0; imgIdx < n; ++imgIdx) { const PtrStepSz train = trains[imgIdx]; m.next(); loopUnrolled(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx); } __syncthreads(); float* s_distance = (float*)(smem); int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); findBestMatch(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx); if (queryIdx < query.rows && threadIdx.x == 0) { bestTrainIdx[queryIdx] = myBestTrainIdx; bestImgIdx[queryIdx] = myBestImgIdx; bestDistance[queryIdx] = myBestDistance; } } template void matchUnrolled(const PtrStepSz& query, const PtrStepSz* trains, int n, const Mask& mask, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); const dim3 grid(divUp(query.rows, BLOCK_SIZE)); const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); matchUnrolled<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } /////////////////////////////////////////////////////////////////////////////// // Match template __device__ void loop(int queryIdx, const PtrStepSz& query, volatile int imgIdx, const PtrStepSz& train, const Mask& mask, typename Dist::value_type* s_query, typename Dist::value_type* s_train, float& bestDistance, int& bestTrainIdx, int& bestImgIdx) { for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) { Dist dist; for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) { const int loadX = threadIdx.x + i * BLOCK_SIZE; s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; if (loadX < query.cols) { T val; ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; } __syncthreads(); #pragma unroll for (int j = 0; j < BLOCK_SIZE; ++j) dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); __syncthreads(); } typename Dist::result_type distVal = dist; const int trainIdx = t * BLOCK_SIZE + threadIdx.x; if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx)) { bestImgIdx = imgIdx; bestDistance = distVal; bestTrainIdx = trainIdx; } } } template __global__ void match(const PtrStepSz query, const PtrStepSz train, const Mask mask, int* bestTrainIdx, float* bestDistance) { extern __shared__ int smem[]; const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; float myBestDistance = numeric_limits::max(); int myBestTrainIdx = -1; typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); loop(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); __syncthreads(); float* s_distance = (float*)(smem); int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); findBestMatch(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx); if (queryIdx < query.rows && threadIdx.x == 0) { bestTrainIdx[queryIdx] = myBestTrainIdx; bestDistance[queryIdx] = myBestDistance; } } template void match(const PtrStepSz& query, const PtrStepSz& train, const Mask& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); const dim3 grid(divUp(query.rows, BLOCK_SIZE)); const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); match<<>>(query, train, mask, trainIdx.data, distance.data); cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } template __global__ void match(const PtrStepSz query, const PtrStepSz* trains, int n, const Mask mask, int* bestTrainIdx, int* bestImgIdx, float* bestDistance) { extern __shared__ int smem[]; const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; float myBestDistance = numeric_limits::max(); int myBestTrainIdx = -1; int myBestImgIdx = -1; typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); Mask m = mask; for (int imgIdx = 0; imgIdx < n; ++imgIdx) { const PtrStepSz train = trains[imgIdx]; m.next(); loop(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx); } __syncthreads(); float* s_distance = (float*)(smem); int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); findBestMatch(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx); if (queryIdx < query.rows && threadIdx.x == 0) { bestTrainIdx[queryIdx] = myBestTrainIdx; bestImgIdx[queryIdx] = myBestImgIdx; bestDistance[queryIdx] = myBestDistance; } } template void match(const PtrStepSz& query, const PtrStepSz* trains, int n, const Mask& mask, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream) { const dim3 block(BLOCK_SIZE, BLOCK_SIZE); const dim3 grid(divUp(query.rows, BLOCK_SIZE)); const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); match<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } /////////////////////////////////////////////////////////////////////////////// // Match dispatcher template void matchDispatcher(const PtrStepSz& query, const PtrStepSz& train, const Mask& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream) { (void)cc; if (query.cols <= 64) { matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream); } else if (query.cols <= 128) { matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream); } /*else if (query.cols <= 256) { matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream); } else if (query.cols <= 512) { matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream); } else if (query.cols <= 1024) { matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream); }*/ else { match<16, Dist>(query, train, mask, trainIdx, distance, stream); } } template void matchDispatcher(const PtrStepSz& query, const PtrStepSz* trains, int n, const Mask& mask, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream) { (void)cc; if (query.cols <= 64) { matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); } else if (query.cols <= 128) { matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); } /*else if (query.cols <= 256) { matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); } else if (query.cols <= 512) { matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); } else if (query.cols <= 1024) { matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); }*/ else { match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); } } /////////////////////////////////////////////////////////////////////////////// // Match caller template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream) { if (mask.data) { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), SingleMask(mask), trainIdx, distance, cc, stream); } else { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), WithOutMask(), trainIdx, distance, cc, stream); } } template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream) { if (mask.data) { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), SingleMask(mask), trainIdx, distance, cc, stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), WithOutMask(), trainIdx, distance, cc, stream); } } //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream) { if (mask.data) { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), SingleMask(mask), trainIdx, distance, cc, stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), WithOutMask(), trainIdx, distance, cc, stream); } } template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream) { if (masks.data) { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); } else { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); } } template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream) { if (masks.data) { matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); } } //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream) { if (masks.data) { matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); } } template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); } // namespace bf_match }}} // namespace cv { namespace gpu { namespace device { #endif /* CUDA_DISABLER */