From 8274ed22e42c7b7c0ff6c05708208a3b6cb75259 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Mon, 31 Jan 2011 13:20:52 +0000
Subject: [PATCH] fixed gpu tests (BruteForceMatcher_GPU, divide, phase,
 cartToPolar, async)

minor code refactoring
---
 modules/gpu/include/opencv2/gpu/gpu.hpp     |  10 +-
 modules/gpu/src/brute_force_matcher.cpp     |  38 +-
 modules/gpu/src/cuda/brute_force_matcher.cu | 597 ++++++++------------
 modules/gpu/src/imgproc_gpu.cpp             |  87 ++-
 tests/gpu/src/arithm.cpp                    |  41 +-
 tests/gpu/src/brute_force_matcher.cpp       |  61 +-
 tests/gpu/src/gputest_main.cpp              |   1 -
 tests/gpu/src/imgproc_gpu.cpp               |  50 +-
 tests/gpu/src/operator_async_call.cpp       | 151 ++---
 9 files changed, 460 insertions(+), 576 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index b719417765..2531d1e195 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -671,10 +671,12 @@ namespace cv
         //! output will have CV_32FC1 type
         CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect);
 
-        //! applies Canny edge detector and produces the edge map
-        //! supprots only CV_8UC1 source type
-        //! disabled until fix crash
-        CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double threshold1, double threshold2, int apertureSize = 3);
+        // applies Canny edge detector and produces the edge map
+        // disabled until fix crash
+        //CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double threshold1, double threshold2, int apertureSize = 3);
+        //CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, GpuMat& buffer, double threshold1, double threshold2, int apertureSize = 3);
+        //CV_EXPORTS void Canny(const GpuMat& srcDx, const GpuMat& srcDy, GpuMat& edges, double threshold1, double threshold2, int apertureSize = 3);
+        //CV_EXPORTS void Canny(const GpuMat& srcDx, const GpuMat& srcDy, GpuMat& edges, GpuMat& buffer, double threshold1, double threshold2, int apertureSize = 3);
 
         //! computes Harris cornerness criteria at each image pixel
         CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101);
diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp
index 344712125c..4806e6705c 100644
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
@@ -104,6 +104,18 @@ namespace cv { namespace gpu { namespace bfmatcher
         const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
 }}}
 
+namespace
+{
+    class ImgIdxSetter
+    {
+    public:
+        ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {}
+        void operator()(DMatch& m) const {m.imgIdx = imgIdx;}
+    private:
+        int imgIdx;
+    };
+}
+
 cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_)
 {
 }
@@ -185,7 +197,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
         return;
 
     CV_Assert(trainIdx.type() == CV_32SC1 && trainIdx.isContinuous());
-    CV_Assert(distance.type() == CV_32FC1 && distance.isContinuous() && distance.size().area() == trainIdx.size().area());
+    CV_Assert(distance.type() == CV_32FC1 && distance.isContinuous() && distance.cols == trainIdx.cols);
 
     const int nQuery = trainIdx.cols;
@@ -309,8 +321,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
         return;
 
     CV_Assert(trainIdx.type() == CV_32SC1 && trainIdx.isContinuous());
-    CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.isContinuous());
-    CV_Assert(distance.type() == CV_32FC1 && distance.isContinuous());
+    CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.isContinuous() && imgIdx.cols == trainIdx.cols);
+    CV_Assert(distance.type() == CV_32FC1 && distance.isContinuous() && imgIdx.cols == trainIdx.cols);
 
     const int nQuery = trainIdx.cols;
@@ -390,7 +402,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
     trainIdx.setTo(Scalar::all(-1));
 
     distance.create(nQuery, k, CV_32F);
-    allDist.create(nQuery, nTrain, CV_32F);
+    ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist);
 
     match_caller_t func = match_callers[distType][queryDescs.depth()];
     CV_Assert(func != 0);
@@ -451,18 +463,6 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
     knnMatchDownload(trainIdx, distance, matches, compactResult);
 }
 
-namespace
-{
-    class ImgIdxSetter
-    {
-    public:
-        ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {}
-        void operator()(DMatch& m) const {m.imgIdx = imgIdx;}
-    private:
-        int imgIdx;
-    };
-}
-
 void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, vector< vector<DMatch> >& matches, int knn,
     const vector<GpuMat>& masks, bool compactResult)
 {
@@ -538,9 +538,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
     CV_Assert(queryDescs.channels() == 1 && queryDescs.depth() < CV_64F);
     CV_Assert(trainDescs.type() == queryDescs.type() && trainDescs.cols == queryDescs.cols);
-    CV_Assert(trainIdx.empty() || trainIdx.rows == nQuery);
+    CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size()));
 
-    nMatches.create(1, nQuery, CV_32SC1);
+    ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
     nMatches.setTo(Scalar::all(0));
 
     if (trainIdx.empty())
     {
@@ -561,7 +561,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trai
         return;
 
     CV_Assert(trainIdx.type() == CV_32SC1);
-    CV_Assert(nMatches.type() == CV_32SC1 && nMatches.isContinuous() && nMatches.size().area() == trainIdx.rows);
+    CV_Assert(nMatches.type() == CV_32SC1 && nMatches.isContinuous() && nMatches.cols >= trainIdx.rows);
     CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size());
 
     const int nQuery = trainIdx.rows;
diff --git a/modules/gpu/src/cuda/brute_force_matcher.cu b/modules/gpu/src/cuda/brute_force_matcher.cu
index b28aee17ff..44f823d4d6 100644
--- a/modules/gpu/src/cuda/brute_force_matcher.cu
+++ b/modules/gpu/src/cuda/brute_force_matcher.cu
@@ -64,6 +64,7 @@ namespace cv { namespace gpu { namespace bfmatcher
         {
             return mask.ptr(queryIdx)[trainIdx] != 0;
         }
+
     private:
         PtrStep mask;
     };
@@ -82,6 +83,7 @@ namespace cv { namespace gpu { namespace bfmatcher
         {
             return curMask.data == 0 || curMask.ptr(queryIdx)[trainIdx] != 0;
        }
+
    private:
        PtrStep* maskCollection;
        PtrStep curMask;
@@ -102,123 +104,99 @@ namespace cv { namespace gpu { namespace bfmatcher
    ///////////////////////////////////////////////////////////////////////////////
    // Reduce Sum
 
-    template <int BLOCK_DIM_X>
-    __device__ void reduceSum(float* sdiff, float mySum, int tid)
-    {
-        sdiff[tid] = mySum;
-        __syncthreads();
+    template <int BLOCK_DIM_X> __device__ void reduceSum(float* sdiff_row, float& mySum);
 
-        if (BLOCK_DIM_X == 512)
-        {
-            if (tid < 256)
-            {
-                sdiff[tid] = mySum += sdiff[tid + 256]; __syncthreads();
-                sdiff[tid] = mySum += sdiff[tid + 128]; __syncthreads();
-                sdiff[tid] = mySum += sdiff[tid + 64]; __syncthreads();
-            }
-            volatile float* smem = sdiff;
-            smem[tid] = mySum += smem[tid + 32];
-            smem[tid] = mySum += smem[tid + 16];
-            smem[tid] = mySum += smem[tid + 8];
-            smem[tid] = mySum += smem[tid + 4];
-            smem[tid] = mySum += smem[tid + 2];
-            smem[tid] = mySum += smem[tid + 1];
-        }
-        if (BLOCK_DIM_X == 256)
-        {
-            if (tid < 128)
-            {
-                sdiff[tid] = mySum += sdiff[tid + 128]; __syncthreads();
-                sdiff[tid] = mySum += sdiff[tid + 64]; __syncthreads();
-            }
-            volatile float* smem = sdiff;
-            smem[tid] = mySum += smem[tid + 32];
-            smem[tid] = mySum += smem[tid + 16];
-            smem[tid] = mySum += smem[tid + 8];
-            smem[tid] = mySum += smem[tid + 4];
-            smem[tid] = mySum += smem[tid + 2];
-            smem[tid] = mySum += smem[tid + 1];
-        }
-        if (BLOCK_DIM_X == 128)
-        {
-            if (tid < 64)
-            {
-                sdiff[tid] = mySum += sdiff[tid + 64]; __syncthreads();
-            }
-            volatile float* smem = sdiff;
-            smem[tid] = mySum += smem[tid + 32];
-            smem[tid] = mySum += smem[tid + 16];
-            smem[tid] = mySum += smem[tid + 8];
-            smem[tid] = mySum += smem[tid + 4];
-            smem[tid] = mySum += smem[tid + 2];
-            smem[tid] = mySum += smem[tid + 1];
-        }
+    template <> __device__ void reduceSum<16>(float* sdiff_row, float& mySum)
+    {
+        volatile float* smem = sdiff_row;
+
+        smem[threadIdx.x] = mySum;
 
-        volatile float* smem = sdiff;
-        if (BLOCK_DIM_X == 64)
+        if (threadIdx.x < 8)
         {
-            if (tid < 32)
-            {
-                smem[tid] = mySum += smem[tid + 32];
-                smem[tid] = mySum += smem[tid + 16];
-                smem[tid] = mySum += smem[tid + 8];
-                smem[tid] = mySum += smem[tid + 4];
-                smem[tid] = mySum += smem[tid + 2];
-                smem[tid] = mySum += smem[tid + 1];
-            }
-        }
-        if (BLOCK_DIM_X == 32)
-        {
-            if (tid < 16)
-            {
-                smem[tid] = mySum += smem[tid + 16];
-                smem[tid] = mySum += smem[tid + 8];
-                smem[tid] = mySum += smem[tid + 4];
-                smem[tid] = mySum += smem[tid + 2];
-                smem[tid] = mySum += smem[tid + 1];
-            }
-        }
-        if (BLOCK_DIM_X == 16)
-        {
-            if (tid < 8)
-            {
-                smem[tid] = mySum += smem[tid + 8];
-                smem[tid] = mySum += smem[tid + 4];
-                smem[tid] = mySum += smem[tid + 2];
-                smem[tid] = mySum += smem[tid + 1];
-            }
-        }
-        if (BLOCK_DIM_X == 8)
-        {
-            if (tid < 4)
-            {
-                smem[tid] = mySum += smem[tid + 4];
-                smem[tid] = mySum += smem[tid + 2];
-                smem[tid] = mySum += smem[tid + 1];
-            }
-        }
-        if (BLOCK_DIM_X == 4)
-        {
-            if (tid < 2)
-            {
-                smem[tid] = mySum += smem[tid + 2];
-                smem[tid] = mySum += smem[tid + 1];
-            }
-        }
-        if (BLOCK_DIM_X == 2)
-        {
-            if (tid < 1)
-            {
-                smem[tid] = mySum += smem[tid + 1];
-            }
+            smem[threadIdx.x] = mySum += smem[threadIdx.x + 8];
+            smem[threadIdx.x] = mySum += smem[threadIdx.x + 4];
+            smem[threadIdx.x] = mySum += smem[threadIdx.x + 2];
+            smem[threadIdx.x] = mySum += smem[threadIdx.x + 1];
         }
     }
 
+    ///////////////////////////////////////////////////////////////////////////////
+    // Distance
+
+    class L1Dist
+    {
+    public:
+        __device__ L1Dist() : mySum(0.0f) {}
+
+        __device__ void reduceIter(float val1, float val2)
+        {
+            mySum += fabs(val1 - val2);
+        }
+
+        template <int BLOCK_DIM_X>
+        __device__ void reduceAll(float* sdiff_row)
+        {
+            reduceSum<BLOCK_DIM_X>(sdiff_row, mySum);
+        }
+
+        __device__ operator float() const
+        {
+            return mySum;
+        }
+
+    private:
+        float mySum;
+    };
+
+    class L2Dist
+    {
+    public:
+        __device__ L2Dist() : mySum(0.0f) {}
+
+        __device__ void reduceIter(float val1, float val2)
+        {
+            float reg = val1 - val2;
+            mySum += reg * reg;
+        }
+
+        template <int BLOCK_DIM_X>
+        __device__ void reduceAll(float* sdiff_row)
+        {
+            reduceSum<BLOCK_DIM_X>(sdiff_row, mySum);
+        }
+
+        __device__ operator float() const
+        {
+            return sqrtf(mySum);
+        }
+
+    private:
+        float mySum;
+    };
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // reduceDescDiff
+
+    template <int BLOCK_DIM_X, typename Dist, typename T>
+    __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, Dist& dist,
+        float* sdiff_row)
+    {
+        for (int i = threadIdx.x; i < desc_len; i += BLOCK_DIM_X)
+            dist.reduceIter(queryDescs[i], trainDescs[i]);
+
+        dist.reduceAll<BLOCK_DIM_X>(sdiff_row);
+    }
+
+///////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////// Match //////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////
+
    ///////////////////////////////////////////////////////////////////////////////
    // loadDescsVals
 
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, typename T>
-    __device__ void loadDescsVals(const T* descs, int desc_len, float* smem, float* queryVals)
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, typename T>
+    __device__ void loadDescsVals(const T* descs, int desc_len, float* queryVals, float* smem)
    {
        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
 
@@ -237,111 +215,45 @@ namespace cv { namespace gpu { namespace bfmatcher
    }
 
    ///////////////////////////////////////////////////////////////////////////////
-    // Distance
-
-    template <int BLOCK_DIM_X>
-    class L1Dist
-    {
-    public:
-        __device__ L1Dist() : mySum(0) {}
-
-        __device__ void reduceIter(float val1, float val2)
-        {
-            mySum += fabs(val1 - val2);
-        }
-
-        __device__ void reduceAll(float* sdiff, int tid)
-        {
-            reduceSum<BLOCK_DIM_X>(sdiff, mySum, tid);
-        }
-
-        static __device__ float finalResult(float res)
-        {
-            return res;
-        }
-    private:
-        float mySum;
-    };
-
-    template <int BLOCK_DIM_X>
-    class L2Dist
-    {
-    public:
-        __device__ L2Dist() : mySum(0) {}
-
-        __device__ void reduceIter(float val1, float val2)
-        {
-            float reg = val1 - val2;
-            mySum += reg * reg;
-        }
-
-        __device__ void reduceAll(float* sdiff, int tid)
-        {
-            reduceSum<BLOCK_DIM_X>(sdiff, mySum, tid);
-        }
-
-        static __device__ float finalResult(float res)
-        {
-            return sqrtf(res);
-        }
-    private:
-        float mySum;
-    };
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // reduceDescDiff
-
-    template <int BLOCK_DIM_X, typename Dist, typename T>
-    __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, float* sdiff)
-    {
-        const int tid = threadIdx.x;
-
-        Dist dist;
-
-        for (int i = tid; i < desc_len; i += BLOCK_DIM_X)
-            dist.reduceIter(queryDescs[i], trainDescs[i]);
-
-        dist.reduceAll(sdiff, tid);
-    }
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // reduceDescDiff_smem
+    // reduceDescDiffCached
 
    template <int N> struct UnrollDescDiff
    {
        template <typename Dist, typename T>
-        static __device__ void calcCheck(Dist& dist, const float* queryVals, const T* trainDescs,
-            int ind, int desc_len)
+        static __device__ void calcCheck(const float* queryVals, const T* trainDescs, int desc_len,
+            Dist& dist, int ind)
        {
            if (ind < desc_len)
+            {
                dist.reduceIter(*queryVals, trainDescs[ind]);
 
-            ++queryVals;
+                ++queryVals;
 
-            UnrollDescDiff<N - 1>::calcCheck(dist, queryVals, trainDescs, ind + blockDim.x, desc_len);
+                UnrollDescDiff<N - 1>::calcCheck(queryVals, trainDescs, desc_len, dist, ind + blockDim.x);
+            }
        }
 
        template <typename Dist, typename T>
-        static __device__ void calcWithoutCheck(Dist& dist, const float* queryVals, const T* trainDescs)
+        static __device__ void calcWithoutCheck(const float* queryVals, const T* trainDescs, Dist& dist)
        {
            dist.reduceIter(*queryVals, *trainDescs);
 
            ++queryVals;
            trainDescs += blockDim.x;
 
-            UnrollDescDiff<N - 1>::calcWithoutCheck(dist, queryVals, trainDescs);
+            UnrollDescDiff<N - 1>::calcWithoutCheck(queryVals, trainDescs, dist);
        }
    };
    template <> struct UnrollDescDiff<0>
    {
        template <typename Dist, typename T>
-        static __device__ void calcCheck(Dist& dist, const float* queryVals, const T* trainDescs,
-            int ind, int desc_len)
+        static __device__ void calcCheck(const float* queryVals, const T* trainDescs, int desc_len,
+            Dist& dist, int ind)
        {
        }
 
        template <typename Dist, typename T>
-        static __device__ void calcWithoutCheck(Dist& dist, const float* queryVals, const T* trainDescs)
+        static __device__ void calcWithoutCheck(const float* queryVals, const T* trainDescs, Dist& dist)
        {
        }
    };
@@ -351,106 +263,82 @@ namespace cv { namespace gpu { namespace bfmatcher
    struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, false>
    {
        template <typename Dist, typename T>
-        static __device__ void calc(Dist& dist, const float* queryVals, const T* trainDescs, int desc_len)
+        static __device__ void calc(const float* queryVals, const T* trainDescs, int desc_len, Dist& dist)
        {
-            UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcCheck(dist, queryVals, trainDescs,
-                threadIdx.x, desc_len);
+            UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcCheck(queryVals, trainDescs, desc_len,
+                dist, threadIdx.x);
        }
    };
    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN> struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, true>
    {
        template <typename Dist, typename T>
-        static __device__ void calc(Dist& dist, const float* queryVals, const T* trainDescs, int desc_len)
+        static __device__ void calc(const float* queryVals, const T* trainDescs, int desc_len, Dist& dist)
        {
-            UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcWithoutCheck(dist, queryVals,
-                trainDescs + threadIdx.x);
+            UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcWithoutCheck(queryVals,
+                trainDescs + threadIdx.x, dist);
        }
    };
 
    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename Dist, typename T>
-    __device__ void reduceDescDiff_smem(const float* queryVals, const T* trainDescs, int desc_len, float* sdiff)
-    {
-        const int tid = threadIdx.x;
+    __device__ void reduceDescDiffCached(const float* queryVals, const T* trainDescs, int desc_len, Dist& dist,
+        float* sdiff_row)
+    {
+        DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>::calc(queryVals,
+            trainDescs, desc_len, dist);
 
-        Dist dist;
-
-        DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>::calc(dist, queryVals,
-            trainDescs, desc_len);
-
-        dist.reduceAll(sdiff, tid);
+        dist.reduceAll<BLOCK_DIM_X>(sdiff_row);
    }
 
-///////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////// Match //////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////
-
    ///////////////////////////////////////////////////////////////////////////////
-    // warpReduceMin
+    // warpReduceMinIdxIdx
 
    template <int BLOCK_DIM_Y>
-    __device__ void warpReduceMin(int tid, volatile float* sdata, volatile int* strainIdx, volatile int* simgIdx)
-    {
-        float minSum = sdata[tid];
+    __device__ void warpReduceMinIdxIdx(float& myMin, int& myBestTrainIdx, int& myBestImgIdx,
+        volatile float* sdata, volatile int* strainIdx, volatile int* simgIdx);
 
-        if (BLOCK_DIM_Y >= 64)
+    template <>
+    __device__ void warpReduceMinIdxIdx<16>(float& myMin, int& myBestTrainIdx, int& myBestImgIdx,
+        volatile float* smin, volatile int* strainIdx, volatile int* simgIdx)
+    {
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+
+        if (tid < 8)
        {
-            float reg = sdata[tid + 32];
-            if (reg < minSum)
+            myMin = smin[tid];
+            myBestTrainIdx = strainIdx[tid];
+            myBestImgIdx = simgIdx[tid];
+
+            float reg = smin[tid + 8];
+            if (reg < myMin)
            {
-                sdata[tid] = minSum = reg;
-                strainIdx[tid] = strainIdx[tid + 32];
-                simgIdx[tid] = simgIdx[tid + 32];
+                smin[tid] = myMin = reg;
+                strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 8];
+                simgIdx[tid] = myBestImgIdx = simgIdx[tid + 8];
            }
-        }
-        if (BLOCK_DIM_Y >= 32)
-        {
-            float reg = sdata[tid + 16];
-            if (reg < minSum)
+
+            reg = smin[tid + 4];
+            if (reg < myMin)
            {
-                sdata[tid] = minSum = reg;
-                strainIdx[tid] = strainIdx[tid + 16];
-                simgIdx[tid] = simgIdx[tid + 16];
+                smin[tid] = myMin = reg;
+                strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 4];
+                simgIdx[tid] = myBestImgIdx = simgIdx[tid + 4];
            }
-        }
-        if (BLOCK_DIM_Y >= 16)
-        {
-            float reg = sdata[tid + 8];
-            if (reg < minSum)
+
+            reg = smin[tid + 2];
+            if (reg < myMin)
            {
-                sdata[tid] = minSum = reg;
-                strainIdx[tid] = strainIdx[tid + 8];
-                simgIdx[tid] = simgIdx[tid + 8];
+                smin[tid] = myMin = reg;
+                strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 2];
+                simgIdx[tid] = myBestImgIdx = simgIdx[tid + 2];
            }
-        }
-        if (BLOCK_DIM_Y >= 8)
-        {
-            float reg = sdata[tid + 4];
-            if (reg < minSum)
+
+            reg = smin[tid + 1];
+            if (reg < myMin)
            {
-                sdata[tid] = minSum = reg;
-                strainIdx[tid] = strainIdx[tid + 4];
-                simgIdx[tid] = simgIdx[tid + 4];
-            }
-        }
-        if (BLOCK_DIM_Y >= 4)
-        {
-            float reg = sdata[tid + 2];
-            if (reg < minSum)
-            {
-                sdata[tid] = minSum = reg;
-                strainIdx[tid] = strainIdx[tid + 2];
-                simgIdx[tid] = simgIdx[tid + 2];
-            }
-        }
-        if (BLOCK_DIM_Y >= 2)
-        {
-            float reg = sdata[tid + 1];
-            if (reg < minSum)
-            {
-                sdata[tid] = minSum = reg;
-                strainIdx[tid] = strainIdx[tid + 1];
-                simgIdx[tid] = simgIdx[tid + 1];
+                smin[tid] = myMin = reg;
+                strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 1];
+                simgIdx[tid] = myBestImgIdx = simgIdx[tid + 1];
            }
        }
    }
 
@@ -458,9 +346,9 @@ namespace cv { namespace gpu { namespace bfmatcher
    ///////////////////////////////////////////////////////////////////////////////
    // findBestMatch
 
-    template <typename Dist, int BLOCK_DIM_Y>
-    __device__ void findBestMatch(int queryIdx, float myMin, int myBestTrainIdx, int myBestImgIdx,
-        float* smin, int* strainIdx, int* simgIdx, int* trainIdx, int* imgIdx, float* distance)
+    template <int BLOCK_DIM_Y>
+    __device__ void findBestMatch(float& myMin, int& myBestTrainIdx, int& myBestImgIdx,
+        float* smin, int* strainIdx, int* simgIdx)
    {
        if (threadIdx.x == 0)
        {
@@ -470,27 +358,13 @@ namespace cv { namespace gpu { namespace bfmatcher
        }
        __syncthreads();
 
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
-        if (tid < 32)
-            warpReduceMin<BLOCK_DIM_Y>(tid, smin, strainIdx, simgIdx);
-
-        if (threadIdx.x == 0 && threadIdx.y == 0)
-        {
-            float minSum = smin[0];
-            int bestTrainIdx = strainIdx[0];
-            int bestImgIdx = simgIdx[0];
-
-            imgIdx[queryIdx] = bestImgIdx;
-            trainIdx[queryIdx] = bestTrainIdx;
-            distance[queryIdx] = Dist::finalResult(minSum);
-        }
+        warpReduceMinIdxIdx<BLOCK_DIM_Y>(myMin, myBestTrainIdx, myBestImgIdx, smin, strainIdx, simgIdx);
    }
 
    ///////////////////////////////////////////////////////////////////////////////
    // ReduceDescCalculator
 
-    template <int BLOCK_DIM_X, typename Dist, typename T>
+    template <int BLOCK_DIM_X, typename T>
    class ReduceDescCalculatorSimple
    {
    public:
        __device__ void prepare(const T* queryDescs_, int desc_len, float* smem)
        {
            queryDescs = queryDescs_;
        }
 
-        __device__ void calc(const T* trainDescs, int desc_len, float* sdiff_row) const
+        template <typename Dist>
+        __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, float* sdiff_row) const
        {
-            reduceDescDiff<BLOCK_DIM_X, Dist>(queryDescs, trainDescs, desc_len, sdiff_row);
+            reduceDescDiff<BLOCK_DIM_X>(queryDescs, trainDescs, desc_len, dist, sdiff_row);
        }
 
    private:
        const T* queryDescs;
    };
 
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename Dist, typename T>
-    class ReduceDescCalculatorSmem
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename T>
+    class ReduceDescCalculatorCached
    {
    public:
        __device__ void prepare(const T* queryDescs, int desc_len, float* smem)
        {
-            loadDescsVals<BLOCK_DIM_X, BLOCK_DIM_Y, MAX_DESCRIPTORS_LEN>(queryDescs, desc_len, smem, queryVals);
+            loadDescsVals<BLOCK_DIM_X, BLOCK_DIM_Y, MAX_DESCRIPTORS_LEN>(queryDescs, desc_len, queryVals, smem);
        }
 
-        __device__ void calc(const T* trainDescs, int desc_len, float* sdiff_row) const
+        template <typename Dist>
+        __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, float* sdiff_row) const
        {
-            reduceDescDiff_smem<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, Dist>(queryVals, trainDescs,
-                desc_len, sdiff_row);
+            reduceDescDiffCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>(queryVals, trainDescs,
+                desc_len, dist, sdiff_row);
        }
 
    private:
@@ -531,26 +406,26 @@ namespace cv { namespace gpu { namespace bfmatcher
    ///////////////////////////////////////////////////////////////////////////////
    // matchDescs loop
 
-    template <typename ReduceDescCalculator, typename T, typename Mask>
-    __device__ void matchDescs(int queryIdx, const int imgIdx, const DevMem2D_<T>& trainDescs_,
+    template <typename Dist, typename ReduceDescCalculator, typename T, typename Mask>
+    __device__ void matchDescs(int queryIdx, int imgIdx, const DevMem2D_<T>& trainDescs_,
        const Mask& m, const ReduceDescCalculator& reduceDescCalc,
-        float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx)
+        float& myMin, int& myBestTrainIdx, int& myBestImgIdx, float* sdiff_row)
    {
-        const T* trainDescs = trainDescs_.ptr(threadIdx.y);
-        const int trainDescsStep = blockDim.y * trainDescs_.step / sizeof(T);
-        for (int trainIdx = threadIdx.y; trainIdx < trainDescs_.rows;
-             trainIdx += blockDim.y, trainDescs += trainDescsStep)
+        for (int trainIdx = threadIdx.y; trainIdx < trainDescs_.rows; trainIdx += blockDim.y)
        {
            if (m(queryIdx, trainIdx))
            {
-                reduceDescCalc.calc(trainDescs, trainDescs_.cols, sdiff_row);
+                const T* trainDescs = trainDescs_.ptr(trainIdx);
+
+                Dist dist;
+
+                reduceDescCalc.calc(trainDescs, trainDescs_.cols, dist, sdiff_row);
 
                if (threadIdx.x == 0)
                {
-                    float reg = sdiff_row[0];
-                    if (reg < myMin)
+                    if (dist < myMin)
                    {
-                        myMin = reg;
+                        myMin = dist;
                        myBestTrainIdx = trainIdx;
                        myBestImgIdx = imgIdx;
                    }
@@ -570,18 +445,19 @@ namespace cv { namespace gpu { namespace bfmatcher
        {
        }
 
-        template <typename ReduceDescCalculator, typename Mask>
+        template <typename Dist, typename ReduceDescCalculator, typename Mask>
        __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc,
-            float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx) const
+            float& myMin, int& myBestTrainIdx, int& myBestImgIdx, float* sdiff_row) const
        {
-            matchDescs(queryIdx, 0, trainDescs, m, reduceDescCalc,
-                sdiff_row, myMin, myBestTrainIdx, myBestImgIdx);
+            matchDescs<Dist>(queryIdx, 0, trainDescs, m, reduceDescCalc,
+                myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
        }
 
        __device__ int desc_len() const
        {
            return trainDescs.cols;
        }
+
    private:
        DevMem2D_<T> trainDescs;
    };
@@ -595,16 +471,16 @@ namespace cv { namespace gpu { namespace bfmatcher
        {
        }
 
-        template <typename ReduceDescCalculator, typename Mask>
+        template <typename Dist, typename ReduceDescCalculator, typename Mask>
        __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc,
-            float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx) const
+            float& myMin, int& myBestTrainIdx, int& myBestImgIdx, float* sdiff_row) const
        {
            for (int imgIdx = 0; imgIdx < nImg; ++imgIdx)
            {
                DevMem2D_<T> trainDescs = trainCollection[imgIdx];
                m.nextMask();
-                matchDescs(queryIdx, imgIdx, trainDescs, m, reduceDescCalc,
-                    sdiff_row, myMin, myBestTrainIdx, myBestImgIdx);
+                matchDescs<Dist>(queryIdx, imgIdx, trainDescs, m, reduceDescCalc,
+                    myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
            }
        }
 
@@ -612,6 +488,7 @@ namespace cv { namespace gpu { namespace bfmatcher
        {
            return desclen;
        }
+
    private:
        const DevMem2D_<T>* trainCollection;
        int nImg;
@@ -623,12 +500,10 @@ namespace cv { namespace gpu { namespace bfmatcher
    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename ReduceDescCalculator, typename Dist, typename T, typename Train, typename Mask>
-    __global__ void match(PtrStep_<T> queryDescs_, Train train, Mask mask, int* trainIdx, int* imgIdx, float* distance)
+    __global__ void match(const PtrStep_<T> queryDescs_, const Train train, const Mask mask,
+        int* trainIdx, int* imgIdx, float* distance)
    {
-        __shared__ float sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];
-        __shared__ float smin[64];
-        __shared__ int strainIdx[64];
-        __shared__ int simgIdx[64];
+        __shared__ float smem[BLOCK_DIM_X * BLOCK_DIM_Y];
 
        const int queryIdx = blockIdx.x;
 
        int myBestTrainIdx = -1;
        int myBestImgIdx = -1;
@@ -637,24 +512,39 @@ namespace cv { namespace gpu { namespace bfmatcher
        float myMin = numeric_limits_gpu<float>::max();
 
        {
-            float* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;
-            Mask m = mask;
-            ReduceDescCalculator reduceDescCalc;
-            reduceDescCalc.prepare(queryDescs_.ptr(queryIdx), train.desc_len(), sdiff);
-
-            train.loop(queryIdx, m, reduceDescCalc, sdiff_row, myMin, myBestTrainIdx, myBestImgIdx);
-        }
-
-        findBestMatch<Dist, BLOCK_DIM_Y>(queryIdx, myMin, myBestTrainIdx, myBestImgIdx,
-            smin, strainIdx, simgIdx, trainIdx, imgIdx, distance);
+            float* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;
+
+            Mask m = mask;
+
+            ReduceDescCalculator reduceDescCalc;
+
+            reduceDescCalc.prepare(queryDescs_.ptr(queryIdx), train.desc_len(), smem);
+
+            train.template loop<Dist>(queryIdx, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
+        }
+        __syncthreads();
+
+        float* smin = smem;
+        int* strainIdx = (int*)(smin + BLOCK_DIM_Y);
+        int* simgIdx = strainIdx + BLOCK_DIM_Y;
+
+        findBestMatch<BLOCK_DIM_Y>(myMin, myBestTrainIdx, myBestImgIdx,
+            smin, strainIdx, simgIdx);
+
+        if (threadIdx.x == 0 && threadIdx.y == 0)
+        {
+            imgIdx[queryIdx] = myBestImgIdx;
+            trainIdx[queryIdx] = myBestTrainIdx;
+            distance[queryIdx] = myMin;
+        }
    }
 
    ///////////////////////////////////////////////////////////////////////////////
    // Match kernel callers
 
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, template <typename> class Dist, typename T,
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T,
        typename Train, typename Mask>
-    void match_caller(const DevMem2D_<T>& queryDescs, const Train& train,
+    void matchSimple_caller(const DevMem2D_<T>& queryDescs, const Train& train,
        const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
    {
        StaticAssert<BLOCK_DIM_Y <= 64>::check(); // blockDimY vals must reduce by warp
@@ -662,15 +552,15 @@ namespace cv { namespace gpu { namespace bfmatcher
        dim3 grid(queryDescs.rows, 1, 1);
        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
 
-        match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSimple<BLOCK_DIM_X, T>,
-            Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(queryDescs, train, mask, trainIdx.data,
+        match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSimple<BLOCK_DIM_X, T>, Dist, T>
+            <<<grid, threads>>>(queryDescs, train, mask, trainIdx.data,
            imgIdx.data, distance.data);
 
        cudaSafeCall( cudaThreadSynchronize() );
    }
 
    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN,
-        template <int> class Dist, typename T, typename Train, typename Mask>
-    void match_smem_caller(const DevMem2D_<T>& queryDescs, const Train& train,
+        typename Dist, typename T, typename Train, typename Mask>
+    void matchCached_caller(const DevMem2D_<T>& queryDescs, const Train& train,
        const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
    {
        StaticAssert<BLOCK_DIM_Y <= 64>::check(); // blockDimY vals must reduce by warp
@@ -680,9 +570,10 @@ namespace cv { namespace gpu { namespace bfmatcher
        dim3 grid(queryDescs.rows, 1, 1);
        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
 
-        match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSmem<BLOCK_DIM_X, BLOCK_DIM_Y, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, T>,
-            Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(queryDescs, train, mask, trainIdx.data,
+        match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorCached<BLOCK_DIM_X, BLOCK_DIM_Y, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, T>,
+            Dist, T>
+            <<<grid, threads>>>(queryDescs, train, mask, trainIdx.data,
            imgIdx.data, distance.data);
 
        cudaSafeCall( cudaThreadSynchronize() );
@@ -691,24 +582,24 @@ namespace cv { namespace gpu { namespace bfmatcher
    ///////////////////////////////////////////////////////////////////////////////
    // Match kernel chooser
 
-    template
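
Background note on the reduction pattern (not part of the patch above): the rewrite
replaces the old reduceSum() block-size switch with a single warp-synchronous
specialization for 16-wide thread rows (reduceSum<16>), and the distance functors
(L1Dist, L2Dist) now accumulate per-thread partial sums that are folded through that
routine. The sketch below is a minimal, self-contained CUDA program illustrating the
same idiom; reduceSum16, l1DistanceKernel, and the launch configuration are
illustrative names and choices, not code from OpenCV. The volatile-based reduction
relies on the pre-Volta implicit warp lockstep that the 2011-era code assumed; on
current GPUs one would use __syncwarp() or __shfl_down_sync() instead.

    #include <cstdio>
    #include <cmath>
    #include <cuda_runtime.h>

    // Warp-synchronous sum over the 16 floats owned by one row of threads.
    // Mirrors reduceSum<16> above: threads 8..15 only publish their value,
    // threads 0..7 fold; no __syncthreads() is needed because all 16 threads
    // sit in one warp (pre-Volta lockstep assumption).
    __device__ void reduceSum16(volatile float* srow, float& mySum)
    {
        srow[threadIdx.x] = mySum;

        if (threadIdx.x < 8)
        {
            srow[threadIdx.x] = mySum += srow[threadIdx.x + 8];
            srow[threadIdx.x] = mySum += srow[threadIdx.x + 4];
            srow[threadIdx.x] = mySum += srow[threadIdx.x + 2];
            srow[threadIdx.x] = mySum += srow[threadIdx.x + 1];
        }
    }

    // One 16-thread block computes the L1 distance between two descriptors,
    // using the same accumulate-then-reduce split the Dist functors express.
    __global__ void l1DistanceKernel(const float* query, const float* train,
                                     int len, float* result)
    {
        __shared__ float srow[16];

        // Strided per-thread accumulation, as in Dist::reduceIter.
        float mySum = 0.0f;
        for (int i = threadIdx.x; i < len; i += 16)
            mySum += fabsf(query[i] - train[i]);

        reduceSum16(srow, mySum);

        if (threadIdx.x == 0)
            *result = mySum; // thread 0 holds the full sum after the fold
    }

    int main()
    {
        const int len = 64;
        float hq[len], ht[len];
        for (int i = 0; i < len; ++i) { hq[i] = (float)i; ht[i] = (float)(i % 7); }

        float *dq, *dt, *dr;
        cudaMalloc(&dq, len * sizeof(float));
        cudaMalloc(&dt, len * sizeof(float));
        cudaMalloc(&dr, sizeof(float));
        cudaMemcpy(dq, hq, len * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(dt, ht, len * sizeof(float), cudaMemcpyHostToDevice);

        l1DistanceKernel<<<1, 16>>>(dq, dt, len, dr);

        float r = 0.0f;
        cudaMemcpy(&r, dr, sizeof(float), cudaMemcpyDeviceToHost); // implicit sync
        printf("L1 distance: %f\n", r);

        cudaFree(dq); cudaFree(dt); cudaFree(dr);
        return 0;
    }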