fixed errors in the gpu module on old video cards (SURF_GPU, BruteForceMatcher_GPU, min/max, setTo, convertTo)
added assertions after all kernel calls
This commit is contained in:
parent
5f175f9594
commit
deac5d972e
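Two changes recur through every hunk below: an error check is inserted after each kernel launch, and a compute-capability flag (`cc_12`) is threaded down to kernel dispatch so that pre-1.2 cards avoid kernel variants they cannot run. A minimal sketch of the launch-check pattern follows; `cudaSafeCall` is OpenCV's internal macro, so a simplified stand-in is defined here to keep the snippet self-contained, and `cudaThreadSynchronize` reflects the CUDA API of that era (today it would be `cudaDeviceSynchronize`).

```cpp
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Simplified stand-in for OpenCV's internal cudaSafeCall macro.
#define cudaSafeCall(expr) \
    do { \
        cudaError_t err = (expr); \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA error: %s at %s:%d\n", \
                    cudaGetErrorString(err), __FILE__, __LINE__); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

__global__ void dummyKernel(int* data) { data[threadIdx.x] = threadIdx.x; }

void launchChecked(int* d_data)
{
    dummyKernel<<<1, 32>>>(d_data);
    // The launch itself returns no status; cudaGetLastError picks up launch
    // failures (bad configuration, kernel image missing for an old arch).
    cudaSafeCall( cudaGetLastError() );
    // Synchronizing then surfaces asynchronous execution errors.
    cudaSafeCall( cudaThreadSynchronize() );
}
```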
@@ -435,8 +435,8 @@ namespace cv
         void enqueueCopy(const GpuMat& src, GpuMat& dst);
 
-        void enqueueMemSet(const GpuMat& src, Scalar val);
-        void enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask);
+        void enqueueMemSet(GpuMat& src, Scalar val);
+        void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask);
 
         // converts matrix type, e.g. from float to uchar, depending on type
         void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0);
@@ -76,18 +76,22 @@ namespace cv { namespace gpu { namespace bfmatcher
 {
     template <typename T>
     void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+        bool cc_12);
 
     template <typename T>
     void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+        bool cc_12);
 
     template <typename T>
     void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
         const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
-        const DevMem2Df& distance);
+        const DevMem2Df& distance,
+        bool cc_12);
 
     template <typename T>
     void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
         const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
-        const DevMem2Df& distance);
+        const DevMem2Df& distance,
+        bool cc_12);
 
     template <typename T>
     void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
@@ -160,17 +164,20 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
     using namespace cv::gpu::bfmatcher;
 
     typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+        bool cc_12);
 
     static const match_caller_t match_callers[2][8] =
     {
         {
-            matchSingleL1_gpu<unsigned char>, matchSingleL1_gpu<char>, matchSingleL1_gpu<unsigned short>,
-            matchSingleL1_gpu<short>, matchSingleL1_gpu<int>, matchSingleL1_gpu<float>, 0, 0
+            matchSingleL1_gpu<unsigned char>, matchSingleL1_gpu<signed char>,
+            matchSingleL1_gpu<unsigned short>, matchSingleL1_gpu<short>,
+            matchSingleL1_gpu<int>, matchSingleL1_gpu<float>, 0, 0
         },
         {
-            matchSingleL2_gpu<unsigned char>, matchSingleL2_gpu<char>, matchSingleL2_gpu<unsigned short>,
-            matchSingleL2_gpu<short>, matchSingleL2_gpu<int>, matchSingleL2_gpu<float>, 0, 0
+            matchSingleL2_gpu<unsigned char>, matchSingleL2_gpu<signed char>,
+            matchSingleL2_gpu<unsigned short>, matchSingleL2_gpu<short>,
+            matchSingleL2_gpu<int>, matchSingleL2_gpu<float>, 0, 0
         }
     };
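The `match_callers` table above is the dispatch pattern used across matchSingle, matchCollection, knnMatch and radiusMatch: a 2x8 array of function pointers indexed by distance type (L1/L2) and by OpenCV matrix depth, with null slots for unsupported depths that `CV_Assert` rejects. A minimal sketch with hypothetical names:

```cpp
// Sketch of the dispatch-table pattern (names are hypothetical).
typedef void (*caller_t)(int cols);

template <typename T> void callerImpl(int cols) { /* typed work here */ }

// Row: distance type (0 = L1, 1 = L2). Column: depth in OpenCV order
// (CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F); the last slots stay null.
static const caller_t callers[2][8] =
{
    { callerImpl<unsigned char>, callerImpl<signed char>, callerImpl<unsigned short>,
      callerImpl<short>, callerImpl<int>, callerImpl<float>, 0, 0 },
    { callerImpl<unsigned char>, callerImpl<signed char>, callerImpl<unsigned short>,
      callerImpl<short>, callerImpl<int>, callerImpl<float>, 0, 0 }
};

void dispatch(int distType, int depth, int cols)
{
    caller_t func = callers[distType][depth];
    if (func == 0) return;  // the real code does CV_Assert(func != 0)
    func(cols);
}
```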
@@ -185,9 +192,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
     match_caller_t func = match_callers[distType][queryDescs.depth()];
     CV_Assert(func != 0);
 
+    bool cc_12 = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
+
     // For a single train set there is no need to save imgIdx, so we just save imgIdx to trainIdx.
     // trainIdx is stored after imgIdx, so we don't lose its value.
-    func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance);
+    func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance, cc_12);
 }
 
 void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance,
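The `cc_12` flag combines a compile-time fact with a runtime one: `TargetArchs::builtWith(COMPUTE_12)` asks whether the binary was compiled with code for compute capability 1.2, and `DeviceInfo().supports(COMPUTE_12)` whether the current card can execute it; both must hold before a 1.2-only kernel variant may be chosen. Those helpers are internal to the gpu module, so here is a hedged stand-in that queries the raw CUDA runtime instead:

```cpp
#include <cuda_runtime.h>

// Assumption: the binary was built with code for sm_12 or newer; the real
// code also verifies this via TargetArchs::builtWith(COMPUTE_12).
bool deviceSupportsCC12()
{
    int dev = 0;
    cudaDeviceProp prop;
    if (cudaGetDevice(&dev) != cudaSuccess) return false;
    if (cudaGetDeviceProperties(&prop, dev) != cudaSuccess) return false;
    // Compute capability 1.2 (GT200 generation) brought, among other things,
    // a larger register file and relaxed memory-coalescing rules.
    return prop.major > 1 || (prop.major == 1 && prop.minor >= 2);
}
```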
@@ -284,17 +293,17 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
 
     typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
         const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
-        const DevMem2Df& distance);
+        const DevMem2Df& distance, bool cc_12);
 
     static const match_caller_t match_callers[2][8] =
     {
         {
-            matchCollectionL1_gpu<unsigned char>, matchCollectionL1_gpu<char>,
+            matchCollectionL1_gpu<unsigned char>, matchCollectionL1_gpu<signed char>,
             matchCollectionL1_gpu<unsigned short>, matchCollectionL1_gpu<short>,
             matchCollectionL1_gpu<int>, matchCollectionL1_gpu<float>, 0, 0
         },
         {
-            matchCollectionL2_gpu<unsigned char>, matchCollectionL2_gpu<char>,
+            matchCollectionL2_gpu<unsigned char>, matchCollectionL2_gpu<signed char>,
             matchCollectionL2_gpu<unsigned short>, matchCollectionL2_gpu<short>,
             matchCollectionL2_gpu<int>, matchCollectionL2_gpu<float>, 0, 0
         }
@@ -311,7 +320,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
     match_caller_t func = match_callers[distType][queryDescs.depth()];
     CV_Assert(func != 0);
 
-    func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance);
+    bool cc_12 = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
+
+    func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc_12);
 }
 
 void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx,
@@ -383,11 +394,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
     static const match_caller_t match_callers[2][8] =
     {
         {
-            knnMatchL1_gpu<unsigned char>, knnMatchL1_gpu<char>, knnMatchL1_gpu<unsigned short>,
+            knnMatchL1_gpu<unsigned char>, knnMatchL1_gpu<signed char>, knnMatchL1_gpu<unsigned short>,
             knnMatchL1_gpu<short>, knnMatchL1_gpu<int>, knnMatchL1_gpu<float>, 0, 0
         },
         {
-            knnMatchL2_gpu<unsigned char>, knnMatchL2_gpu<char>, knnMatchL2_gpu<unsigned short>,
+            knnMatchL2_gpu<unsigned char>, knnMatchL2_gpu<signed char>, knnMatchL2_gpu<unsigned short>,
             knnMatchL2_gpu<short>, knnMatchL2_gpu<int>, knnMatchL2_gpu<float>, 0, 0
         }
     };
@@ -522,11 +533,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
     static const radiusMatch_caller_t radiusMatch_callers[2][8] =
     {
         {
-            radiusMatchL1_gpu<unsigned char>, radiusMatchL1_gpu<char>, radiusMatchL1_gpu<unsigned short>,
+            radiusMatchL1_gpu<unsigned char>, radiusMatchL1_gpu<signed char>, radiusMatchL1_gpu<unsigned short>,
             radiusMatchL1_gpu<short>, radiusMatchL1_gpu<int>, radiusMatchL1_gpu<float>, 0, 0
         },
         {
-            radiusMatchL2_gpu<unsigned char>, radiusMatchL2_gpu<char>, radiusMatchL2_gpu<unsigned short>,
+            radiusMatchL2_gpu<unsigned char>, radiusMatchL2_gpu<signed char>, radiusMatchL2_gpu<unsigned short>,
             radiusMatchL2_gpu<short>, radiusMatchL2_gpu<int>, radiusMatchL2_gpu<float>, 0, 0
         }
     };
@@ -555,6 +555,7 @@ namespace cv { namespace gpu { namespace bfmatcher
 
         match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSimple<BLOCK_DIM_X, T>, Dist, T>
             <<<grid, threads>>>(queryDescs, train, mask, trainIdx.data,
             imgIdx.data, distance.data);
+        cudaSafeCall( cudaGetLastError() );
 
         cudaSafeCall( cudaThreadSynchronize() );
     }
@@ -575,6 +576,7 @@ namespace cv { namespace gpu { namespace bfmatcher
             Dist, T>
             <<<grid, threads>>>(queryDescs, train, mask, trainIdx.data,
             imgIdx.data, distance.data);
+        cudaSafeCall( cudaGetLastError() );
 
         cudaSafeCall( cudaThreadSynchronize() );
     }
@@ -584,7 +586,8 @@ namespace cv { namespace gpu { namespace bfmatcher
 
     template <typename Dist, typename T, typename Train, typename Mask>
     void match_chooser(const DevMem2D_<T>& queryDescs, const Train& train,
-        const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
+        const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+        bool cc_12)
     {
         if (queryDescs.cols < 64)
             matchCached_caller<16, 16, 64, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
@@ -596,7 +599,7 @@ namespace cv { namespace gpu { namespace bfmatcher
             matchCached_caller<16, 16, 128, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
         else if (queryDescs.cols < 256)
             matchCached_caller<16, 16, 256, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
-        else if (queryDescs.cols == 256)
+        else if (queryDescs.cols == 256 && cc_12)
             matchCached_caller<16, 16, 256, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
         else
             matchSimple_caller<16, 16, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
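`match_chooser` selects among cached-kernel variants by descriptor width; after this change the widest cached variant (256 elements) is taken only when `cc_12` holds, otherwise execution falls through to `matchSimple_caller`. The likely reason, an assumption not stated in the diff, is that the 256-wide caching kernel needs more per-block resources (shared memory and registers) than compute 1.0/1.1 devices provide. A simplified sketch of the selection logic:

```cpp
// Simplified sketch of the width-based kernel selection (bodies elided;
// the real callers are matchCached_caller / matchSimple_caller).
template <typename Dist, typename T>
void chooseKernel(int descriptorCols, bool cc_12)
{
    if (descriptorCols < 64)        { /* cached, 64 slots, partial    */ }
    else if (descriptorCols < 128)  { /* cached, 128 slots, partial   */ }
    else if (descriptorCols == 128) { /* cached, 128 slots, full      */ }
    else if (descriptorCols < 256)  { /* cached, 256 slots, partial   */ }
    else if (descriptorCols == 256 && cc_12) { /* cached, 256, full;
                                                  cc >= 1.2 only      */ }
    else                            { /* uncached fallback kernel     */ }
}
```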
@@ -606,95 +609,99 @@ namespace cv { namespace gpu { namespace bfmatcher
 
     template <typename T>
     void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+        bool cc_12)
     {
         SingleTrain<T> train((DevMem2D_<T>)trainDescs);
         if (mask.data)
         {
             SingleMask m(mask);
-            match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance);
+            match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
         }
         else
         {
-            match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);
+            match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
         }
     }
 
-    template void matchSingleL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
-    template void matchSingleL1_gpu<char          >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
-    template void matchSingleL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
-    template void matchSingleL1_gpu<short         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
-    template void matchSingleL1_gpu<int           >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
-    template void matchSingleL1_gpu<float         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
+    template void matchSingleL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchSingleL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchSingleL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchSingleL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchSingleL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
 
     template <typename T>
     void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+        bool cc_12)
     {
         SingleTrain<T> train((DevMem2D_<T>)trainDescs);
         if (mask.data)
         {
             SingleMask m(mask);
-            match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance);
+            match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
         }
         else
         {
-            match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);
+            match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
         }
     }
 
-    template void matchSingleL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
-    template void matchSingleL2_gpu<char          >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
-    template void matchSingleL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
-    template void matchSingleL2_gpu<short         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
-    template void matchSingleL2_gpu<int           >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
-    template void matchSingleL2_gpu<float         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
+    template void matchSingleL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchSingleL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchSingleL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchSingleL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
 
     template <typename T>
     void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
-        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
+        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
+        const DevMem2Df& distance, bool cc_12)
     {
         TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);
         if (maskCollection.data)
         {
             MaskCollection mask(maskCollection.data);
-            match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance);
+            match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
         }
         else
         {
-            match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);
+            match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
         }
     }
 
-    template void matchCollectionL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
-    template void matchCollectionL1_gpu<char          >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
-    template void matchCollectionL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
-    template void matchCollectionL1_gpu<short         >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
-    template void matchCollectionL1_gpu<int           >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
-    template void matchCollectionL1_gpu<float         >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
+    template void matchCollectionL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchCollectionL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchCollectionL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchCollectionL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchCollectionL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchCollectionL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
 
     template <typename T>
     void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
-        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
+        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
+        const DevMem2Df& distance, bool cc_12)
     {
         TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);
         if (maskCollection.data)
         {
             MaskCollection mask(maskCollection.data);
-            match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance);
+            match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
         }
         else
         {
-            match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);
+            match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
         }
     }
 
-    template void matchCollectionL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
-    template void matchCollectionL2_gpu<char          >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
-    template void matchCollectionL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
-    template void matchCollectionL2_gpu<short         >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
-    template void matchCollectionL2_gpu<int           >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
-    template void matchCollectionL2_gpu<float         >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
+    template void matchCollectionL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchCollectionL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchCollectionL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchCollectionL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
 
     ///////////////////////////////////////////////////////////////////////////////////
     //////////////////////////////////// Knn Match ////////////////////////////////////
@@ -748,6 +755,7 @@ namespace cv { namespace gpu { namespace bfmatcher
 
         calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads>>>(
             queryDescs, trainDescs, mask, distance);
+        cudaSafeCall( cudaGetLastError() );
 
         cudaSafeCall( cudaThreadSynchronize() );
     }
@@ -923,7 +931,10 @@ namespace cv { namespace gpu { namespace bfmatcher
         dim3 grid(trainIdx.rows, 1, 1);
 
         for (int i = 0; i < knn; ++i)
+        {
             findBestMatch<BLOCK_SIZE><<<grid, threads>>>(allDist, i, trainIdx, distance);
+            cudaSafeCall( cudaGetLastError() );
+        }
 
         cudaSafeCall( cudaThreadSynchronize() );
     }
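The findKnnMatch hunk above shows the loop variant of the new assertion pattern: `cudaGetLastError` is checked after every launch, but the synchronization is paid only once, after the loop. A sketch, reusing the `cudaSafeCall` stand-in from the first snippet; the kernel stub and launch configuration are placeholders:

```cpp
#include <cuda_runtime.h>

__global__ void findBestMatchStub(float* allDist, int i) { /* placeholder */ }

void runKnnPasses(float* d_allDist, int knn)
{
    for (int i = 0; i < knn; ++i)
    {
        findBestMatchStub<<<64, 256>>>(d_allDist, i);
        cudaSafeCall( cudaGetLastError() );   // catch per-launch failures
    }
    cudaSafeCall( cudaThreadSynchronize() );  // surface execution errors once
}
```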
@@ -949,12 +960,12 @@ namespace cv { namespace gpu { namespace bfmatcher
         findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
     }
 
-    template void knnMatchL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL1_gpu<char          >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL1_gpu<short         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL1_gpu<int           >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL1_gpu<float         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
+    template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
+    template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
+    template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
+    template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
+    template void knnMatchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
+    template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
 
     template <typename T>
     void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
@@ -974,12 +985,12 @@ namespace cv { namespace gpu { namespace bfmatcher
         findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
     }
 
-    template void knnMatchL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL2_gpu<char          >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL2_gpu<short         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL2_gpu<int           >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL2_gpu<float         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
+    template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
+    template void knnMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
+    template void knnMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
+    template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
+    template void knnMatchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
+    template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
 
     ///////////////////////////////////////////////////////////////////////////////////
     /////////////////////////////////// Radius Match //////////////////////////////////
@@ -1044,6 +1055,7 @@ namespace cv { namespace gpu { namespace bfmatcher
 
         radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads>>>(
             queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches, distance);
+        cudaSafeCall( cudaGetLastError() );
 
         cudaSafeCall( cudaThreadSynchronize() );
     }
@@ -1067,12 +1079,12 @@ namespace cv { namespace gpu { namespace bfmatcher
         }
     }
 
-    template void radiusMatchL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL1_gpu<char          >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL1_gpu<short         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL1_gpu<int           >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL1_gpu<float         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
+    template void radiusMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
+    template void radiusMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
+    template void radiusMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
+    template void radiusMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
+    template void radiusMatchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
+    template void radiusMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
 
     template <typename T>
     void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
@@ -1090,10 +1102,10 @@ namespace cv { namespace gpu { namespace bfmatcher
         }
     }
 
-    template void radiusMatchL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL2_gpu<char          >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL2_gpu<short         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL2_gpu<int           >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL2_gpu<float         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
+    template void radiusMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
+    template void radiusMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
+    template void radiusMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
+    template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
+    template void radiusMatchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
+    template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
 }}}
@@ -43,6 +43,7 @@
 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/vecmath.hpp"
+#include "opencv2/gpu/device/limits_gpu.hpp"
 
 using namespace cv::gpu;
 using namespace cv::gpu::device;
@@ -51,13 +52,9 @@ using namespace cv::gpu::device;
 #define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
 #endif
 
-#ifndef FLT_EPSILON
-#define FLT_EPSILON 1.192092896e-07F
-#endif
-
 namespace cv { namespace gpu { namespace color
 {
-    template<typename T> struct ColorChannel {};
+    template<typename T> struct ColorChannel;
     template<> struct ColorChannel<uchar>
     {
         typedef float worktype_f;
@@ -133,6 +130,7 @@ namespace cv { namespace gpu { namespace color
 
         RGB2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
             dst.data, dst.step, src.rows, src.cols, bidx);
+        cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -276,6 +274,7 @@ namespace cv { namespace gpu { namespace color
 
         RGB5x52RGB<GREEN_BITS, DSTCN><<<grid, threads, 0, stream>>>(src.data, src.step,
             dst.data, dst.step, src.rows, src.cols, bidx);
+        cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -304,6 +303,7 @@ namespace cv { namespace gpu { namespace color
 
         RGB2RGB5x5<SRCCN, GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step,
             dst.data, dst.step, src.rows, src.cols, bidx);
+        cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -385,6 +385,7 @@ namespace cv { namespace gpu { namespace color
 
         Gray2RGB<DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
             dst.data, dst.step, src.rows, src.cols);
+        cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -425,6 +426,7 @@ namespace cv { namespace gpu { namespace color
 
         Gray2RGB5x5<GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step,
             dst.data, dst.step, src.rows, src.cols);
+        cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -533,6 +535,7 @@ namespace cv { namespace gpu { namespace color
 
         RGB2Gray<SRCCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
             dst.data, dst.step, src.rows, src.cols, bidx);
+        cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -573,6 +576,7 @@ namespace cv { namespace gpu { namespace color
 
         RGB5x52Gray<GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step,
             dst.data, dst.step, src.rows, src.cols);
+        cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -698,6 +702,7 @@ namespace cv { namespace gpu { namespace color
 
         RGB2YCrCb<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
             dst.data, dst.step, src.rows, src.cols, bidx);
+        cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -756,6 +761,7 @@ namespace cv { namespace gpu { namespace color
 
         YCrCb2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
             dst.data, dst.step, src.rows, src.cols, bidx);
+        cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -902,6 +908,7 @@ namespace cv { namespace gpu { namespace color
 
         RGB2XYZ<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
             dst.data, dst.step, src.rows, src.cols);
+        cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -960,6 +967,7 @@ namespace cv { namespace gpu { namespace color
 
         XYZ2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
             dst.data, dst.step, src.rows, src.cols);
+        cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -1063,8 +1071,8 @@ namespace cv { namespace gpu { namespace color
         vmin = fmin(vmin, b);
 
         diff = v - vmin;
-        s = diff / (float)(fabs(v) + FLT_EPSILON);
-        diff = (float)(60. / (diff + FLT_EPSILON));
+        s = diff / (float)(fabs(v) + numeric_limits_gpu<float>::epsilon());
+        diff = (float)(60. / (diff + numeric_limits_gpu<float>::epsilon()));
 
         if (v == r)
             h = (g - b) * diff;
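The hunk above replaces the locally `#define`d `FLT_EPSILON` fallback with `numeric_limits_gpu<float>::epsilon()` from the newly included limits_gpu.hpp, OpenCV's device-side substitute for `std::numeric_limits`, which was not usable in CUDA device code with the toolchains of that era. A hedged sketch of the trait, using a hypothetical stand-in name:

```cpp
// Stand-in for numeric_limits_gpu from opencv2/gpu/device/limits_gpu.hpp;
// the value equals the classic single-precision machine epsilon.
template <typename T> struct numeric_limits_gpu_sketch;

template <> struct numeric_limits_gpu_sketch<float>
{
    __device__ static float epsilon() { return 1.192092896e-07f; }
};

__device__ float safeSaturation(float v, float diff)
{
    // Adding epsilon keeps the division well-defined when diff is zero,
    // exactly as in the HSV conversion above.
    return diff / (fabsf(v) + numeric_limits_gpu_sketch<float>::epsilon());
}
```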
@@ -1199,6 +1207,8 @@ namespace cv { namespace gpu { namespace color
         RGB2HSV<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data, src.step,
             dst.data, dst.step, src.rows, src.cols, bidx);
+
+        cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
     }
@@ -1281,6 +1291,8 @@ namespace cv { namespace gpu { namespace color
         HSV2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data, src.step,
             dst.data, dst.step, src.rows, src.cols, bidx);
+
+        cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
     }
@@ -1342,7 +1354,7 @@ namespace cv { namespace gpu { namespace color
         diff = vmax - vmin;
         l = (vmax + vmin) * 0.5f;
 
-        if (diff > FLT_EPSILON)
+        if (diff > numeric_limits_gpu<float>::epsilon())
         {
             s = l < 0.5f ? diff / (vmax + vmin) : diff / (2.0f - vmax - vmin);
             diff = 60.f / diff;
@@ -1550,6 +1562,8 @@ namespace cv { namespace gpu { namespace color
         HLS2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data, src.step,
             dst.data, dst.step, src.rows, src.cols, bidx);
+
+        cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
     }
@@ -130,6 +130,7 @@ namespace cv { namespace gpu { namespace mathfunc
             divUp(rows, threads.y));
 
         bitwiseUnOpKernel<opid><<<grid, threads>>>(rows, width, src, dst);
+        cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall(cudaThreadSynchronize());
@@ -161,6 +162,7 @@ namespace cv { namespace gpu { namespace mathfunc
         dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
 
         bitwiseUnOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src, mask, dst);
+        cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall(cudaThreadSynchronize());
@@ -251,6 +253,7 @@ namespace cv { namespace gpu { namespace mathfunc
         dim3 grid(divUp(width, threads.x * sizeof(uint)), divUp(rows, threads.y));
 
         bitwiseBinOpKernel<opid><<<grid, threads>>>(rows, width, src1, src2, dst);
+        cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall(cudaThreadSynchronize());
@@ -283,7 +286,8 @@ namespace cv { namespace gpu { namespace mathfunc
         dim3 threads(16, 16);
         dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
 
         bitwiseBinOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src1, src2, mask, dst);
+        cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall(cudaThreadSynchronize());
@@ -384,29 +388,71 @@ namespace cv { namespace gpu { namespace mathfunc
         }
     };
 
-    struct ScalarMinOp
+    template <typename T> struct ScalarMinOp
+    {
+        T s;
+
+        explicit ScalarMinOp(T s_) : s(s_) {}
+
+        __device__ T operator()(T a)
+        {
+            return min(a, s);
+        }
+    };
+    template <> struct ScalarMinOp<float>
+    {
+        float s;
+
+        explicit ScalarMinOp(float s_) : s(s_) {}
+
+        __device__ float operator()(float a)
+        {
+            return fmin(a, s);
+        }
+    };
+    template <> struct ScalarMinOp<double>
     {
         double s;
 
         explicit ScalarMinOp(double s_) : s(s_) {}
 
-        template <typename T>
-        __device__ T operator()(T a)
+        __device__ double operator()(double a)
         {
-            return saturate_cast<T>(fmin((double)a, s));
+            return fmin(a, s);
         }
     };
 
-    struct ScalarMaxOp
+    template <typename T> struct ScalarMaxOp
+    {
+        T s;
+
+        explicit ScalarMaxOp(T s_) : s(s_) {}
+
+        __device__ T operator()(T a)
+        {
+            return max(a, s);
+        }
+    };
+    template <> struct ScalarMaxOp<float>
+    {
+        float s;
+
+        explicit ScalarMaxOp(float s_) : s(s_) {}
+
+        __device__ float operator()(float a)
+        {
+            return fmax(a, s);
+        }
+    };
+    template <> struct ScalarMaxOp<double>
     {
         double s;
 
         explicit ScalarMaxOp(double s_) : s(s_) {}
 
-        template <typename T>
-        __device__ T operator()(T a)
+        __device__ double operator()(double a)
        {
-            return saturate_cast<T>(fmax((double)a, s));
+            return fmax(a, s);
        }
    };
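The rewrite above turns the scalar min/max functor from a single struct holding a `double` (with a `saturate_cast` round-trip on every element) into a template specialized per type: integral types call `min`/`max` directly, while the `float` and `double` specializations call `fmin`/`fmax`, avoiding the double-precision path that old cards either lack or emulate slowly. A minimal sketch of how such a functor is applied per element; `transform` in the real code is OpenCV's internal grid helper, so a simplified placeholder kernel is used here:

```cpp
#include <cuda_runtime.h>

// Simplified stand-in for OpenCV's internal transform() helper.
template <typename T, typename Op>
__global__ void transformStub(const T* src, T* dst, int n, Op op)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
        dst[idx] = op(src[idx]);   // per-element functor application
}

template <typename T> struct ScalarMinOpSketch
{
    T s;
    explicit ScalarMinOpSketch(T s_) : s(s_) {}
    __device__ T operator()(T a) const { return a < s ? a : s; }
};

void clampExample(const int* d_src, int* d_dst, int n)
{
    ScalarMinOpSketch<int> op(255);                      // clamp to 255
    transformStub<<<(n + 255) / 256, 256>>>(d_src, d_dst, n, op);
    cudaSafeCall( cudaGetLastError() );                  // macro from first sketch
}
```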
@@ -418,7 +464,7 @@ namespace cv { namespace gpu { namespace mathfunc
     }
 
     template void min_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
-    template void min_gpu<char  >(const DevMem2D_<char>& src1, const DevMem2D_<char>& src2, const DevMem2D_<char>& dst, cudaStream_t stream);
+    template void min_gpu<schar >(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
     template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
     template void min_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
     template void min_gpu<int   >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
@@ -433,7 +479,7 @@ namespace cv { namespace gpu { namespace mathfunc
     }
 
     template void max_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
-    template void max_gpu<char  >(const DevMem2D_<char>& src1, const DevMem2D_<char>& src2, const DevMem2D_<char>& dst, cudaStream_t stream);
+    template void max_gpu<schar >(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
     template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
     template void max_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
     template void max_gpu<int   >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
@@ -441,122 +487,145 @@ namespace cv { namespace gpu { namespace mathfunc
     template void max_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);
 
     template <typename T>
-    void min_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream)
+    void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)
     {
-        ScalarMinOp op(src2);
+        ScalarMinOp<T> op(src2);
         transform(src1, dst, op, stream);
     }
 
-    template void min_gpu<uchar >(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream);
-    template void min_gpu<char  >(const DevMem2D_<char>& src1, double src2, const DevMem2D_<char>& dst, cudaStream_t stream);
-    template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, double src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
-    template void min_gpu<short >(const DevMem2D_<short>& src1, double src2, const DevMem2D_<short>& dst, cudaStream_t stream);
-    template void min_gpu<int   >(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
-    template void min_gpu<float >(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
+    template void min_gpu<uchar >(const DevMem2D& src1, uchar src2, const DevMem2D& dst, cudaStream_t stream);
+    template void min_gpu<schar >(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
+    template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
+    template void min_gpu<short >(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);
+    template void min_gpu<int   >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);
+    template void min_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);
     template void min_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
 
     template <typename T>
-    void max_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream)
+    void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)
     {
-        ScalarMaxOp op(src2);
+        ScalarMaxOp<T> op(src2);
         transform(src1, dst, op, stream);
     }
 
-    template void max_gpu<uchar >(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream);
-    template void max_gpu<char  >(const DevMem2D_<char>& src1, double src2, const DevMem2D_<char>& dst, cudaStream_t stream);
-    template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, double src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
-    template void max_gpu<short >(const DevMem2D_<short>& src1, double src2, const DevMem2D_<short>& dst, cudaStream_t stream);
-    template void max_gpu<int   >(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
-    template void max_gpu<float >(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
+    template void max_gpu<uchar >(const DevMem2D& src1, uchar src2, const DevMem2D& dst, cudaStream_t stream);
+    template void max_gpu<schar >(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
+    template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
+    template void max_gpu<short >(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);
+    template void max_gpu<int   >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);
+    template void max_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);
     template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
 
//////////////////////////////////////////////////////////////////////////
// threshold

class ThreshOp
{
public:
    ThreshOp(float thresh_, float maxVal_) : thresh(thresh_), maxVal(maxVal_) {}

protected:
    float thresh;
    float maxVal;
};

class ThreshBinary : public ThreshOp
{
public:
    ThreshBinary(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {}

    template<typename T>
    __device__ T operator()(const T& src) const
    {
        return (float)src > thresh ? saturate_cast<T>(maxVal) : 0;
    }
};

class ThreshBinaryInv : public ThreshOp
{
public:
    ThreshBinaryInv(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {}

    template<typename T>
    __device__ T operator()(const T& src) const
    {
        return (float)src > thresh ? 0 : saturate_cast<T>(maxVal);
    }
};

class ThreshTrunc : public ThreshOp
{
public:
    ThreshTrunc(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {}

    template<typename T>
    __device__ T operator()(const T& src) const
    {
        return saturate_cast<T>(fmin((float)src, thresh));
    }
};

class ThreshToZero : public ThreshOp
{
public:
    ThreshToZero(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {}

    template<typename T>
    __device__ T operator()(const T& src) const
    {
        return (float)src > thresh ? src : 0;
    }
};

class ThreshToZeroInv : public ThreshOp
{
public:
    ThreshToZeroInv(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {}

    template<typename T>
    __device__ T operator()(const T& src) const
    {
        return (float)src > thresh ? 0 : src;
    }
};

template <typename T> struct ThreshBinary
{
    ThreshBinary(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}

    __device__ T operator()(const T& src) const
    {
        return src > thresh ? maxVal : 0;
    }

private:
    T thresh;
    T maxVal;
};

template <typename T> struct ThreshBinaryInv
{
    ThreshBinaryInv(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}

    __device__ T operator()(const T& src) const
    {
        return src > thresh ? 0 : maxVal;
    }

private:
    T thresh;
    T maxVal;
};

template <typename T> struct ThreshTrunc
{
    ThreshTrunc(T thresh_, T) : thresh(thresh_) {}

    __device__ T operator()(const T& src) const
    {
        return min(src, thresh);
    }

private:
    T thresh;
};
template <> struct ThreshTrunc<float>
{
    ThreshTrunc(float thresh_, float) : thresh(thresh_) {}

    __device__ float operator()(const float& src) const
    {
        return fmin(src, thresh);
    }

private:
    float thresh;
};
template <> struct ThreshTrunc<double>
{
    ThreshTrunc(double thresh_, double) : thresh(thresh_) {}

    __device__ double operator()(const double& src) const
    {
        return fmin(src, thresh);
    }

private:
    double thresh;
};

template <typename T> struct ThreshToZero
{
    ThreshToZero(T thresh_, T) : thresh(thresh_) {}

    __device__ T operator()(const T& src) const
    {
        return src > thresh ? src : 0;
    }

private:
    T thresh;
};

template <typename T> struct ThreshToZeroInv
{
    ThreshToZeroInv(T thresh_, T) : thresh(thresh_) {}

    __device__ T operator()(const T& src) const
    {
        return src > thresh ? 0 : src;
    }

private:
    T thresh;
};

template <class Op, typename T>
void threshold_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, float thresh, float maxVal,
template <template <typename> class Op, typename T>
void threshold_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal,
    cudaStream_t stream)
{
    Op op(thresh, maxVal);
    Op<T> op(thresh, maxVal);
    transform(src, dst, op, stream);
}

template <typename T>
void threshold_gpu(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type,
void threshold_gpu(const DevMem2D& src, const DevMem2D& dst, T thresh, T maxVal, int type,
    cudaStream_t stream)
{
    typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, float thresh, float maxVal,
    typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal,
        cudaStream_t stream);

    static const caller_t callers[] =
@ -571,10 +640,11 @@ namespace cv { namespace gpu { namespace mathfunc
    callers[type]((DevMem2D_<T>)src, (DevMem2D_<T>)dst, thresh, maxVal, stream);
}

template void threshold_gpu<uchar>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);
template void threshold_gpu<schar>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);
template void threshold_gpu<ushort>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);
template void threshold_gpu<short>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);
template void threshold_gpu<int>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);
template void threshold_gpu<uchar>(const DevMem2D& src, const DevMem2D& dst, uchar thresh, uchar maxVal, int type, cudaStream_t stream);
template void threshold_gpu<schar>(const DevMem2D& src, const DevMem2D& dst, schar thresh, schar maxVal, int type, cudaStream_t stream);
template void threshold_gpu<ushort>(const DevMem2D& src, const DevMem2D& dst, ushort thresh, ushort maxVal, int type, cudaStream_t stream);
template void threshold_gpu<short>(const DevMem2D& src, const DevMem2D& dst, short thresh, short maxVal, int type, cudaStream_t stream);
template void threshold_gpu<int>(const DevMem2D& src, const DevMem2D& dst, int thresh, int maxVal, int type, cudaStream_t stream);
template void threshold_gpu<float>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);
template void threshold_gpu<double>(const DevMem2D& src, const DevMem2D& dst, double thresh, double maxVal, int type, cudaStream_t stream);
}}}
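[Editor's note, not part of the commit: the hunk above elides the body of the
`callers` table. A minimal sketch of how the dispatch presumably looks, with one
entry per functor in OpenCV's THRESH_* order (an assumption, not shown in the diff):]

    // Sketch only: caller table indexed by the OpenCV threshold type.
    static const caller_t callers[] =
    {
        threshold_caller<ThreshBinary, T>,     // THRESH_BINARY
        threshold_caller<ThreshBinaryInv, T>,  // THRESH_BINARY_INV
        threshold_caller<ThreshTrunc, T>,      // THRESH_TRUNC
        threshold_caller<ThreshToZero, T>,     // THRESH_TOZERO
        threshold_caller<ThreshToZeroInv, T>   // THRESH_TOZERO_INV
    };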
@ -44,6 +44,7 @@
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vecmath.hpp"
#include "opencv2/gpu/device/limits_gpu.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"

#include "safe_call.hpp"
#include "internal_shared.hpp"
@ -51,192 +52,6 @@
using namespace cv::gpu;
using namespace cv::gpu::device;

namespace cv
{
    namespace gpu
    {
        namespace device
        {
            struct BrdReflect101
            {
                explicit BrdReflect101(int len): last(len - 1) {}

                __device__ int idx_low(int i) const
                {
                    return abs(i);
                }

                __device__ int idx_high(int i) const
                {
                    return last - abs(last - i);
                }

                __device__ int idx(int i) const
                {
                    return abs(idx_high(i));
                }

                bool is_range_safe(int mini, int maxi) const
                {
                    return -last <= mini && maxi <= 2 * last;
                }

                int last;
            };
            template <typename D>
            struct BrdRowReflect101: BrdReflect101
            {
                explicit BrdRowReflect101(int len): BrdReflect101(len) {}

                template <typename T>
                __device__ D at_low(int i, const T* data) const
                {
                    return saturate_cast<D>(data[idx_low(i)]);
                }

                template <typename T>
                __device__ D at_high(int i, const T* data) const
                {
                    return saturate_cast<D>(data[idx_high(i)]);
                }
            };
            template <typename D>
            struct BrdColReflect101: BrdReflect101
            {
                BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {}

                template <typename T>
                __device__ D at_low(int i, const T* data) const
                {
                    return saturate_cast<D>(data[idx_low(i) * step]);
                }

                template <typename T>
                __device__ D at_high(int i, const T* data) const
                {
                    return saturate_cast<D>(data[idx_high(i) * step]);
                }

                int step;
            };

            struct BrdReplicate
            {
                explicit BrdReplicate(int len): last(len - 1) {}

                __device__ int idx_low(int i) const
                {
                    return max(i, 0);
                }

                __device__ int idx_high(int i) const
                {
                    return min(i, last);
                }

                __device__ int idx(int i) const
                {
                    return max(min(i, last), 0);
                }

                bool is_range_safe(int mini, int maxi) const
                {
                    return true;
                }

                int last;
            };
            template <typename D>
            struct BrdRowReplicate: BrdReplicate
            {
                explicit BrdRowReplicate(int len): BrdReplicate(len) {}

                template <typename T>
                __device__ D at_low(int i, const T* data) const
                {
                    return saturate_cast<D>(data[idx_low(i)]);
                }

                template <typename T>
                __device__ D at_high(int i, const T* data) const
                {
                    return saturate_cast<D>(data[idx_high(i)]);
                }
            };
            template <typename D>
            struct BrdColReplicate: BrdReplicate
            {
                BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {}

                template <typename T>
                __device__ D at_low(int i, const T* data) const
                {
                    return saturate_cast<D>(data[idx_low(i) * step]);
                }

                template <typename T>
                __device__ D at_high(int i, const T* data) const
                {
                    return saturate_cast<D>(data[idx_high(i) * step]);
                }

                int step;
            };

            template <typename D>
            struct BrdRowConstant
            {
                explicit BrdRowConstant(int len_, const D& val_ = VecTraits<D>::all(0)): len(len_), val(val_) {}

                template <typename T>
                __device__ D at_low(int i, const T* data) const
                {
                    return i >= 0 ? saturate_cast<D>(data[i]) : val;
                }

                template <typename T>
                __device__ D at_high(int i, const T* data) const
                {
                    return i < len ? saturate_cast<D>(data[i]) : val;
                }

                bool is_range_safe(int mini, int maxi) const
                {
                    return true;
                }

                int len;
                D val;
            };
            template <typename D>
            struct BrdColConstant
            {
                BrdColConstant(int len_, int step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}

                template <typename T>
                __device__ D at_low(int i, const T* data) const
                {
                    return i >= 0 ? saturate_cast<D>(data[i * step]) : val;
                }

                template <typename T>
                __device__ D at_high(int i, const T* data) const
                {
                    return i < len ? saturate_cast<D>(data[i * step]) : val;
                }

                bool is_range_safe(int mini, int maxi) const
                {
                    return true;
                }

                int len;
                int step;
                D val;
            };
        }
    }
}

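[Editor's note: the block removed above now lives in the freshly included
opencv2/gpu/device/border_interpolate.hpp. For orientation, a small sketch of what
reflect-101 indexing does at the borders (values chosen for illustration only):]

    // BrdReflect101 with len == 5, so last == 4: row [a b c d e] is read as
    // ... c b | a b c d e | d c ...
    BrdReflect101 brd(5);
    int l = brd.idx_low(-2);   // abs(-2) == 2        -> 'c'
    int h = brd.idx_high(6);   // 4 - abs(4 - 6) == 2 -> 'c'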
/////////////////////////////////////////////////////////////////////////////////////////////////
// Linear filters

@ -329,6 +144,7 @@ namespace cv { namespace gpu { namespace filters
    }

    filter_krnls::linearRowFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaThreadSynchronize() );
}
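[Editor's note: the check added after this launch is the commit's recurring idiom
("added assertion after all kernels calls" in the commit message). Schematically,
with some_kernel and args as placeholders:]

    some_kernel<<<grid, threads>>>(args);
    cudaSafeCall( cudaGetLastError() );       // catches launch failures (bad config, old-GPU limits)
    cudaSafeCall( cudaThreadSynchronize() );  // surfaces execution faults before results are read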
@ -467,6 +283,7 @@ namespace cv { namespace gpu { namespace filters
    }

    filter_krnls::linearColumnFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaThreadSynchronize() );
}
@ -705,14 +522,18 @@ namespace cv { namespace gpu { namespace bf
    for (int i = 0; i < iters; ++i)
    {
        bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
        cudaSafeCall( cudaGetLastError() );
        bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
        cudaSafeCall( cudaGetLastError() );
    }
    break;
case 3:
    for (int i = 0; i < iters; ++i)
    {
        bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
        cudaSafeCall( cudaGetLastError() );
        bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
        cudaSafeCall( cudaGetLastError() );
    }
    break;
default:
@ -222,6 +222,7 @@ void compute_hists(int nbins, int block_stride_x, int block_stride_y,
    int smem = hists_size + final_hists_size;
    compute_hists_kernel_many_blocks<nblocks><<<grid, threads, smem>>>(
        img_block_width, grad, qangle, scale, block_hists);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}
@ -325,6 +326,8 @@ void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
    else
        cv::gpu::error("normalize_hists: histogram's size is too big, try to decrease number of bins", __FILE__, __LINE__);

    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -421,6 +424,8 @@ void classify_hists(int win_height, int win_width, int block_stride_y, int block
    classify_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
        img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
        block_hists, coefs, free_coef, threshold, labels);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -467,6 +472,8 @@ void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, i
        block_stride_x;
    extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(
        img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -515,6 +522,8 @@ void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, i
        block_stride_x;
    extract_descrs_by_cols_kernel<nthreads><<<grid, threads>>>(
        img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -640,6 +649,8 @@ void compute_gradients_8UC4(int nbins, int height, int width, const DevMem2D& im
    compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(
        height, width, img, angle_scale, grad, qangle);

    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -713,6 +724,8 @@ void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2D& im
    compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(
        height, width, img, angle_scale, grad, qangle);

    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -749,6 +762,8 @@ void resize_8UC4(const DevMem2D& src, DevMem2D dst)
    float sx = (float)src.cols / dst.cols;
    float sy = (float)src.rows / dst.rows;
    resize_8UC4_kernel<<<grid, threads>>>(sx, sy, dst);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());

    cudaSafeCall(cudaUnbindTexture(resize8UC4_tex));
@ -776,6 +791,8 @@ void resize_8UC1(const DevMem2D& src, DevMem2D dst)
    float sx = (float)src.cols / dst.cols;
    float sy = (float)src.rows / dst.rows;
    resize_8UC1_kernel<<<grid, threads>>>(sx, sy, dst);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());

    cudaSafeCall(cudaUnbindTexture(resize8UC1_tex));

@ -137,6 +137,7 @@ namespace cv { namespace gpu { namespace imgproc
    cudaSafeCall( cudaBindTexture2D(0, tex_remap, src.data, desc, src.cols, src.rows, src.step) );

    remap_1c<<<grid, threads>>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaThreadSynchronize() );
    cudaSafeCall( cudaUnbindTexture(tex_remap) );
@ -150,6 +151,7 @@ namespace cv { namespace gpu { namespace imgproc
    grid.y = divUp(dst.rows, threads.y);

    remap_3c<<<grid, threads>>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaThreadSynchronize() );
}
@ -259,6 +261,8 @@ namespace cv { namespace gpu { namespace imgproc
    cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );

    meanshift_kernel<<< grid, threads >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaThreadSynchronize() );
    cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
@ -273,6 +277,8 @@ namespace cv { namespace gpu { namespace imgproc
    cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );

    meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaThreadSynchronize() );
    cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
@ -388,6 +394,7 @@ namespace cv { namespace gpu { namespace imgproc
    grid.y = divUp(src.rows, threads.y);

    drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaThreadSynchronize() );
@ -401,6 +408,7 @@ namespace cv { namespace gpu { namespace imgproc
    grid.y = divUp(src.rows, threads.y);

    drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaThreadSynchronize() );
@ -451,6 +459,7 @@ namespace cv { namespace gpu { namespace imgproc
    cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );

    reprojectImageTo3D<<<grid, threads, 0, stream>>>(disp.data, disp.step / sizeof(T), xyzw.data, xyzw.step / sizeof(float), disp.rows, disp.cols);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaThreadSynchronize() );
@ -491,6 +500,8 @@ namespace cv { namespace gpu { namespace imgproc
    dim3 grid(divUp(Dx.cols, threads.x), divUp(Dx.rows, threads.y));

    extractCovData_kernel<<<grid, threads>>>(Dx.cols, Dx.rows, Dx, Dy, dst);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -598,6 +609,8 @@ namespace cv { namespace gpu { namespace imgproc
        break;
    }

    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall(cudaUnbindTexture(harrisDxTex));
    cudaSafeCall(cudaUnbindTexture(harrisDyTex));
@ -712,6 +725,8 @@ namespace cv { namespace gpu { namespace imgproc
        break;
    }

    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));
    cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));
@ -746,6 +761,8 @@ namespace cv { namespace gpu { namespace imgproc
    dim3 grid(divUp(src.cols, threads.x));

    column_sumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -772,6 +789,8 @@ namespace cv { namespace gpu { namespace imgproc
    dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));

    mulSpectrumsKernel<<<grid, threads>>>(a, b, c);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -799,6 +818,8 @@ namespace cv { namespace gpu { namespace imgproc
    dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));

    mulSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, c);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -827,6 +848,8 @@ namespace cv { namespace gpu { namespace imgproc
    dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));

    mulAndScaleSpectrumsKernel<<<grid, threads>>>(a, b, scale, c);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -855,6 +878,8 @@ namespace cv { namespace gpu { namespace imgproc
    dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));

    mulAndScaleSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, scale, c);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -132,6 +132,8 @@ void matchTemplateNaive_CCORR_32F(const DevMem2D image, const DevMem2D templ,
            templ.cols, templ.rows, image, templ, result);
        break;
    }
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -161,6 +163,8 @@ void matchTemplateNaive_CCORR_8U(const DevMem2D image, const DevMem2D templ,
            templ.cols, templ.rows, image, templ, result);
        break;
    }
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -222,6 +226,8 @@ void matchTemplateNaive_SQDIFF_32F(const DevMem2D image, const DevMem2D templ,
            templ.cols, templ.rows, image, templ, result);
        break;
    }
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -251,6 +257,8 @@ void matchTemplateNaive_SQDIFF_8U(const DevMem2D image, const DevMem2D templ,
            templ.cols, templ.rows, image, templ, result);
        break;
    }
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -299,6 +307,8 @@ void matchTemplatePrepared_SQDIFF_8U(
            w, h, image_sqsum, templ_sqsum, result);
        break;
    }
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -348,6 +358,8 @@ void matchTemplatePrepared_SQDIFF_NORMED_8U(
            w, h, image_sqsum, templ_sqsum, result);
        break;
    }
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -378,6 +390,8 @@ void matchTemplatePrepared_CCOFF_8U(
    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
    matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads>>>(
        w, h, (float)templ_sum / (w * h), image_sum, result);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -418,6 +432,8 @@ void matchTemplatePrepared_CCOFF_8UC2(
    matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads>>>(
        w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
        image_sum_r, image_sum_g, result);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -472,6 +488,8 @@ void matchTemplatePrepared_CCOFF_8UC3(
        (float)templ_sum_g / (w * h),
        (float)templ_sum_b / (w * h),
        image_sum_r, image_sum_g, image_sum_b, result);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -536,6 +554,8 @@ void matchTemplatePrepared_CCOFF_8UC4(
        (float)templ_sum_a / (w * h),
        image_sum_r, image_sum_g, image_sum_b, image_sum_a,
        result);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -580,6 +600,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8U(
    matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads>>>(
        w, h, weight, templ_sum_scale, templ_sqsum_scale,
        image_sum, image_sqsum, result);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -641,6 +663,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2(
        image_sum_r, image_sqsum_r,
        image_sum_g, image_sqsum_g,
        result);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -716,6 +740,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3(
        image_sum_g, image_sqsum_g,
        image_sum_b, image_sqsum_b,
        result);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -805,6 +831,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4(
        image_sum_b, image_sqsum_b,
        image_sum_a, image_sqsum_a,
        result);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -847,6 +875,8 @@ void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
        normalizeKernel_8U<4><<<grid, threads>>>(w, h, image_sqsum, templ_sqsum, result);
        break;
    }
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -887,6 +917,8 @@ void extractFirstChannel_32F(const DevMem2D image, DevMem2Df result, int cn)
        extractFirstChannel_32F<4><<<grid, threads>>>(image, result);
        break;
    }
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

@ -150,6 +150,7 @@ namespace cv { namespace gpu { namespace mathfunc
    cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
        x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
        mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaThreadSynchronize() );
@ -198,6 +199,7 @@ namespace cv { namespace gpu { namespace mathfunc

    polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
        angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaThreadSynchronize() );

@ -1,253 +1,272 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "internal_shared.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/transform.hpp"

using namespace cv::gpu::device;

namespace cv { namespace gpu { namespace matrix_operations {

template <typename T> struct shift_and_sizeof;
template <> struct shift_and_sizeof<char> { enum { shift = 0 }; };
template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };
template <> struct shift_and_sizeof<short> { enum { shift = 1 }; };
template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };
template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };
template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };
template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };

///////////////////////////////////////////////////////////////////////////
////////////////////////////////// CopyTo /////////////////////////////////
///////////////////////////////////////////////////////////////////////////

template<typename T>
__global__ void copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, int step_mat, int step_mask, int channels)
{
    size_t x = blockIdx.x * blockDim.x + threadIdx.x;
    size_t y = blockIdx.y * blockDim.y + threadIdx.y;

    if ((x < cols * channels ) && (y < rows))
        if (mask[y * step_mask + x / channels] != 0)
        {
            size_t idx = y * ( step_mat >> shift_and_sizeof<T>::shift ) + x;
            mat_dst[idx] = mat_src[idx];
        }
}
typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream);

template<typename T>
void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
    dim3 threadsPerBlock(16,16, 1);
    dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);

    copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
            ((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);

    if (stream == 0)
        cudaSafeCall ( cudaThreadSynchronize() );
}

void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
    static CopyToFunc tab[8] =
    {
        copy_to_with_mask_run<unsigned char>,
        copy_to_with_mask_run<char>,
        copy_to_with_mask_run<unsigned short>,
        copy_to_with_mask_run<short>,
        copy_to_with_mask_run<int>,
        copy_to_with_mask_run<float>,
        copy_to_with_mask_run<double>,
        0
    };

    CopyToFunc func = tab[depth];

    if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);

    func(mat_src, mat_dst, mask, channels, stream);
}

///////////////////////////////////////////////////////////////////////////
////////////////////////////////// SetTo //////////////////////////////////
///////////////////////////////////////////////////////////////////////////

__constant__ double scalar_d[4];

template<typename T>
__global__ void set_to_without_mask(T * mat, int cols, int rows, int step, int channels)
{
    size_t x = blockIdx.x * blockDim.x + threadIdx.x;
    size_t y = blockIdx.y * blockDim.y + threadIdx.y;

    if ((x < cols * channels ) && (y < rows))
    {
        size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
        mat[idx] = scalar_d[ x % channels ];
    }
}

template<typename T>
__global__ void set_to_with_mask(T * mat, const unsigned char * mask, int cols, int rows, int step, int channels, int step_mask)
{
    size_t x = blockIdx.x * blockDim.x + threadIdx.x;
    size_t y = blockIdx.y * blockDim.y + threadIdx.y;

    if ((x < cols * channels ) && (y < rows))
        if (mask[y * step_mask + x / channels] != 0)
        {
            size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
            mat[idx] = scalar_d[ x % channels ];
        }
}
typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream);
typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels, const cudaStream_t & stream);

template <typename T>
void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
    dim3 threadsPerBlock(32, 8, 1);
    dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);

    set_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.data, (unsigned char *)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);
    if (stream == 0)
        cudaSafeCall ( cudaThreadSynchronize() );
}

template <typename T>
void set_to_without_mask_run(const DevMem2D& mat, int channels, const cudaStream_t & stream)
{
    dim3 threadsPerBlock(32, 8, 1);
    dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);

    set_to_without_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);

    if (stream == 0)
        cudaSafeCall ( cudaThreadSynchronize() );
}

void set_to_without_mask(DevMem2D mat, int depth, const double *scalar, int channels, const cudaStream_t & stream)
{
    cudaSafeCall( cudaMemcpyToSymbol(scalar_d, scalar, sizeof(double) * 4));

    static SetToFunc_without_mask tab[8] =
    {
        set_to_without_mask_run<unsigned char>,
        set_to_without_mask_run<char>,
        set_to_without_mask_run<unsigned short>,
        set_to_without_mask_run<short>,
        set_to_without_mask_run<int>,
        set_to_without_mask_run<float>,
        set_to_without_mask_run<double>,
        0
    };

    SetToFunc_without_mask func = tab[depth];

    if (func == 0)
        cv::gpu::error("Unsupported setTo operation", __FILE__, __LINE__);

    func(mat, channels, stream);
}

void set_to_with_mask(DevMem2D mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
    cudaSafeCall( cudaMemcpyToSymbol(scalar_d, scalar, sizeof(double) * 4));

    static SetToFunc_with_mask tab[8] =
    {
        set_to_with_mask_run<unsigned char>,
        set_to_with_mask_run<char>,
        set_to_with_mask_run<unsigned short>,
        set_to_with_mask_run<short>,
        set_to_with_mask_run<int>,
        set_to_with_mask_run<float>,
        set_to_with_mask_run<double>,
        0
    };

    SetToFunc_with_mask func = tab[depth];

    if (func == 0)
        cv::gpu::error("Unsupported setTo operation", __FILE__, __LINE__);

    func(mat, mask, channels, stream);
}

///////////////////////////////////////////////////////////////////////////
//////////////////////////////// ConvertTo ////////////////////////////////
///////////////////////////////////////////////////////////////////////////

template <typename T, typename D>
class Convertor
{
public:
    Convertor(double alpha_, double beta_): alpha(alpha_), beta(beta_) {}

    __device__ D operator()(const T& src)
    {
        return saturate_cast<D>(alpha * src + beta);
    }

private:
    double alpha, beta;
};

template<typename T, typename D>
void cvt_(const DevMem2D& src, const DevMem2D& dst, double alpha, double beta, cudaStream_t stream)
{
    Convertor<T, D> op(alpha, beta);
    transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream);
}

void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta,
                 cudaStream_t stream = 0)
{
    typedef void (*caller_t)(const DevMem2D& src, const DevMem2D& dst, double alpha, double beta,
                             cudaStream_t stream);

template <typename T> struct shift_and_sizeof;
template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };
template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };
template <> struct shift_and_sizeof<short> { enum { shift = 1 }; };
template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };
template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };
template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };
template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };

///////////////////////////////////////////////////////////////////////////
////////////////////////////////// CopyTo /////////////////////////////////
///////////////////////////////////////////////////////////////////////////

template<typename T>
__global__ void copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, int step_mat, int step_mask, int channels)
{
    size_t x = blockIdx.x * blockDim.x + threadIdx.x;
    size_t y = blockIdx.y * blockDim.y + threadIdx.y;

    if ((x < cols * channels ) && (y < rows))
        if (mask[y * step_mask + x / channels] != 0)
        {
            size_t idx = y * ( step_mat >> shift_and_sizeof<T>::shift ) + x;
            mat_dst[idx] = mat_src[idx];
        }
}
typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream);

template<typename T>
void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
    dim3 threadsPerBlock(16,16, 1);
    dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);

    copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
            ((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall ( cudaThreadSynchronize() );
}

void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
    static CopyToFunc tab[8] =
    {
        copy_to_with_mask_run<unsigned char>,
        copy_to_with_mask_run<signed char>,
        copy_to_with_mask_run<unsigned short>,
        copy_to_with_mask_run<short>,
        copy_to_with_mask_run<int>,
        copy_to_with_mask_run<float>,
        copy_to_with_mask_run<double>,
        0
    };

    CopyToFunc func = tab[depth];

    if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);

    func(mat_src, mat_dst, mask, channels, stream);
}

///////////////////////////////////////////////////////////////////////////
////////////////////////////////// SetTo //////////////////////////////////
///////////////////////////////////////////////////////////////////////////

__constant__ uchar scalar_8u[4];
__constant__ schar scalar_8s[4];
__constant__ ushort scalar_16u[4];
__constant__ short scalar_16s[4];
__constant__ int scalar_32s[4];
__constant__ float scalar_32f[4];
__constant__ double scalar_64f[4];

template <typename T> __device__ T readScalar(int i);
template <> __device__ uchar readScalar<uchar>(int i) {return scalar_8u[i];}
template <> __device__ schar readScalar<schar>(int i) {return scalar_8s[i];}
template <> __device__ ushort readScalar<ushort>(int i) {return scalar_16u[i];}
template <> __device__ short readScalar<short>(int i) {return scalar_16s[i];}
template <> __device__ int readScalar<int>(int i) {return scalar_32s[i];}
template <> __device__ float readScalar<float>(int i) {return scalar_32f[i];}
template <> __device__ double readScalar<double>(int i) {return scalar_64f[i];}

void writeScalar(const uchar* vals)
{
    cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(uchar) * 4) );
}
void writeScalar(const schar* vals)
{
    cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(schar) * 4) );
}
void writeScalar(const ushort* vals)
{
    cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(ushort) * 4) );
}
void writeScalar(const short* vals)
{
    cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(short) * 4) );
}
void writeScalar(const int* vals)
{
    cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(int) * 4) );
}
void writeScalar(const float* vals)
{
    cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(float) * 4) );
}
void writeScalar(const double* vals)
{
    cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(double) * 4) );
}

template<typename T>
__global__ void set_to_without_mask(T * mat, int cols, int rows, int step, int channels)
{
    size_t x = blockIdx.x * blockDim.x + threadIdx.x;
    size_t y = blockIdx.y * blockDim.y + threadIdx.y;

    if ((x < cols * channels ) && (y < rows))
    {
        size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
        mat[idx] = readScalar<T>(x % channels);
    }
}

template<typename T>
__global__ void set_to_with_mask(T * mat, const unsigned char * mask, int cols, int rows, int step, int channels, int step_mask)
{
    size_t x = blockIdx.x * blockDim.x + threadIdx.x;
    size_t y = blockIdx.y * blockDim.y + threadIdx.y;

    if ((x < cols * channels ) && (y < rows))
        if (mask[y * step_mask + x / channels] != 0)
        {
            size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
            mat[idx] = readScalar<T>(x % channels);
        }
}
template <typename T>
void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream)
{
    writeScalar(scalar);

    dim3 threadsPerBlock(32, 8, 1);
    dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);

    set_to_with_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, (uchar*)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall ( cudaThreadSynchronize() );
}

template void set_to_gpu<uchar >(const DevMem2D& mat, const uchar* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
template void set_to_gpu<schar >(const DevMem2D& mat, const schar* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
template void set_to_gpu<ushort>(const DevMem2D& mat, const ushort* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
template void set_to_gpu<short >(const DevMem2D& mat, const short* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
template void set_to_gpu<int   >(const DevMem2D& mat, const int* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
template void set_to_gpu<float >(const DevMem2D& mat, const float* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
template void set_to_gpu<double>(const DevMem2D& mat, const double* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);

template <typename T>
void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream)
{
    writeScalar(scalar);

    dim3 threadsPerBlock(32, 8, 1);
    dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);

    set_to_without_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall ( cudaThreadSynchronize() );
}

template void set_to_gpu<uchar >(const DevMem2D& mat, const uchar* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<schar >(const DevMem2D& mat, const schar* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<ushort>(const DevMem2D& mat, const ushort* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<short >(const DevMem2D& mat, const short* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<int   >(const DevMem2D& mat, const int* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<float >(const DevMem2D& mat, const float* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<double>(const DevMem2D& mat, const double* scalar, int channels, cudaStream_t stream);

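[Editor's note: the single __constant__ double scalar_d[4] is replaced by one
constant array per depth, presumably because old cards without native double
support could not reliably read the scalar as double and convert per element.
The scalar is now converted on the host and uploaded in the matrix's own type.
A hedged sketch of the host-side call, with hypothetical values:]

    // The Scalar is saturated to the target depth on the CPU first, so the
    // device never reads a double on hardware that cannot represent one.
    uchar vals[4] = { 255, 0, 0, 0 };
    set_to_gpu<uchar>(mat, vals, channels, stream);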
///////////////////////////////////////////////////////////////////////////
//////////////////////////////// ConvertTo ////////////////////////////////
///////////////////////////////////////////////////////////////////////////

template <typename T, typename D>
class Convertor
{
public:
    Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}

    __device__ D operator()(const T& src)
    {
        return saturate_cast<D>(alpha * src + beta);
    }

private:
    double alpha, beta;
};

template<typename T, typename D>
void cvt_(const DevMem2D& src, const DevMem2D& dst, double alpha, double beta, cudaStream_t stream)
{
    cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
    cudaSafeCall( cudaSetDoubleForDevice(&beta) );
    Convertor<T, D> op(alpha, beta);
    transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream);
}

void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta,
                 cudaStream_t stream = 0)
{
    typedef void (*caller_t)(const DevMem2D& src, const DevMem2D& dst, double alpha, double beta,
                             cudaStream_t stream);

    static const caller_t tab[8][8] =
    {
        {cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
@ -272,12 +291,12 @@ namespace cv { namespace gpu { namespace matrix_operations {
        cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},

        {0,0,0,0,0,0,0,0}
    };

    caller_t func = tab[sdepth][ddepth];
    if (!func)
        cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);

    func(src, dst, alpha, beta, stream);
}
}}}
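[Editor's note: cudaSetDoubleForDevice is the CUDA runtime helper that rewrites a
host double into a device-representable value, demoting it to float precision on
GPUs without native double support; on newer hardware it leaves the value as-is.
A sketch of the effect:]

    double alpha = 1.0 / 3.0;
    cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
    // On a pre-double GPU alpha now holds (double)(float)(1.0 / 3.0),
    // which is what the Convertor functor actually computes with.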
@ -273,6 +273,8 @@ namespace cv { namespace gpu { namespace mathfunc
    T* maxval_buf = (T*)buf.ptr(1);

    minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());

    T minval_, maxval_;
@ -302,6 +304,8 @@ namespace cv { namespace gpu { namespace mathfunc
    T* maxval_buf = (T*)buf.ptr(1);

    minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());

    T minval_, maxval_;
@ -355,7 +359,10 @@ namespace cv { namespace gpu { namespace mathfunc
    T* maxval_buf = (T*)buf.ptr(1);

    minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
    cudaSafeCall( cudaGetLastError() );
    minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());

    T minval_, maxval_;
@ -384,7 +391,10 @@ namespace cv { namespace gpu { namespace mathfunc
    T* maxval_buf = (T*)buf.ptr(1);

    minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
    cudaSafeCall( cudaGetLastError() );
    minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());

    T minval_, maxval_;
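[Editor's note: the added minMaxPass2Kernel makes this code path explicitly
two-pass, apparently for the older devices this commit targets: pass one leaves
one partial result per block in minval_buf / maxval_buf, and a single-block second
pass folds them. Its host-side equivalent, for illustration only:]

    // What the second pass computes, written on the host (sketch):
    T minval = minval_buf[0], maxval = maxval_buf[0];
    for (int i = 1; i < grid.x * grid.y; ++i)
    {
        minval = std::min(minval, minval_buf[i]);
        maxval = std::max(maxval, maxval_buf[i]);
    }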
@ -597,6 +607,8 @@ namespace cv { namespace gpu { namespace mathfunc

    minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf,
                                                       minloc_buf, maxloc_buf);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());

    T minval_, maxval_;
@ -636,6 +648,8 @@ namespace cv { namespace gpu { namespace mathfunc

    minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf,
                                                         minloc_buf, maxloc_buf);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());

    T minval_, maxval_;
@ -706,7 +720,10 @@ namespace cv { namespace gpu { namespace mathfunc

    minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf,
                                                       minloc_buf, maxloc_buf);
    cudaSafeCall( cudaGetLastError() );
    minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());

    T minval_, maxval_;
@ -745,7 +762,10 @@ namespace cv { namespace gpu { namespace mathfunc

    minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf,
                                                         minloc_buf, maxloc_buf);
    cudaSafeCall( cudaGetLastError() );
    minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());

    T minval_, maxval_;
@ -873,6 +893,8 @@ namespace cv { namespace gpu { namespace mathfunc
    uint* count_buf = (uint*)buf.ptr(0);

    countNonZeroKernel<256, T><<<grid, threads>>>(src, count_buf);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());

    uint count;
@ -916,7 +938,10 @@ namespace cv { namespace gpu { namespace mathfunc
    uint* count_buf = (uint*)buf.ptr(0);

    countNonZeroKernel<256, T><<<grid, threads>>>(src, count_buf);
    cudaSafeCall( cudaGetLastError() );
    countNonZeroPass2Kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());

    uint count;
@ -1430,26 +1455,42 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
case 1:
|
||||
sumKernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
|
||||
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
|
||||
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
break;
|
||||
case 2:
|
||||
sumKernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
|
||||
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
|
||||
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
break;
|
||||
case 3:
|
||||
sumKernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
|
||||
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
|
||||
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
break;
|
||||
case 4:
|
||||
sumKernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
|
||||
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
|
||||
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
break;
|
||||
}
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
@ -1500,6 +1541,8 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
|
||||
break;
|
||||
}
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
|
||||
R result[4] = {0, 0, 0, 0};
|
||||
@ -1534,26 +1577,42 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
case 1:
|
||||
sumKernel<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
|
||||
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
|
||||
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
break;
|
||||
case 2:
|
||||
sumKernel_C2<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
|
||||
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
|
||||
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
break;
|
||||
case 3:
|
||||
sumKernel_C3<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
|
||||
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
|
||||
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
break;
|
||||
case 4:
|
||||
sumKernel_C4<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
|
||||
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
|
||||
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
break;
|
||||
}
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
@ -1604,6 +1663,8 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
|
||||
break;
|
||||
}
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
|
||||
R result[4] = {0, 0, 0, 0};
|
||||
@ -1638,26 +1699,42 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
case 1:
|
||||
sumKernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
|
||||
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
|
||||
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
break;
|
||||
case 2:
|
||||
sumKernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
|
||||
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
|
||||
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
break;
|
||||
case 3:
|
||||
sumKernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
|
||||
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
|
||||
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
break;
|
||||
case 4:
|
||||
sumKernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
|
||||
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
|
||||
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
break;
|
||||
}
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
@ -1708,6 +1785,8 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
|
||||
break;
|
||||
}
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
|
||||
R result[4] = {0, 0, 0, 0};
|
||||
|
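The sum launchers above all follow the same two-pass scheme: a first kernel reduces each block into a per-block slot of buf, then a single-block second pass folds the grid.x * grid.y partial results. A minimal standalone CUDA sketch of that pattern (hypothetical kernel names, not from this commit):

template <int BLOCK_SIZE>
__global__ void partialSums(const float* data, int n, float* partial)
{
    __shared__ float smem[BLOCK_SIZE];
    int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    smem[threadIdx.x] = idx < n ? data[idx] : 0.f;
    __syncthreads();

    // tree reduction within the block (BLOCK_SIZE must be a power of two)
    for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1)
    {
        if (threadIdx.x < s)
            smem[threadIdx.x] += smem[threadIdx.x + s];
        __syncthreads();
    }

    if (threadIdx.x == 0)
        partial[blockIdx.x] = smem[0]; // one partial sum per block
}
// A second launch with a single block then reduces partial[0..gridDim.x),
// mirroring the sumKernel / sumPass2Kernel split, with cudaGetLastError()
// checked after each launch as this commit does.
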
@ -233,6 +233,8 @@ namespace cv { namespace gpu { namespace split_merge {
    src[0].data, src[0].step,
    src[1].data, src[1].step,
    dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall(cudaThreadSynchronize());
}
@ -248,6 +250,8 @@ namespace cv { namespace gpu { namespace split_merge {
    src[1].data, src[1].step,
    src[2].data, src[2].step,
    dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall(cudaThreadSynchronize());
}
@ -264,6 +268,8 @@ namespace cv { namespace gpu { namespace split_merge {
    src[2].data, src[2].step,
    src[3].data, src[3].step,
    dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall(cudaThreadSynchronize());
}
@ -436,6 +442,8 @@ namespace cv { namespace gpu { namespace split_merge {
    src.data, src.step, src.rows, src.cols,
    dst[0].data, dst[0].step,
    dst[1].data, dst[1].step);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall(cudaThreadSynchronize());
}
@ -451,6 +459,8 @@ namespace cv { namespace gpu { namespace split_merge {
    dst[0].data, dst[0].step,
    dst[1].data, dst[1].step,
    dst[2].data, dst[2].step);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall(cudaThreadSynchronize());
}
@ -467,6 +477,8 @@ namespace cv { namespace gpu { namespace split_merge {
    dst[1].data, dst[1].step,
    dst[2].data, dst[2].step,
    dst[3].data, dst[3].step);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall(cudaThreadSynchronize());
}

@ -325,6 +325,8 @@ template<int RADIUS> void kernel_caller(const DevMem2D& left, const DevMem2D& ri
size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);

stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );
};
@ -402,6 +404,7 @@ extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output,
grid.y = divUp(input.rows, threads.y);

prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );
@ -526,6 +529,7 @@ extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float a

size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );
@ -172,6 +172,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(left.rows, threads.y);

comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );
@ -185,6 +186,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(left.rows, threads.y);

comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );
@ -199,6 +201,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(left.rows, threads.y);

comp_data<3, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );
@ -212,6 +215,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(left.rows, threads.y);

comp_data<3, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );
@ -226,6 +230,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(left.rows, threads.y);

comp_data<4, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );
@ -239,6 +244,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(left.rows, threads.y);

comp_data<4, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );
@ -278,6 +284,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(dst_rows, threads.y);

data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)src, (DevMem2D_<T>)dst);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );
@ -321,9 +328,13 @@ namespace cv { namespace gpu { namespace bp
int src_idx = (dst_idx + 1) & 1;

level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mus[src_idx], (DevMem2D_<T>)mus[dst_idx]);
cudaSafeCall( cudaGetLastError() );
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mds[src_idx], (DevMem2D_<T>)mds[dst_idx]);
cudaSafeCall( cudaGetLastError() );
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mls[src_idx], (DevMem2D_<T>)mls[dst_idx]);
cudaSafeCall( cudaGetLastError() );
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mrs[src_idx], (DevMem2D_<T>)mrs[dst_idx]);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );
@ -443,6 +454,7 @@ namespace cv { namespace gpu { namespace bp
for(int t = 0; t < iters; ++t)
{
    one_iteration<T><<<grid, threads, 0, stream>>>(t, (DevMem2D_<T>)u, (T*)d.data, (T*)l.data, (T*)r.data, (DevMem2D_<T>)data, cols, rows);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaThreadSynchronize() );
@ -505,6 +517,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(disp.rows, threads.y);

output<T><<<grid, threads, 0, stream>>>((DevMem2D_<T>)u, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );
@ -382,6 +382,8 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) );

init_data_cost_callers[level](rows, cols, h, w, level, ndisp, channels, stream);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );

@ -395,6 +397,9 @@ namespace cv { namespace gpu { namespace csbp
    get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);
else
    get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);

cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );
}
@ -578,6 +583,7 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step2, &msg_step2, sizeof(size_t)) );

callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );
@ -700,10 +706,11 @@ namespace cv { namespace gpu { namespace csbp
grid.y = divUp(h, threads.y);

init_message<<<grid, threads, 0, stream>>>(u_new, d_new, l_new, r_new,
        u_cur, d_cur, l_cur, r_cur,
        selected_disp_pyr_new, selected_disp_pyr_cur,
        data_cost_selected, data_cost,
        h, w, nr_plane, h2, w2, nr_plane2);
    u_cur, d_cur, l_cur, r_cur,
    selected_disp_pyr_new, selected_disp_pyr_cur,
    data_cost_selected, data_cost,
    h, w, nr_plane, h2, w2, nr_plane2);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );
@ -805,6 +812,7 @@ namespace cv { namespace gpu { namespace csbp
for(int t = 0; t < iters; ++t)
{
    compute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaThreadSynchronize() );
@ -873,7 +881,9 @@ namespace cv { namespace gpu { namespace csbp
grid.y = divUp(disp.rows, threads.y);

compute_disp<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected,
        disp.data, disp.step / disp.elemSize(), disp.cols, disp.rows, nr_plane);
    disp.data, disp.step / disp.elemSize(), disp.cols, disp.rows, nr_plane);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );
}

@ -238,6 +238,46 @@ namespace cv { namespace gpu { namespace surf
        hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;
    }
}

__global__ void fasthessian_old(PtrStepf hessianBuffer)
{
    // Determine the indices in the Hessian buffer
    int gridDim_y = gridDim.y / c_nIntervals;
    int blockIdx_y = blockIdx.y % gridDim_y;
    int blockIdx_z = blockIdx.y / gridDim_y;

    int hidx_x = threadIdx.x + blockIdx.x * blockDim.x;
    int hidx_y = threadIdx.y + blockIdx_y * blockDim.y;
    int hidx_z = blockIdx_z;

    float fscale = calcScale(hidx_z);

    // Compute the lookup location of the mask center
    float x = hidx_x * c_step + c_border;
    float y = hidx_y * c_step + c_border;

    // Scale the mask dimensions according to the scale
    if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)
    {
        float mask_width = c_mask_width * fscale;
        float mask_height = c_mask_height * fscale;

        // Compute the filter responses
        float Dyy = evalDyy(x, y, c_mask_height, mask_width, mask_height, fscale);
        float Dxx = evalDxx(x, y, c_mask_height, mask_width, mask_height, fscale);
        float Dxy = evalDxy(x, y, fscale);

        // Combine the responses and store the Laplacian sign
        float result = (Dxx * Dyy) - c_dxy_scale * (Dxy * Dxy);

        if (Dxx + Dyy > 0.f)
            setLastBit(result);
        else
            clearLastBit(result);

        hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;
    }
}

dim3 calcBlockSize(int nIntervals)
{
@ -263,6 +303,21 @@ namespace cv { namespace gpu { namespace surf
grid.y = divUp(y_size, threads.y);

fasthessian<<<grid, threads>>>(hessianBuffer);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaThreadSynchronize() );
}

void fasthessian_gpu_old(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threadsOld)
{
    dim3 threads(16, 16);

    dim3 grid;
    grid.x = divUp(x_size, threads.x);
    grid.y = divUp(y_size, threads.y) * threadsOld.z;

    fasthessian_old<<<grid, threads>>>(hessianBuffer);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaThreadSynchronize() );
}
@ -395,6 +450,8 @@ namespace cv { namespace gpu { namespace surf
    nonmaxonly<WithMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);
else
    nonmaxonly<WithOutMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);

cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaThreadSynchronize() );
}
@ -574,6 +631,7 @@ namespace cv { namespace gpu { namespace surf
DeviceReference<unsigned int> featureCounterWrapper(featureCounter);

fh_interp_extremum<<<grid, threads>>>(hessianBuffer, maxPosBuffer, featuresBuffer, featureCounterWrapper);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaThreadSynchronize() );
}
@ -715,6 +773,8 @@ namespace cv { namespace gpu { namespace surf
grid.x = nFeatures;

find_orientation<<<grid, threads>>>(features);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaThreadSynchronize() );
}

@ -987,17 +1047,255 @@ namespace cv { namespace gpu { namespace surf
if (descriptors.cols == 64)
{
    compute_descriptors64<<<dim3(nFeatures, 1, 1), dim3(25, 4, 4)>>>(descriptors, features);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaThreadSynchronize() );

    normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaThreadSynchronize() );
}
else
{
    compute_descriptors128<<<dim3(nFeatures, 1, 1), dim3(25, 4, 4)>>>(descriptors, features);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaThreadSynchronize() );

    normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaThreadSynchronize() );
}
}

__device__ void calc_dx_dy_old(float sdx[25], float sdy[25], const KeyPoint_GPU* features, int tid)
{
    // get the interest point parameters (x, y, scale, strength, theta)
    __shared__ float ipt[5];
    if (tid < 5)
    {
        ipt[tid] = ((float*)&features[blockIdx.x])[tid];
    }
    __syncthreads();

    float sin_theta, cos_theta;
    sincosf(ipt[SF_ANGLE], &sin_theta, &cos_theta);

    // Compute sampling points
    // since grids are 2D, need to compute xBlock and yBlock indices
    const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4
    const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)
    const int xIndex = xBlock * blockDim.x + threadIdx.x;
    const int yIndex = yBlock * blockDim.y + threadIdx.y;

    // Compute rotated sampling points
    // (clockwise rotation since we are rotating the lattice)
    // (subtract 9.5f to start sampling at the top left of the lattice, 0.5f is to space points out properly - there is no center pixel)
    const float sample_x = ipt[SF_X] + (cos_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
        + sin_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
    const float sample_y = ipt[SF_Y] + (-sin_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
        + cos_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);

    // gather integral image lookups for Haar wavelets at each point (some lookups are shared between dx and dy)
    // a b c
    // d   f
    // g h i
    const float a = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
    const float b = tex2D(sumTex, sample_x, sample_y - ipt[SF_SIZE]);
    const float c = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
    const float d = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y);
    const float f = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y);
    const float g = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);
    const float h = tex2D(sumTex, sample_x, sample_y + ipt[SF_SIZE]);
    const float i = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);

    // compute axis-aligned HaarX, HaarY
    // (could group the additions together into multiplications)
    const float gauss = c_3p3gauss1D[xIndex] * c_3p3gauss1D[yIndex]; // separable because independent (circular)
    const float aa_dx = gauss * (-(a-b-g+h) + (b-c-h+i)); // unrotated dx
    const float aa_dy = gauss * (-(a-c-d+f) + (d-f-g+i)); // unrotated dy

    // rotate responses (store all dxs then all dys)
    // - counterclockwise rotation to rotate back to zero orientation
    sdx[tid] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx
    sdy[tid] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy
}

__device__ void reduce_sum_old(float sdata[25], int tid)
{
    // first step is to reduce from 25 to 16
    if (tid < 9) // use 9 threads
        sdata[tid] += sdata[tid + 16];
    __syncthreads();

    // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
    if (tid < 16)
    {
        volatile float* smem = sdata;

        smem[tid] += smem[tid + 8];
        smem[tid] += smem[tid + 4];
        smem[tid] += smem[tid + 2];
        smem[tid] += smem[tid + 1];
    }
}

// Spawn 16 blocks per interest point
// - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location
__global__ void compute_descriptors64_old(PtrStepf descriptors, const KeyPoint_GPU* features)
{
    const int tid = threadIdx.y * blockDim.x + threadIdx.x;

    float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2);

    // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
    __shared__ float sdx[25];
    __shared__ float sdy[25];

    calc_dx_dy_old(sdx, sdy, features, tid);
    __syncthreads();

    __shared__ float sabs[25];

    sabs[tid] = fabs(sdx[tid]); // |dx| array
    __syncthreads();

    reduce_sum_old(sdx, tid);
    reduce_sum_old(sdy, tid);
    reduce_sum_old(sabs, tid);

    // write dx, dy, |dx|
    if (tid == 0)
    {
        descriptors_block[0] = sdx[0];
        descriptors_block[1] = sdy[0];
        descriptors_block[2] = sabs[0];
    }
    __syncthreads();

    sabs[tid] = fabs(sdy[tid]); // |dy| array
    __syncthreads();

    reduce_sum_old(sabs, tid);

    // write |dy|
    if (tid == 0)
    {
        descriptors_block[3] = sabs[0];
    }
}

// Spawn 16 blocks per interest point
// - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location
__global__ void compute_descriptors128_old(PtrStepf descriptors, const KeyPoint_GPU* features)
{
    float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);

    const int tid = threadIdx.y * blockDim.x + threadIdx.x;

    // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
    __shared__ float sdx[25];
    __shared__ float sdy[25];

    calc_dx_dy_old(sdx, sdy, features, tid);
    __syncthreads();

    // sum (reduce) 5x5 area response
    __shared__ float sd1[25];
    __shared__ float sd2[25];
    __shared__ float sdabs1[25];
    __shared__ float sdabs2[25];

    if (sdy[tid] >= 0)
    {
        sd1[tid] = sdx[tid];
        sdabs1[tid] = fabs(sdx[tid]);
        sd2[tid] = 0;
        sdabs2[tid] = 0;
    }
    else
    {
        sd1[tid] = 0;
        sdabs1[tid] = 0;
        sd2[tid] = sdx[tid];
        sdabs2[tid] = fabs(sdx[tid]);
    }
    __syncthreads();

    reduce_sum_old(sd1, tid);
    reduce_sum_old(sd2, tid);
    reduce_sum_old(sdabs1, tid);
    reduce_sum_old(sdabs2, tid);

    // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
    if (tid == 0)
    {
        descriptors_block[0] = sd1[0];
        descriptors_block[1] = sdabs1[0];
        descriptors_block[2] = sd2[0];
        descriptors_block[3] = sdabs2[0];
    }
    __syncthreads();

    if (sdx[tid] >= 0)
    {
        sd1[tid] = sdy[tid];
        sdabs1[tid] = fabs(sdy[tid]);
        sd2[tid] = 0;
        sdabs2[tid] = 0;
    }
    else
    {
        sd1[tid] = 0;
        sdabs1[tid] = 0;
        sd2[tid] = sdy[tid];
        sdabs2[tid] = fabs(sdy[tid]);
    }
    __syncthreads();

    reduce_sum_old(sd1, tid);
    reduce_sum_old(sd2, tid);
    reduce_sum_old(sdabs1, tid);
    reduce_sum_old(sdabs2, tid);

    // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
    if (tid == 0)
    {
        descriptors_block[4] = sd1[0];
        descriptors_block[5] = sdabs1[0];
        descriptors_block[6] = sd2[0];
        descriptors_block[7] = sdabs2[0];
    }
}

void compute_descriptors_gpu_old(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures)
{
    // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D

    if (descriptors.cols == 64)
    {
        compute_descriptors64_old<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaThreadSynchronize() );

        normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaThreadSynchronize() );
    }
    else
    {
        compute_descriptors128_old<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaThreadSynchronize() );

        normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaThreadSynchronize() );
    }
}

@ -61,8 +61,8 @@ void cv::gpu::Stream::enqueueDownload(const GpuMat& /*src*/, CudaMem& /*dst*/) {
void cv::gpu::Stream::enqueueUpload(const CudaMem& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueUpload(const Mat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueCopy(const GpuMat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueMemSet(const GpuMat& /*src*/, Scalar /*val*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueMemSet(const GpuMat& /*src*/, Scalar /*val*/, const GpuMat& /*mask*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/, const GpuMat& /*mask*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueConvert(const GpuMat& /*src*/, GpuMat& /*dst*/, int /*type*/, double /*a*/, double /*b*/) { throw_nogpu(); }

#else /* !defined (HAVE_CUDA) */
@ -77,8 +77,10 @@ namespace cv
{
void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);

void set_to_without_mask (DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0);
void set_to_with_mask (DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
template <typename T>
void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);

void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
}
@ -99,6 +101,20 @@ namespace
size_t bwidth = src.cols * src.elemSize();
cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, k, s) );
};

template <typename T>
void kernelSet(GpuMat& src, const Scalar& s, cudaStream_t stream)
{
    Scalar_<T> sf = s;
    matrix_operations::set_to_gpu(src, sf.val, src.channels(), stream);
}

template <typename T>
void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream)
{
    Scalar_<T> sf = s;
    matrix_operations::set_to_gpu(src, sf.val, mask, src.channels(), stream);
}
}

CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream) { return stream.impl->stream; };
@ -172,14 +188,26 @@ void cv::gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst){ devcopy(sr
void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyHostToDevice); }
void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToDevice); }

void cv::gpu::Stream::enqueueMemSet(const GpuMat& src, Scalar val)
void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val)
{
matrix_operations::set_to_without_mask(src, src.depth(), val.val, src.channels(), impl->stream);
typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, cudaStream_t stream);
static const set_caller_t set_callers[] =
{
    kernelSet<uchar>, kernelSet<schar>, kernelSet<ushort>, kernelSet<short>,
    kernelSet<int>, kernelSet<float>, kernelSet<double>
};
set_callers[src.depth()](src, val, impl->stream);
}

void cv::gpu::Stream::enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask)
void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
{
matrix_operations::set_to_with_mask(src, src.depth(), val.val, mask, src.channels(), impl->stream);
typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream);
static const set_caller_t set_callers[] =
{
    kernelSetMask<uchar>, kernelSetMask<schar>, kernelSetMask<ushort>, kernelSetMask<short>,
    kernelSetMask<int>, kernelSetMask<float>, kernelSetMask<double>
};
set_callers[src.depth()](src, val, mask, impl->stream);
}

void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype, double alpha, double beta)

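For reference, a minimal use of the reworked asynchronous setter; the matrix size and scalar below are illustrative, not from the commit:

cv::gpu::Stream stream;
cv::gpu::GpuMat img(480, 640, CV_8UC3);

// Depth CV_8U indexes the first entry of set_callers, i.e. kernelSet<uchar>;
// the set is queued on the stream and the call returns immediately.
stream.enqueueMemSet(img, cv::Scalar(0, 128, 255));
stream.waitForCompletion(); // block until the queued set finishes
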
@ -585,10 +585,10 @@ namespace cv { namespace gpu { namespace mathfunc
void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);

template <typename T>
void min_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream);
void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);

template <typename T>
void max_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream);
void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);
}}}

namespace
@ -605,7 +605,7 @@ namespace
void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
{
    dst.create(src1.size(), src1.type());
    mathfunc::min_gpu<T>(src1.reshape(1), src2, dst.reshape(1), stream);
    mathfunc::min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
}

template <typename T>
@ -620,7 +620,7 @@ namespace
void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
{
    dst.create(src1.size(), src1.type());
    mathfunc::max_gpu<T>(src1.reshape(1), src2, dst.reshape(1), stream);
    mathfunc::max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
}
}

@ -629,7 +629,7 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
    min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
    min_caller<uchar>, min_caller<schar>, min_caller<ushort>, min_caller<short>, min_caller<int>,
    min_caller<float>, min_caller<double>
};
funcs[src1.depth()](src1, src2, dst, 0);
@ -640,7 +640,7 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Str
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
    min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
    min_caller<uchar>, min_caller<schar>, min_caller<ushort>, min_caller<short>, min_caller<int>,
    min_caller<float>, min_caller<double>
};
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
@ -651,7 +651,7 @@ void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst)
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
    min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
    min_caller<uchar>, min_caller<schar>, min_caller<ushort>, min_caller<short>, min_caller<int>,
    min_caller<float>, min_caller<double>
};
funcs[src1.depth()](src1, src2, dst, 0);
@ -662,7 +662,7 @@ void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& st
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
    min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
    min_caller<uchar>, min_caller<schar>, min_caller<ushort>, min_caller<short>, min_caller<int>,
    min_caller<float>, min_caller<double>
};
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
@ -673,7 +673,7 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
    max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
    max_caller<uchar>, max_caller<schar>, max_caller<ushort>, max_caller<short>, max_caller<int>,
    max_caller<float>, max_caller<double>
};
funcs[src1.depth()](src1, src2, dst, 0);
@ -684,7 +684,7 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Str
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
    max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
    max_caller<uchar>, max_caller<schar>, max_caller<ushort>, max_caller<short>, max_caller<int>,
    max_caller<float>, max_caller<double>
};
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
@ -695,7 +695,7 @@ void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst)
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
    max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
    max_caller<uchar>, max_caller<schar>, max_caller<ushort>, max_caller<short>, max_caller<int>,
    max_caller<float>, max_caller<double>
};
funcs[src1.depth()](src1, src2, dst, 0);
@ -706,7 +706,7 @@ void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& st
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
    max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
    max_caller<uchar>, max_caller<schar>, max_caller<ushort>, max_caller<short>, max_caller<int>,
    max_caller<float>, max_caller<double>
};
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
@ -718,38 +718,17 @@ void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& st
namespace cv { namespace gpu { namespace mathfunc
{
template <typename T>
void threshold_gpu(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type,
void threshold_gpu(const DevMem2D& src, const DevMem2D& dst, T thresh, T maxVal, int type,
    cudaStream_t stream);
}}}

namespace
{
template <typename T>
void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type,
    cudaStream_t stream = 0)
    cudaStream_t stream)
{
    using namespace cv::gpu::mathfunc;

    typedef void (*caller_t)(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type,
        cudaStream_t stream);

    static const caller_t callers[] =
    {
        threshold_gpu<unsigned char>, threshold_gpu<signed char>,
        threshold_gpu<unsigned short>, threshold_gpu<short>, threshold_gpu<int>, threshold_gpu<float>, 0
    };

    CV_Assert(src.channels() == 1 && src.depth() < CV_64F);
    CV_Assert(type <= THRESH_TOZERO_INV);

    dst.create(src.size(), src.type());

    if (src.depth() != CV_32F)
    {
        thresh = cvFloor(thresh);
        maxVal = cvRound(maxVal);
    }

    callers[src.depth()](src, dst, static_cast<float>(thresh), static_cast<float>(maxVal), type, stream);
    mathfunc::threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);
}
}

@ -770,7 +749,28 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
}
else
{
    threshold_caller(src, dst, thresh, maxVal, type);
    typedef void (*caller_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type,
        cudaStream_t stream);

    static const caller_t callers[] =
    {
        threshold_caller<unsigned char>, threshold_caller<signed char>,
        threshold_caller<unsigned short>, threshold_caller<short>,
        threshold_caller<int>, threshold_caller<float>, threshold_caller<double>
    };

    CV_Assert(src.channels() == 1 && src.depth() <= CV_64F);
    CV_Assert(type <= THRESH_TOZERO_INV);

    dst.create(src.size(), src.type());

    if (src.depth() != CV_32F)
    {
        thresh = cvFloor(thresh);
        maxVal = cvRound(maxVal);
    }

    callers[src.depth()](src, dst, thresh, maxVal, type, 0);
}

return thresh;
@ -778,7 +778,28 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double

double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, const Stream& stream)
{
threshold_caller(src, dst, thresh, maxVal, type, StreamAccessor::getStream(stream));
typedef void (*caller_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type,
    cudaStream_t stream);

static const caller_t callers[] =
{
    threshold_caller<unsigned char>, threshold_caller<signed char>,
    threshold_caller<unsigned short>, threshold_caller<short>,
    threshold_caller<int>, threshold_caller<float>, threshold_caller<double>
};

CV_Assert(src.channels() == 1 && src.depth() <= CV_64F);
CV_Assert(type <= THRESH_TOZERO_INV);

dst.create(src.size(), src.type());

if (src.depth() != CV_32F)
{
    thresh = cvFloor(thresh);
    maxVal = cvRound(maxVal);
}

callers[src.depth()](src, dst, thresh, maxVal, type, StreamAccessor::getStream(stream));
return thresh;
}


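A short call into the rewritten threshold path (values are illustrative); non-OTSU types now dispatch through the per-depth threshold_caller table, flooring thresh and rounding maxVal first for integer depths:

cv::gpu::GpuMat src(256, 256, CV_32FC1), dst;
double t = cv::gpu::threshold(src, dst, 0.5, 1.0, cv::THRESH_BINARY);
// t == 0.5 here: the function returns the threshold it actually applied
// (for integer depths that would be the floored value).
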
@ -128,6 +128,8 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp

void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria)
{
    CV_Assert(TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12));

    if( src.empty() )
        CV_Error( CV_StsBadArg, "The input image is empty" );

@ -154,6 +156,8 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,

void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria)
{
    CV_Assert(TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12));

    if( src.empty() )
        CV_Error( CV_StsBadArg, "The input image is empty" );

@ -87,8 +87,10 @@ namespace cv
{
void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);

void set_to_without_mask (DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0);
void set_to_with_mask (DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
template <typename T>
void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);

void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
}
@ -363,9 +365,11 @@ namespace
    }
};

template <typename T>
void kernelSet(GpuMat& src, const Scalar& s)
{
    matrix_operations::set_to_without_mask(src, src.depth(), s.val, src.channels());
    Scalar_<T> sf = s;
    matrix_operations::set_to_gpu(src, sf.val, src.channels(), 0);
}

template<int SDEPTH, int SCN> struct NppSetMaskFunc
@ -412,9 +416,11 @@ namespace
    }
};

template <typename T>
void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask)
{
    matrix_operations::set_to_with_mask(src, src.depth(), s.val, mask, src.channels());
    Scalar_<T> sf = s;
    matrix_operations::set_to_gpu(src, sf.val, mask, src.channels(), 0);
}
}

@ -433,13 +439,13 @@ GpuMat& GpuMat::setTo(const Scalar& s, const GpuMat& mask)
typedef void (*set_caller_t)(GpuMat& src, const Scalar& s);
static const set_caller_t set_callers[8][4] =
{
    {NppSet<CV_8U, 1, nppiSet_8u_C1R>::set,kernelSet,kernelSet,NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
    {kernelSet,kernelSet,kernelSet,kernelSet},
    {NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,kernelSet,kernelSet,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
    {NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,kernelSet,kernelSet,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
    {NppSet<CV_32S, 1, nppiSet_32s_C1R>::set,kernelSet,kernelSet,NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
    {NppSet<CV_32F, 1, nppiSet_32f_C1R>::set,kernelSet,kernelSet,NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
    {kernelSet,kernelSet,kernelSet,kernelSet},
    {NppSet<CV_8U, 1, nppiSet_8u_C1R>::set,kernelSet<uchar>,kernelSet<uchar>,NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
    {kernelSet<schar>,kernelSet<schar>,kernelSet<schar>,kernelSet<schar>},
    {NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,kernelSet<ushort>,kernelSet<ushort>,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
    {NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,kernelSet<short>,kernelSet<short>,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
    {NppSet<CV_32S, 1, nppiSet_32s_C1R>::set,kernelSet<int>,kernelSet<int>,NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
    {NppSet<CV_32F, 1, nppiSet_32f_C1R>::set,kernelSet<float>,kernelSet<float>,NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
    {kernelSet<double>,kernelSet<double>,kernelSet<double>,kernelSet<double>},
    {0,0,0,0}
};
set_callers[depth()][channels()-1](*this, s);
@ -449,13 +455,13 @@ GpuMat& GpuMat::setTo(const Scalar& s, const GpuMat& mask)
typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, const GpuMat& mask);
static const set_caller_t set_callers[8][4] =
{
    {NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},
    {kernelSetMask,kernelSetMask,kernelSetMask,kernelSetMask},
    {NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},
    {NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},
    {NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},
    {NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},
    {kernelSetMask,kernelSetMask,kernelSetMask,kernelSetMask},
    {NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set,kernelSetMask<uchar>,kernelSetMask<uchar>,NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},
    {kernelSetMask<schar>,kernelSetMask<schar>,kernelSetMask<schar>,kernelSetMask<schar>},
    {NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set,kernelSetMask<ushort>,kernelSetMask<ushort>,NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},
    {NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set,kernelSetMask<short>,kernelSetMask<short>,NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},
    {NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set,kernelSetMask<int>,kernelSetMask<int>,NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},
    {NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set,kernelSetMask<float>,kernelSetMask<float>,NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},
    {kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>},
    {0,0,0,0}
};
set_callers[depth()][channels()-1](*this, s, mask);

@ -227,6 +227,8 @@ inline int dist2(const cv::Vec2s& lhs, const cv::Vec2s& rhs)

void cv::gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize, TermCriteria criteria)
{
    CV_Assert(TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12));

    CV_Assert(src.type() == CV_8UC4);
    const int nrows = src.rows;
    const int ncols = src.cols;

@ -40,6 +40,9 @@
//
//M*/

#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vecmath.hpp"

namespace cv
{
namespace gpu
@ -48,7 +51,7 @@ namespace cv
{
struct BrdReflect101
{
    BrdReflect101(int len): last(len - 1) {}
    explicit BrdReflect101(int len): last(len - 1) {}

    __device__ int idx_low(int i) const
    {
@ -62,7 +65,7 @@ namespace cv

    __device__ int idx(int i) const
    {
        return abs(idx_high(i));
        return idx_low(idx_high(i));
    }

    bool is_range_safe(int mini, int maxi) const
@ -70,49 +73,55 @@ namespace cv
        return -last <= mini && maxi <= 2 * last;
    }

private:
    int last;
};


template <typename T>
template <typename D>
struct BrdRowReflect101: BrdReflect101
{
    BrdRowReflect101(int len): BrdReflect101(len) {}
    explicit BrdRowReflect101(int len): BrdReflect101(len) {}

    __device__ float at_low(int i, const T* data) const
    template <typename T>
    __device__ D at_low(int i, const T* data) const
    {
        return data[idx_low(i)];
        return saturate_cast<D>(data[idx_low(i)]);
    }

    __device__ float at_high(int i, const T* data) const
    template <typename T>
    __device__ D at_high(int i, const T* data) const
    {
        return data[idx_high(i)];
        return saturate_cast<D>(data[idx_high(i)]);
    }
};


template <typename T>
template <typename D>
struct BrdColReflect101: BrdReflect101
{
    BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {}

    __device__ float at_low(int i, const T* data) const
    template <typename T>
    __device__ D at_low(int i, const T* data) const
    {
        return data[idx_low(i) * step];
        return saturate_cast<D>(data[idx_low(i) * step]);
    }

    __device__ float at_high(int i, const T* data) const
    template <typename T>
    __device__ D at_high(int i, const T* data) const
    {
        return data[idx_high(i) * step];
        return saturate_cast<D>(data[idx_high(i) * step]);
    }

private:
    int step;
};


struct BrdReplicate
{
    BrdReplicate(int len): last(len - 1) {}
    explicit BrdReplicate(int len): last(len - 1) {}

    __device__ int idx_low(int i) const
    {
@ -126,7 +135,7 @@ namespace cv

    __device__ int idx(int i) const
    {
        return max(min(i, last), 0);
        return idx_low(idx_high(i));
    }

    bool is_range_safe(int mini, int maxi) const
@ -134,43 +143,105 @@ namespace cv
        return true;
    }

private:
    int last;
};


template <typename T>
template <typename D>
struct BrdRowReplicate: BrdReplicate
{
    BrdRowReplicate(int len): BrdReplicate(len) {}
    explicit BrdRowReplicate(int len): BrdReplicate(len) {}

    __device__ float at_low(int i, const T* data) const
    template <typename T>
    __device__ D at_low(int i, const T* data) const
    {
        return data[idx_low(i)];
        return saturate_cast<D>(data[idx_low(i)]);
    }

    __device__ float at_high(int i, const T* data) const
    template <typename T>
    __device__ D at_high(int i, const T* data) const
    {
        return data[idx_high(i)];
        return saturate_cast<D>(data[idx_high(i)]);
    }
};


template <typename T>
template <typename D>
struct BrdColReplicate: BrdReplicate
{
    BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {}

    __device__ float at_low(int i, const T* data) const
    template <typename T>
    __device__ D at_low(int i, const T* data) const
    {
        return data[idx_low(i) * step];
        return saturate_cast<D>(data[idx_low(i) * step]);
    }

    __device__ float at_high(int i, const T* data) const
    template <typename T>
    __device__ D at_high(int i, const T* data) const
    {
        return data[idx_high(i) * step];
        return saturate_cast<D>(data[idx_high(i) * step]);
    }

private:
    int step;
};

template <typename D>
struct BrdRowConstant
{
    explicit BrdRowConstant(int len_, const D& val_ = VecTraits<D>::all(0)): len(len_), val(val_) {}

    template <typename T>
    __device__ D at_low(int i, const T* data) const
    {
        return i >= 0 ? saturate_cast<D>(data[i]) : val;
    }

    template <typename T>
    __device__ D at_high(int i, const T* data) const
    {
        return i < len ? saturate_cast<D>(data[i]) : val;
    }

    bool is_range_safe(int mini, int maxi) const
    {
        return true;
    }

private:
    int len;
    D val;
};

template <typename D>
struct BrdColConstant
{
    BrdColConstant(int len_, int step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}

    template <typename T>
    __device__ D at_low(int i, const T* data) const
    {
        return i >= 0 ? saturate_cast<D>(data[i * step]) : val;
    }

    template <typename T>
    __device__ D at_high(int i, const T* data) const
    {
        return i < len ? saturate_cast<D>(data[i * step]) : val;
    }

    bool is_range_safe(int mini, int maxi) const
    {
        return true;
    }

private:
    int len;
    int step;
    D val;
};
}
}
}
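The idx() change above makes both borders mirror correctly: idx_low(idx_high(i)) first reflects overflow around last, then reflects any remaining underflow around 0, whereas the old abs(idx_high(i)) covered only one side. A host-side model of the rule, assuming the usual reflect-101 definitions idx_low(i) = abs(i) and idx_high(i) = last - abs(last - i) (those bodies sit in context this diff does not show):

int reflect101(int i, int len)
{
    int last = len - 1;
    int high = last - abs(last - i); // idx_high: mirror overflow around last
    return abs(high);                // idx_low: mirror underflow around 0
}
// reflect101(-2, 5) == 2 and reflect101(6, 5) == 2,
// matching OpenCV's BORDER_REFLECT_101 convention.
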
@ -329,6 +329,7 @@ namespace cv
grid.y = divUp(src.rows, threads.y);

device::transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );
@ -345,6 +346,7 @@ namespace cv
grid.y = divUp(src1.rows, threads.y);

device::transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );
@ -365,6 +367,7 @@ namespace cv
grid.y = divUp(src.rows, threads.y);

device::transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );
@ -383,6 +386,7 @@ namespace cv
grid.y = divUp(src1.rows, threads.y);

device::transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );
@ -65,6 +65,7 @@ namespace cv { namespace gpu { namespace surf
dim3 calcBlockSize(int nIntervals);

void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);
void fasthessian_gpu_old(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threadsOld);

void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter,
    int x_size, int y_size, bool use_mask, const dim3& threads);
@ -75,6 +76,7 @@ namespace cv { namespace gpu { namespace surf
void find_orientation_gpu(KeyPoint_GPU* features, int nFeatures);

void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);
void compute_descriptors_gpu_old(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);
}}}

using namespace cv::gpu::surf;
@ -170,6 +172,10 @@ namespace

void detectKeypoints(GpuMat& keypoints)
{
    typedef void (*fasthessian_t)(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);
    const fasthessian_t fasthessian =
        DeviceInfo().supports(COMPUTE_13) ? fasthessian_gpu : fasthessian_gpu_old;

    dim3 threads = calcBlockSize(nIntervals);
    for(int octave = 0; octave < nOctaves; ++octave)
    {
@ -192,7 +198,7 @@ namespace
        uploadConstant("cv::gpu::surf::c_border", border);
        uploadConstant("cv::gpu::surf::c_step", step);

        fasthessian_gpu(hessianBuffer, x_size, y_size, threads);
        fasthessian(hessianBuffer, x_size, y_size, threads);

        // Reset the candidate count.
        maxCounter = 0;
@ -201,10 +207,13 @@ namespace

        maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));

        fh_interp_extremum_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter,
            featuresBuffer.ptr<KeyPoint_GPU>(), featureCounter);
        if (maxCounter > 0)
        {
            fh_interp_extremum_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter,
                featuresBuffer.ptr<KeyPoint_GPU>(), featureCounter);

        featureCounter = std::min(featureCounter, static_cast<unsigned int>(max_features));
            featureCounter = std::min(featureCounter, static_cast<unsigned int>(max_features));
        }
    }

    if (featureCounter > 0)
@ -221,10 +230,16 @@ namespace

void computeDescriptors(const GpuMat& keypoints, GpuMat& descriptors, int descriptorSize)
{
    typedef void (*compute_descriptors_t)(const DevMem2Df& descriptors,
        const KeyPoint_GPU* features, int nFeatures);

    const compute_descriptors_t compute_descriptors =
        DeviceInfo().supports(COMPUTE_13) ? compute_descriptors_gpu : compute_descriptors_gpu_old;

    if (keypoints.cols > 0)
    {
        descriptors.create(keypoints.cols, descriptorSize, CV_32F);
        compute_descriptors_gpu(descriptors, keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);
        compute_descriptors(descriptors, keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);
    }
}

@ -384,6 +384,14 @@ void CV_GpuBruteForceMatcherTest::knnMatchTest( const GpuMat& query, const GpuMat& train )

void CV_GpuBruteForceMatcherTest::radiusMatchTest( const GpuMat& query, const GpuMat& train )
{
    bool atomics_ok = TargetArchs::builtWith(ATOMICS) && DeviceInfo().supports(ATOMICS);
    if (!atomics_ok)
    {
        ts->printf(CvTS::CONSOLE, "\nCode and device atomics support is required for radiusMatch (CC >= 1.1)");
        ts->set_failed_test_info(CvTS::FAIL_GENERIC);
        return;
    }

    dmatcher.clear();
    // test const version of match()
    {
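The guard is two-sided on purpose: TargetArchs::builtWith(ATOMICS) asks whether the library binary was compiled with code for the feature, while DeviceInfo().supports(ATOMICS) asks whether the card the test is actually running on provides it (CC >= 1.1). Both must hold before radiusMatch can run; a minimal sketch, with the helper name being hypothetical:

    using namespace cv::gpu;

    bool canRunRadiusMatch()
    {
        // builtWith: the compiled-in PTX/cubin covers the feature.
        // supports:  the physical device exposes it.
        return TargetArchs::builtWith(ATOMICS) && DeviceInfo().supports(ATOMICS);
    }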
@ -501,15 +509,24 @@ void CV_GpuBruteForceMatcherTest::dataTest(int dim)

void CV_GpuBruteForceMatcherTest::run(int)
{
    emptyDataTest();
    try
    {
        emptyDataTest();

    dataTest(50);
    dataTest(64);
    dataTest(100);
    dataTest(128);
    dataTest(200);
    dataTest(256);
    dataTest(300);
        dataTest(50);
        dataTest(64);
        dataTest(100);
        dataTest(128);
        dataTest(200);
        dataTest(256);
        dataTest(300);
    }
    catch(cv::Exception& e)
    {
        if (!check_and_treat_gpu_exception(e, ts))
            throw;
        return;
    }
}

CV_GpuBruteForceMatcherTest CV_GpuBruteForceMatcher_test;
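This run() rewrite (and the SURF one below) wraps the whole test body so that GPU errors raised as cv::Exception are classified instead of aborting the suite. The shape of the pattern, with a hypothetical runBody() standing in for the actual test calls:

    void run(int)
    {
        try
        {
            runBody(); // hypothetical: emptyDataTest(), dataTest(...), etc. go here
        }
        catch (cv::Exception& e)
        {
            // check_and_treat_gpu_exception returns true when the exception is a
            // recognized GPU condition and the test status has been recorded;
            // anything unexpected is rethrown.
            if (!check_and_treat_gpu_exception(e, ts))
                throw;
            return;
        }
    }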
@ -154,7 +154,7 @@ void CV_GPU_SURFTest::compareKeypointSets(const vector<KeyPoint>& validKeypoints
    return;
}

if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.0f)
if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.5f)
{
    ts->printf(CvTS::LOG, "Bad descriptors accuracy.\n");
    ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );
@ -221,10 +221,19 @@ void CV_GPU_SURFTest::regressionTest(SURF_GPU& fdetector)

void CV_GPU_SURFTest::run( int /*start_from*/ )
{
    SURF_GPU fdetector;
    try
    {
        SURF_GPU fdetector;

    emptyDataTest(fdetector);
    regressionTest(fdetector);
        emptyDataTest(fdetector);
        regressionTest(fdetector);
    }
    catch(cv::Exception& e)
    {
        if (!check_and_treat_gpu_exception(e, ts))
            throw;
        return;
    }
}

CV_GPU_SURFTest CV_GPU_SURF_test;
@ -43,15 +43,15 @@

CvTS test_system("gpu");

const char* blacklist[] =
{
    "GPU-NppImageCanny", // NPP_TEXTURE_BIND_ERROR
    0
};
//const char* blacklist[] =
//{
//    "GPU-NVidia",
//    0
//};

int main( int argc, char** argv )
{
    return test_system.run( argc, argv, blacklist );
    return test_system.run( argc, argv );
}

/* End of file. */
@ -43,6 +43,9 @@
#include <iostream>
#include <string>

using namespace cv;
using namespace cv::gpu;

struct CV_GpuMeanShiftTest : public CvTest
{
@ -50,6 +53,14 @@ struct CV_GpuMeanShiftTest : public CvTest

void run(int)
{
    bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
    if (!cc12_ok)
    {
        ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
        ts->set_failed_test_info(CvTS::FAIL_GENERIC);
        return;
    }

    int spatialRad = 30;
    int colorRad = 30;
@ -134,6 +145,14 @@ struct CV_GpuMeanShiftProcTest : public CvTest

void run(int)
{
    bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
    if (!cc12_ok)
    {
        ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
        ts->set_failed_test_info(CvTS::FAIL_GENERIC);
        return;
    }

    int spatialRad = 30;
    int colorRad = 30;
@ -54,6 +54,14 @@ struct CV_GpuMeanShiftSegmentationTest : public CvTest {
{
    try
    {
        bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
        if (!cc12_ok)
        {
            ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
            ts->set_failed_test_info(CvTS::FAIL_GENERIC);
            return;
        }

        Mat img_rgb = imread(string(ts->get_data_path()) + "meanshift/cones.png");
        if (img_rgb.empty())
        {
@ -91,14 +91,14 @@ void CV_GpuMatOpConvertToTest::run(int /* start_from */)

Mat cpumatdst;
GpuMat gpumatdst;

cpumatsrc.convertTo(cpumatdst, dst_type);
gpumatsrc.convertTo(gpumatdst, dst_type);
cpumatsrc.convertTo(cpumatdst, dst_type, 0.5, 3.0);
gpumatsrc.convertTo(gpumatdst, dst_type, 0.5, 3.0);

double r = norm(cpumatdst, gpumatdst, NORM_INF);
if (r > 1)
{
    ts->printf(CvTS::LOG,
        "\nFAILED: SRC_TYPE=%sC%d DST_TYPE=%s NORM = %d\n",
        "\nFAILED: SRC_TYPE=%sC%d DST_TYPE=%s NORM = %f\n",
        types_str[i], c, types_str[j], r);
    passed = false;
}
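The test now exercises convertTo's scale/offset path, dst = saturate_cast<dst_type>(src * alpha + beta), on both backends and compares them under the infinity norm (the %d to %f fix is needed because r is a double). A small usage sketch with arbitrary input values:

    cv::Mat src(4, 4, CV_8UC1, cv::Scalar(10));
    cv::Mat dstCpu;
    src.convertTo(dstCpu, CV_32F, 0.5, 3.0);   // every element becomes 10 * 0.5 + 3.0 = 8.0

    cv::gpu::GpuMat gsrc(src), dstGpu;         // upload, then run the same transform on the device
    gsrc.convertTo(dstGpu, CV_32F, 0.5, 3.0);

    double r = cv::norm(dstCpu, cv::Mat(dstGpu), cv::NORM_INF);
    // the test treats r <= 1 as a pass, allowing for rounding differences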