Merge pull request #3924 from jet47:gpu-fixes

commit de8d8720a2
Alexander Smorkalov committed 2015-04-19 11:04:54 +00:00
10 changed files with 27 additions and 9 deletions

View File

@@ -123,7 +123,7 @@ PERF_TEST_P(Image_NFeatures, Features2D_ORB,
 sortKeyPoints(gpu_keypoints, gpu_descriptors);
-SANITY_CHECK_KEYPOINTS(gpu_keypoints);
+SANITY_CHECK_KEYPOINTS(gpu_keypoints, 1e-10);
 SANITY_CHECK(gpu_descriptors);
 }
 else

View File

@@ -1011,7 +1011,7 @@ PERF_TEST_P(Sz_Flags, ImgProc_MulSpectrums,
 TEST_CYCLE() cv::gpu::mulSpectrums(d_a, d_b, dst, flag);
-GPU_SANITY_CHECK(dst);
+GPU_SANITY_CHECK(dst, 2);
 }
 else
 {
@@ -1045,7 +1045,7 @@ PERF_TEST_P(Sz, ImgProc_MulAndScaleSpectrums,
 TEST_CYCLE() cv::gpu::mulAndScaleSpectrums(d_src1, d_src2, dst, cv::DFT_ROWS, scale, false);
-GPU_SANITY_CHECK(dst);
+GPU_SANITY_CHECK(dst, 1e-5);
 }
 else
 {

View File

@@ -340,8 +340,8 @@ PERF_TEST_P(ImagePair_WinSz_Levels_Iters, Video_PyrLKOpticalFlowDense,
 TEST_CYCLE() d_pyrLK.dense(d_frame0, d_frame1, u, v);
-GPU_SANITY_CHECK(u);
-GPU_SANITY_CHECK(v);
+GPU_SANITY_CHECK(u, 0.5);
+GPU_SANITY_CHECK(v, 0.5);
 }
 else
 {
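Note: the three performance-test hunks above replace bit-exact sanity checks with tolerance-based ones (1e-10, 2, 1e-5 and 0.5 respectively), so small numeric differences between GPU and reference results no longer fail the regression baselines. The snippet below is only a sketch of that idea, not the actual SANITY_CHECK/GPU_SANITY_CHECK macros; withinTolerance is a hypothetical helper.

    #include <opencv2/core/core.hpp>

    // Accept the GPU output if its largest per-element deviation from the
    // reference stays within eps, instead of demanding exact equality.
    static bool withinTolerance(const cv::Mat& actual, const cv::Mat& reference, double eps)
    {
        const double maxDiff = cv::norm(actual, reference, cv::NORM_INF);
        return maxDiff <= eps;
    }

    // Usage (values mirror the tolerances introduced above):
    //   bool ok = withinTolerance(gpu_result, cpu_result, 1e-5);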

View File

@@ -374,6 +374,7 @@ namespace cv { namespace gpu { namespace device
 }
 template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
 __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
 {
 extern __shared__ int smem[];
@@ -424,6 +425,7 @@ namespace cv { namespace gpu { namespace device
 }
 template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
 __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
 {
 extern __shared__ int smem[];
@@ -553,6 +555,7 @@ namespace cv { namespace gpu { namespace device
 }
 template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
 __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
 {
 extern __shared__ int smem[];
@@ -601,6 +604,7 @@ namespace cv { namespace gpu { namespace device
 }
 template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
 __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
 {
 extern __shared__ int smem[];
@@ -727,6 +731,7 @@ namespace cv { namespace gpu { namespace device
 }
 template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
 __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
 {
 extern __shared__ int smem[];
@@ -775,6 +780,7 @@ namespace cv { namespace gpu { namespace device
 }
 template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
 __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
 {
 extern __shared__ int smem[];
@@ -902,6 +908,7 @@ namespace cv { namespace gpu { namespace device
 // Calc distance kernel
 template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
 __global__ void calcDistanceUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
 {
 extern __shared__ int smem[];
@@ -966,6 +973,7 @@ namespace cv { namespace gpu { namespace device
 }
 template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
 __global__ void calcDistance(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
 {
 extern __shared__ int smem[];
@@ -1066,6 +1074,7 @@ namespace cv { namespace gpu { namespace device
 // find knn match kernel
 template <int BLOCK_SIZE>
+__launch_bounds__(BLOCK_SIZE)
 __global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance)
 {
 const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
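Note: every hunk in this file (and in the two brute-force matcher files that follow) inserts __launch_bounds__ in front of a kernel. __launch_bounds__(maxThreadsPerBlock) tells the compiler the largest block size the kernel will ever be launched with, so it can budget registers for that size and the launch cannot fail with a "too many resources requested" error for the matching block shape. A minimal, hypothetical kernel showing the same pattern (scaleKernel and its launch line are assumptions, not code from this commit):

    // Cap the kernel at BLOCK_SIZE * BLOCK_SIZE threads per block, matching
    // the dim3(BLOCK_SIZE, BLOCK_SIZE) blocks it is launched with.
    template <int BLOCK_SIZE>
    __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
    __global__ void scaleKernel(const float* src, float* dst, int n, float alpha)
    {
        // Flatten the 2D thread block into a single element index.
        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
        const int idx = blockIdx.x * (blockDim.x * blockDim.y) + tid;
        if (idx < n)
            dst[idx] = alpha * src[idx];
    }

    // Launch example, one 16x16 block per 256 elements:
    //   scaleKernel<16><<<(n + 255) / 256, dim3(16, 16)>>>(d_src, d_dst, n, 2.0f);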

View File

@@ -136,6 +136,7 @@ namespace cv { namespace gpu { namespace device
 }
 template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
 __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
 {
 extern __shared__ int smem[];
@@ -184,6 +185,7 @@ namespace cv { namespace gpu { namespace device
 }
 template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
 __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
 int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
 {
@@ -296,6 +298,7 @@ namespace cv { namespace gpu { namespace device
 }
 template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
 __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
 {
 extern __shared__ int smem[];
@@ -342,6 +345,7 @@ namespace cv { namespace gpu { namespace device
 }
 template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
 __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
 int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
 {
@@ -451,6 +455,7 @@ namespace cv { namespace gpu { namespace device
 }
 template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
 __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
 {
 extern __shared__ int smem[];
@@ -497,6 +502,7 @@ namespace cv { namespace gpu { namespace device
 }
 template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
 __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
 int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
 {

View File

@@ -56,6 +56,7 @@ namespace cv { namespace gpu { namespace device
 // Match Unrolled
 template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
+__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
 __global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
 PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
 {
@@ -164,6 +165,7 @@ namespace cv { namespace gpu { namespace device
 // Match
 template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
+__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
 __global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
 PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
 {

View File

@@ -288,7 +288,7 @@ __global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u sr
 Ncv32u curElemOffs = offsetX + threadIdx.x;
 T_out curScanElem;
-T_in curElem;
+T_in curElem = 0;
 T_out curElemMod;
 if (curElemOffs < srcWidth)
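Note: the hunk above zero-initialises curElem. When curElemOffs falls outside the row, the guard that follows never assigns it, and any later read of curElem would see an indeterminate value. A reduced, hypothetical illustration of the same fix (loadOrZero is not from this commit):

    // Give out-of-range threads a well-defined neutral value instead of
    // reading an uninitialised variable further down.
    static float loadOrZero(const float* row, int offs, int width)
    {
        float curElem = 0.0f;        // was previously left uninitialised
        if (offs < width)
            curElem = row[offs];     // only in-range offsets load real data
        return curElem;              // always defined, even past the row end
    }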

View File

@@ -58,7 +58,7 @@ int main(int argc, char** argv)
 "{ h | help ? | false | Print help}"
 "{ i | info | false | Print information about system and exit }"
 "{ d | device | -1 | Device on which tests will be executed (-1 means all devices) }"
-"{ nvtest_output_level | nvtest_output_level | none | NVidia test verbosity level (none, compact, full) }"
+"{ nvtest_output_level | nvtest_output_level | full | NVidia test verbosity level (none, compact, full) }"
 ;
 CommandLineParser cmd(argc, (const char**)argv, keys);

View File

@@ -323,6 +323,7 @@ static void devNullOutput(const std::string& msg)
 bool nvidia_NPPST_Integral_Image(const std::string& test_data_path, OutputLevel outputLevel)
 {
 path = test_data_path.c_str();
+if (outputLevel != OutputLevelFull)
 ncvSetDebugOutputHandler(devNullOutput);
 NCVAutoTestLister testListerII("NPPST Integral Image", outputLevel);
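Note: the added guard keeps the default NCV debug output handler only when full verbosity is requested; for the other output levels the handler is swapped for devNullOutput, which discards messages. A condensed, hypothetical sketch of that pattern (configureNcvLogging is not part of this commit; OutputLevel, OutputLevelFull and ncvSetDebugOutputHandler are assumed to come from the NCV/GPU test headers, as in the diff):

    #include <string>

    // Silence NCV debug messages unless the caller asked for full output.
    static void devNullOutput(const std::string& msg)
    {
        (void)msg;  // intentionally drop the message
    }

    static void configureNcvLogging(OutputLevel outputLevel)
    {
        if (outputLevel != OutputLevelFull)
            ncvSetDebugOutputHandler(devNullOutput);
    }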

View File

@@ -44,7 +44,7 @@
 #ifdef HAVE_CUDA
-OutputLevel nvidiaTestOutputLevel = OutputLevelNone;
+OutputLevel nvidiaTestOutputLevel = OutputLevelFull;
 using namespace cvtest;
 using namespace testing;