diff --git a/modules/gpu/perf/perf_features2d.cpp b/modules/gpu/perf/perf_features2d.cpp index 45823cef2c..2b1ab58129 100644 --- a/modules/gpu/perf/perf_features2d.cpp +++ b/modules/gpu/perf/perf_features2d.cpp @@ -123,7 +123,7 @@ PERF_TEST_P(Image_NFeatures, Features2D_ORB, sortKeyPoints(gpu_keypoints, gpu_descriptors); - SANITY_CHECK_KEYPOINTS(gpu_keypoints); + SANITY_CHECK_KEYPOINTS(gpu_keypoints, 1e-10); SANITY_CHECK(gpu_descriptors); } else diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp index 680fb35432..c7c1022941 100644 --- a/modules/gpu/perf/perf_imgproc.cpp +++ b/modules/gpu/perf/perf_imgproc.cpp @@ -1011,7 +1011,7 @@ PERF_TEST_P(Sz_Flags, ImgProc_MulSpectrums, TEST_CYCLE() cv::gpu::mulSpectrums(d_a, d_b, dst, flag); - GPU_SANITY_CHECK(dst); + GPU_SANITY_CHECK(dst, 2); } else { @@ -1045,7 +1045,7 @@ PERF_TEST_P(Sz, ImgProc_MulAndScaleSpectrums, TEST_CYCLE() cv::gpu::mulAndScaleSpectrums(d_src1, d_src2, dst, cv::DFT_ROWS, scale, false); - GPU_SANITY_CHECK(dst); + GPU_SANITY_CHECK(dst, 1e-5); } else { diff --git a/modules/gpu/perf/perf_video.cpp b/modules/gpu/perf/perf_video.cpp index 2d33ff9324..e71df5a887 100644 --- a/modules/gpu/perf/perf_video.cpp +++ b/modules/gpu/perf/perf_video.cpp @@ -340,8 +340,8 @@ PERF_TEST_P(ImagePair_WinSz_Levels_Iters, Video_PyrLKOpticalFlowDense, TEST_CYCLE() d_pyrLK.dense(d_frame0, d_frame1, u, v); - GPU_SANITY_CHECK(u); - GPU_SANITY_CHECK(v); + GPU_SANITY_CHECK(u, 0.5); + GPU_SANITY_CHECK(v, 0.5); } else { diff --git a/modules/gpu/src/cuda/bf_knnmatch.cu b/modules/gpu/src/cuda/bf_knnmatch.cu index 66e37d088a..3e5bc741ff 100644 --- a/modules/gpu/src/cuda/bf_knnmatch.cu +++ b/modules/gpu/src/cuda/bf_knnmatch.cu @@ -374,6 +374,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void matchUnrolledCached(const PtrStepSz query, const PtrStepSz train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) { extern __shared__ int smem[]; @@ -424,6 +425,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void matchUnrolledCached(const PtrStepSz query, const PtrStepSz* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) { extern __shared__ int smem[]; @@ -553,6 +555,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void matchUnrolled(const PtrStepSz query, const PtrStepSz train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) { extern __shared__ int smem[]; @@ -601,6 +604,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void matchUnrolled(const PtrStepSz query, const PtrStepSz* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) { extern __shared__ int smem[]; @@ -727,6 +731,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void match(const PtrStepSz query, const PtrStepSz train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) { extern __shared__ int smem[]; @@ -775,6 +780,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void match(const PtrStepSz query, const PtrStepSz* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) { extern __shared__ int smem[]; @@ -902,6 +908,7 @@ namespace cv { namespace gpu { namespace device // Calc distance kernel template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void calcDistanceUnrolled(const PtrStepSz query, const PtrStepSz train, const Mask mask, PtrStepf allDist) { extern __shared__ int smem[]; @@ -966,6 +973,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void calcDistance(const PtrStepSz query, const PtrStepSz train, const Mask mask, PtrStepf allDist) { extern __shared__ int smem[]; @@ -1066,6 +1074,7 @@ namespace cv { namespace gpu { namespace device // find knn match kernel template + __launch_bounds__(BLOCK_SIZE) __global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance) { const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64; diff --git a/modules/gpu/src/cuda/bf_match.cu b/modules/gpu/src/cuda/bf_match.cu index f7bdcdc0f1..c2ae48bb30 100644 --- a/modules/gpu/src/cuda/bf_match.cu +++ b/modules/gpu/src/cuda/bf_match.cu @@ -136,6 +136,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void matchUnrolledCached(const PtrStepSz query, const PtrStepSz train, const Mask mask, int* bestTrainIdx, float* bestDistance) { extern __shared__ int smem[]; @@ -184,6 +185,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void matchUnrolledCached(const PtrStepSz query, const PtrStepSz* trains, int n, const Mask mask, int* bestTrainIdx, int* bestImgIdx, float* bestDistance) { @@ -296,6 +298,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void matchUnrolled(const PtrStepSz query, const PtrStepSz train, const Mask mask, int* bestTrainIdx, float* bestDistance) { extern __shared__ int smem[]; @@ -342,6 +345,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void matchUnrolled(const PtrStepSz query, const PtrStepSz* trains, int n, const Mask mask, int* bestTrainIdx, int* bestImgIdx, float* bestDistance) { @@ -451,6 +455,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void match(const PtrStepSz query, const PtrStepSz train, const Mask mask, int* bestTrainIdx, float* bestDistance) { extern __shared__ int smem[]; @@ -497,6 +502,7 @@ namespace cv { namespace gpu { namespace device } template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void match(const PtrStepSz query, const PtrStepSz* trains, int n, const Mask mask, int* bestTrainIdx, int* bestImgIdx, float* bestDistance) { diff --git a/modules/gpu/src/cuda/bf_radius_match.cu b/modules/gpu/src/cuda/bf_radius_match.cu index 44cd2b55f9..d83f9f7f96 100644 --- a/modules/gpu/src/cuda/bf_radius_match.cu +++ b/modules/gpu/src/cuda/bf_radius_match.cu @@ -56,6 +56,7 @@ namespace cv { namespace gpu { namespace device // Match Unrolled template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void matchUnrolled(const PtrStepSz query, int imgIdx, const PtrStepSz train, float maxDistance, const Mask mask, PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) { @@ -164,6 +165,7 @@ namespace cv { namespace gpu { namespace device // Match template + __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE) __global__ void match(const PtrStepSz query, int imgIdx, const PtrStepSz train, float maxDistance, const Mask mask, PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) { diff --git a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu index 525feb68b7..af65bbf3e3 100644 --- a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu +++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu @@ -288,7 +288,7 @@ __global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u sr Ncv32u curElemOffs = offsetX + threadIdx.x; T_out curScanElem; - T_in curElem; + T_in curElem = 0; T_out curElemMod; if (curElemOffs < srcWidth) diff --git a/modules/gpu/test/main.cpp b/modules/gpu/test/main.cpp index 01a29618b0..f9549fb8d6 100644 --- a/modules/gpu/test/main.cpp +++ b/modules/gpu/test/main.cpp @@ -58,7 +58,7 @@ int main(int argc, char** argv) "{ h | help ? | false | Print help}" "{ i | info | false | Print information about system and exit }" "{ d | device | -1 | Device on which tests will be executed (-1 means all devices) }" - "{ nvtest_output_level | nvtest_output_level | none | NVidia test verbosity level (none, compact, full) }" + "{ nvtest_output_level | nvtest_output_level | full | NVidia test verbosity level (none, compact, full) }" ; CommandLineParser cmd(argc, (const char**)argv, keys); diff --git a/modules/gpu/test/nvidia/main_nvidia.cpp b/modules/gpu/test/nvidia/main_nvidia.cpp index 07083151ce..32f83d3d99 100644 --- a/modules/gpu/test/nvidia/main_nvidia.cpp +++ b/modules/gpu/test/nvidia/main_nvidia.cpp @@ -323,7 +323,8 @@ static void devNullOutput(const std::string& msg) bool nvidia_NPPST_Integral_Image(const std::string& test_data_path, OutputLevel outputLevel) { path = test_data_path.c_str(); - ncvSetDebugOutputHandler(devNullOutput); + if (outputLevel != OutputLevelFull) + ncvSetDebugOutputHandler(devNullOutput); NCVAutoTestLister testListerII("NPPST Integral Image", outputLevel); diff --git a/modules/gpu/test/test_nvidia.cpp b/modules/gpu/test/test_nvidia.cpp index d713b41bdb..e0653562ec 100644 --- a/modules/gpu/test/test_nvidia.cpp +++ b/modules/gpu/test/test_nvidia.cpp @@ -44,7 +44,7 @@ #ifdef HAVE_CUDA -OutputLevel nvidiaTestOutputLevel = OutputLevelNone; +OutputLevel nvidiaTestOutputLevel = OutputLevelFull; using namespace cvtest; using namespace testing;