diff --git a/modules/gpu/src/cuda/resize.cu b/modules/gpu/src/cuda/resize.cu index b84b29f92a..844df9be2f 100644 --- a/modules/gpu/src/cuda/resize.cu +++ b/modules/gpu/src/cuda/resize.cu @@ -537,7 +537,7 @@ namespace cv { namespace gpu { namespace device } template - __global__ void resise_scan_fast_x(const DevMem2D_ src, DevMem2D_ dst, int fx, int fy, int thred_lines) + __global__ void resise_scan_fast_x(const DevMem2D_ src, DevMem2D_ dst, int fx, int fy, int thred_lines, int stride) { extern __shared__ W sbuf[]; @@ -545,11 +545,14 @@ namespace cv { namespace gpu { namespace device // load line-block on shared memory int y = blockIdx.x / thred_lines; - int input_stride = (blockIdx.x - y * thred_lines) * blockDim.x; + int input_stride = (blockIdx.x % thred_lines) * stride; int x = input_stride + tid; // store global data in shared memory - sbuf[tid] = src(y, x); + if (x < src.cols && y < src.rows) + sbuf[tid] = src(y, x); + else + sbuf[tid] = 0; __syncthreads(); scan_block(sbuf); @@ -575,7 +578,7 @@ namespace cv { namespace gpu { namespace device } template - __global__ void resise_scan_fast_y(const DevMem2D_ src, DevMem2D_ dst, int fx, int fy, int thred_lines) + __global__ void resise_scan_fast_y(const DevMem2D_ src, DevMem2D_ dst, int fx, int fy, int thred_lines, int stride) { extern __shared__ W sbuf[]; @@ -584,13 +587,15 @@ namespace cv { namespace gpu { namespace device // load line-block on shared memory int x = blockIdx.x / thred_lines; - int global_stride = (blockIdx.x % thred_lines) * blockDim.x; - if (!tid) printf("STRIDE : %d", global_stride); + int global_stride = (blockIdx.x % thred_lines) * stride; int y = global_stride + tid; // store global data in shared memory + if (x < src.cols && y < src.rows) + sbuf[tid] = src(y, x); + else + sbuf[tid] = 0; - sbuf[tid] = src(y, x); __syncthreads(); scan_block(sbuf); @@ -623,28 +628,30 @@ namespace cv { namespace gpu { namespace device int iscale_x = round(fx); int iscale_y = round(fy); - const int warps = 4; + int warps = 4; const int threads = 32 * warps; + int input_stride = threads / iscale_x; - int thred_lines = divUp(src.cols, threads); + int thred_lines = divUp(src.cols, input_stride * iscale_x); int blocks = src.rows * thred_lines; - printf("device code executed for X coordinate with:\nsize %d warps %d, threads %d, thred_lines %d, blocks %d\n", - src.cols, warps, threads, thred_lines, blocks); + printf("device code executed for X coordinate with:\nsize %d warps %d, threads %d, thred_lines %d, blocks %d input strude %d\n", + src.cols, warps, threads, thred_lines, blocks, input_stride * iscale_x); typedef typename scan_traits::scan_line_type smem_type; resise_scan_fast_x<<>> - (src, buffer, iscale_x, iscale_y, thred_lines); + (src, buffer, iscale_x, iscale_y, thred_lines, input_stride * iscale_x); - thred_lines = divUp(src.rows, threads); + input_stride = threads / iscale_y; + thred_lines = divUp(src.rows, input_stride * iscale_y); blocks = dst.cols * thred_lines; printf("device code executed for Y coordinate with:\nsize %d warps %d, threads %d, thred_lines %d, blocks %d\n", dst.rows, warps, threads, thred_lines, blocks); resise_scan_fast_y<<>> - (buffer, dst, iscale_x, iscale_y, thred_lines); + (buffer, dst, iscale_x, iscale_y, thred_lines, input_stride * iscale_y); cudaSafeCall( cudaGetLastError() ); diff --git a/modules/gpu/src/resize.cpp b/modules/gpu/src/resize.cpp index ff41818848..25bdce42d2 100644 --- a/modules/gpu/src/resize.cpp +++ b/modules/gpu/src/resize.cpp @@ -95,6 +95,7 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, GpuMat& buffer, Size dsize, CV_Assert( (fx < 1.0) && (fy < 1.0)); CV_Assert(!(dsize == Size()) || (fx > 0 && fy > 0)); CV_Assert(src.cols >= 128 && src.rows >= 128); + CV_Assert((fx - 128.0) <= 0 && (fy - 128.0) <= 0); if (dsize == Size()) dsize = Size(saturate_cast(src.cols * fx), saturate_cast(src.rows * fy)); diff --git a/modules/gpu/test/test_resize.cpp b/modules/gpu/test/test_resize.cpp index 5e03786828..81de33a1bd 100644 --- a/modules/gpu/test/test_resize.cpp +++ b/modules/gpu/test/test_resize.cpp @@ -201,13 +201,13 @@ TEST_P(ResizeArea, Accuracy) cv::Mat gpu; dst.download(gpu); - std::cout //<< src + // std::cout // << src + // // << std::endl << std::endl + // // << gpu_buff + // // << std::endl << std::endl + // << gpu // << std::endl << std::endl - // << gpu_buff - // << std::endl << std::endl - << gpu - << std::endl << std::endl - << dst_cpu<< std::endl; + // << dst_cpu<< std::endl; EXPECT_MAT_NEAR(dst_cpu, dst, src.depth() == CV_32F ? 1e-2 : 1.0); @@ -215,9 +215,9 @@ TEST_P(ResizeArea, Accuracy) INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeArea, testing::Combine( ALL_DEVICES, - testing::Values(cv::Size(640, 10 * 128)),//DIFFERENT_SIZES, + testing::Values(cv::Size(640, 480)),//DIFFERENT_SIZES, testing::Values(MatType(CV_8UC1)/*MatType(CV_8UC3), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)*/), - testing::Values(0.1), + testing::Values(0.05, 0.1), testing::Values(Interpolation(cv::INTER_AREA)), WHOLE_SUBMAT));