diff --git a/modules/gpu/src/cuda/resize.cu b/modules/gpu/src/cuda/resize.cu index a0686d0d56..f3b4135557 100644 --- a/modules/gpu/src/cuda/resize.cu +++ b/modules/gpu/src/cuda/resize.cu @@ -282,27 +282,232 @@ namespace cv { namespace gpu { namespace device template<> struct scan_traits { - typedef int scan_line_type; + typedef float scan_line_type; }; - template - __global__ void resize_area_scan(const Ptr2D src, int fx, int fy, DevMem2D_ dst, DevMem2D_ buffer) +// template +// __global__ void resize_area_scan(const DevMem2D_ src, DevMem2D_ dst, int fx, int fy, DevMem2D_ buffer) +// { +// typedef typename scan_traits::scan_line_type W; +// extern __shared__ W line[]; + +// const int x = threadIdx.x; +// const int y = blockIdx.x; + +// if (y >= src.rows) return; + +// int offset = 1; + +// line[2 * x + 0] = src(y, 2 * x + 0); +// line[2 * x + 1] = src(y, 2 * x + 1); + +// __syncthreads();//??? +// // reduction +// for (int d = blockDim.x; d > 0; d >>= 1) +// { +// __syncthreads(); +// if (x < d) +// { +// int ai = 2 * x * offset -1 + 1 * offset; +// int bi = 2 * x * offset -1 + 2 * offset; +// line[bi] += line[ai]; +// } + +// offset *= 2; +// } + +// __syncthreads(); +// // convolution +// if (x == 0) { line[(blockDim.x << 1) - 1] = 0; printf("offset: %d!!!!!!!!!!!!!\n", fx);} + +// for (int d = 1; d < (blockDim.x << 1); d *= 2) +// { +// offset >>= 1; + +// __syncthreads(); +// if (x < d) +// { +// int ai = offset * 2 * x + 1 * offset - 1; +// int bi = offset * 2 * x + 2 * offset - 1; + +// W t = line[ai]; +// line[ai] = line[bi]; +// line[bi] += t; +// } +// } +// __syncthreads(); + +// // calculate sum +// int start = 0; +// int out_idx = 0; +// int end = start + fx; +// while (start < (blockDim.x << 1) && end < (blockDim.x << 1)) +// { +// buffer(y, out_idx) = saturate_cast((line[end] - line[start]) / fx); +// start = end; +// end = start + fx; +// out_idx++; +// } + +// } + + template + __device__ void scan_y(DevMem2D_::scan_line_type> buffer,int fx, int fy, DevMem2D_ dst, + typename scan_traits::scan_line_type* line, int g_base) + { + typedef typename scan_traits::scan_line_type W; + + const int y = threadIdx.x; + const int x = blockIdx.x; + + float scale = 1.f / (fx * fy); + + if (x >= buffer.cols) return; + + int offset = 1; + line[2 * y + 0] = buffer((g_base * fy) + 2 * y + 1, x); + + if (y != (blockDim.x -1) ) + line[2 * y + 1] = buffer((g_base * fy) + 2 * y + 2, x); + else + line[2 * y + 1] = 0; + + __syncthreads(); + + // reduction + for (int d = blockDim.x; d > 0; d >>= 1) + { + __syncthreads(); + if (y < d) + { + int ai = 2 * y * offset -1 + 1 * offset; + int bi = 2 * y * offset -1 + 2 * offset; + line[bi] += line[ai]; + } + + offset *= 2; + } + + __syncthreads(); + // convolution + if (y == 0) line[(blockDim.x << 1) - 1] = (W)buffer(0, x); + + for (int d = 1; d < (blockDim.x << 1); d *= 2) + { + offset >>= 1; + + __syncthreads(); + if (y < d) + { + int ai = offset * 2 * y + 1 * offset - 1; + int bi = offset * 2 * y + 2 * offset - 1; + + + W t = line[ai]; + line[ai] = line[bi]; + line[bi] += t; + } + } + __syncthreads(); + + if (y < dst.rows) + { + W start = (y == 0)? (W)0:line[y * fy -1]; + W end = line[y * fy + fy - 1]; + dst(g_base + y ,x) = saturate_cast((end - start) * scale); + } + } + + template + __device__ void scan_x(const DevMem2D_ src, int fx, int fy, DevMem2D_::scan_line_type> buffer, + typename scan_traits::scan_line_type* line, int g_base) + { + typedef typename scan_traits::scan_line_type W; + + const int x = threadIdx.x; + const int y = blockIdx.x; + + float scale = 1.f / (fx * fy); + + if (y >= src.rows) return; + + int offset = 1; + + line[2 * x + 0] = (W)src(y, (g_base * fx) + 2 * x + 1); + + if (x != (blockDim.x -1) ) + line[2 * x + 1] = (W)src(y, (g_base * fx) + 2 * x + 2); + else + line[2 * x + 1] = 0; + + __syncthreads(); + + // reduction + for (int d = blockDim.x; d > 0; d >>= 1) + { + __syncthreads(); + if (x < d) + { + int ai = 2 * x * offset -1 + 1 * offset; + int bi = 2 * x * offset -1 + 2 * offset; + line[bi] += line[ai]; + } + + offset *= 2; + } + + __syncthreads(); + // convolution + if (x == 0) line[(blockDim.x << 1) - 1] = (W)src(y, 0); + + for (int d = 1; d < (blockDim.x << 1); d *= 2) + { + offset >>= 1; + + __syncthreads(); + if (x < d) + { + int ai = offset * 2 * x + 1 * offset - 1; + int bi = offset * 2 * x + 2 * offset - 1; + + W t = line[ai]; + line[ai] = line[bi]; + line[bi] += t; + } + } + __syncthreads(); + + if (x < buffer.cols) + { + W start = (x == 0)? (W)0:line[x * fx -1]; + W end = line[x * fx + fx - 1]; + buffer(y, g_base + x) =(end - start); + } + } + + template + __global__ void resize_area_scan_x(const DevMem2D_ src, DevMem2D_ dst, int fx, int fy, DevMem2D_::scan_line_type> buffer) { typedef typename scan_traits::scan_line_type W; extern __shared__ W line[]; + scan_x(src,fx,fy, buffer,line, 0); + } - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; + template + __global__ void resize_area_scan_y(const DevMem2D_ src, DevMem2D_ dst, int fx, int fy, DevMem2D_::scan_line_type> buffer) + { + typedef typename scan_traits::scan_line_type W; + extern __shared__ W line[]; + scan_y(buffer,fx, fy, dst, line, 0); } template struct InterAreaDispatcherStream { - static void call(DevMem2D_ src, int fx, int fy, DevMem2D_ dst, DevMem2D_ buffer, cudaStream_t stream) + static void call(const DevMem2D_ src, int fx, int fy, DevMem2D_ dst, DevMem2D_::scan_line_type> buffer, cudaStream_t stream) { - dim3 block(256, 1); - dim3 grid(divUp(dst.cols, block.x), 1); + resize_area_scan_x<<> 1), src.cols * sizeof(typename scan_traits::scan_line_type) >>>(src, dst, fx, fy, buffer); - resize_area_scan<<::scan_line_type) >>>(src, fx, fy, dst, buffer); + resize_area_scan_y<<> 1), src.rows * sizeof(typename scan_traits::scan_line_type) >>>(src, dst, fx, fy, buffer); cudaSafeCall( cudaGetLastError() ); if (stream == 0) @@ -311,8 +516,8 @@ namespace cv { namespace gpu { namespace device }; template - void resize_area_gpu(DevMem2Db src, DevMem2Db dst,float fx, float fy, - int interpolation, DevMem2Db buffer, cudaStream_t stream) + void resize_area_gpu(const DevMem2Db src, DevMem2Db dst,float fx, float fy, + int interpolation, DevMem2Df buffer, cudaStream_t stream) { (void)interpolation; @@ -322,7 +527,7 @@ namespace cv { namespace gpu { namespace device InterAreaDispatcherStream::call(src, iscale_x, iscale_y, dst, buffer, stream); } - template void resize_area_gpu(DevMem2Db src, DevMem2Db dst, float fx, float fy, int interpolation, DevMem2Db buffer, cudaStream_t stream); + template void resize_area_gpu(DevMem2Db src, DevMem2Db dst, float fx, float fy, int interpolation, DevMem2Df buffer, cudaStream_t stream); } // namespace imgproc }}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/resize.cpp b/modules/gpu/src/resize.cpp index 940b9c2d5f..7718097a1c 100644 --- a/modules/gpu/src/resize.cpp +++ b/modules/gpu/src/resize.cpp @@ -82,8 +82,8 @@ namespace cv { namespace gpu { namespace device DevMem2Db dst, int interpolation, cudaStream_t stream); template - void resize_area_gpu(DevMem2Db src, DevMem2Db dst,float fx, float fy, - int interpolation, DevMem2Db buffer, cudaStream_t stream); + void resize_area_gpu(const DevMem2Db src, DevMem2Db dst,float fx, float fy, + int interpolation, DevMem2Df buffer, cudaStream_t stream); } }}} @@ -107,7 +107,7 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, GpuMat& buffer, fy = static_cast(1.0 / fy); dst.create(dsize, src.type()); - buffer.create(cv::Size(dsize.width, src.rows), src.type()); + buffer.create(cv::Size(dsize.width, src.rows), CV_32FC1); if (dsize == src.size()) { diff --git a/modules/gpu/test/test_resize.cpp b/modules/gpu/test/test_resize.cpp index 22d7ba3fd7..879e5c0077 100644 --- a/modules/gpu/test/test_resize.cpp +++ b/modules/gpu/test/test_resize.cpp @@ -40,6 +40,7 @@ //M*/ #include "precomp.hpp" +#include #ifdef HAVE_CUDA @@ -186,19 +187,37 @@ TEST_P(ResizeArea, Accuracy) cv::Mat src = randomMat(size, type); cv::gpu::GpuMat dst = createMat(cv::Size(cv::saturate_cast(src.cols * coeff), cv::saturate_cast(src.rows * coeff)), type, useRoi); - cv::gpu::resize(loadMat(src, useRoi), dst, cv::Size(), coeff, coeff, interpolation); + cv::gpu::GpuMat buffer = createMat(cv::Size(dst.cols, src.rows), CV_32FC1); + + cv::gpu::resize(loadMat(src, useRoi), dst, cv::Size(), buffer, coeff, coeff, interpolation); cv::Mat dst_cpu; + cv::resize(src, dst_cpu, cv::Size(), coeff, coeff, interpolation); +// cv::Mat gpu_buff; +// buffer.download(gpu_buff); + +// cv::Mat gpu; +// dst.download(gpu); + +// std::cout << src +// << std::endl << std::endl +// << gpu_buff +// << std::endl << std::endl +// << gpu +// << std::endl << std::endl +// << dst_cpu<< std::endl; + + EXPECT_MAT_NEAR(dst_cpu, dst, src.depth() == CV_32F ? 1e-2 : 1.0); } INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeArea, testing::Combine( ALL_DEVICES, - DIFFERENT_SIZES, - testing::Values(MatType(CV_8UC3), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)), - testing::Values(0.3, 0.5), + testing::Values(cv::Size(512, 256)),//DIFFERENT_SIZES, + testing::Values(MatType(CV_8UC1)/*MatType(CV_8UC3), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)*/), + testing::Values(0.5), testing::Values(Interpolation(cv::INTER_AREA)), WHOLE_SUBMAT));