diff --git a/modules/gpu/src/cuda/resize.cu b/modules/gpu/src/cuda/resize.cu index 6ecb7eb8ba..dc9f462d16 100644 --- a/modules/gpu/src/cuda/resize.cu +++ b/modules/gpu/src/cuda/resize.cu @@ -52,368 +52,431 @@ namespace cv { namespace gpu { namespace device { - namespace imgproc + // kernels + + template __global__ void resize_nearest(const PtrStep src, PtrStepSz dst, const float fy, const float fx) { - template __global__ void resize_nearest(const PtrStep src, const float fx, const float fy, PtrStepSz dst) + const int dst_x = blockDim.x * blockIdx.x + threadIdx.x; + const int dst_y = blockDim.y * blockIdx.y + threadIdx.y; + + if (dst_x < dst.cols && dst_y < dst.rows) { - const int dst_x = blockDim.x * blockIdx.x + threadIdx.x; - const int dst_y = blockDim.y * blockIdx.y + threadIdx.y; + const float src_x = dst_x * fx; + const float src_y = dst_y * fy; - if (dst_x < dst.cols && dst_y < dst.rows) - { - const float src_x = dst_x * fx; - const float src_y = dst_y * fy; + dst(dst_y, dst_x) = src(__float2int_rz(src_y), __float2int_rz(src_x)); + } + } - dst(dst_y, dst_x) = src(__float2int_rz(src_y), __float2int_rz(src_x)); - } + template __global__ void resize_linear(const PtrStepSz src, PtrStepSz dst, const float fy, const float fx) + { + typedef typename TypeVec::cn>::vec_type work_type; + + const int dst_x = blockDim.x * blockIdx.x + threadIdx.x; + const int dst_y = blockDim.y * blockIdx.y + threadIdx.y; + + if (dst_x < dst.cols && dst_y < dst.rows) + { + const float src_x = dst_x * fx; + const float src_y = dst_y * fy; + + work_type out = VecTraits::all(0); + + const int x1 = __float2int_rd(src_x); + const int y1 = __float2int_rd(src_y); + const int x2 = x1 + 1; + const int y2 = y1 + 1; + const int x2_read = ::min(x2, src.cols - 1); + const int y2_read = ::min(y2, src.rows - 1); + + T src_reg = src(y1, x1); + out = out + src_reg * ((x2 - src_x) * (y2 - src_y)); + + src_reg = src(y1, x2_read); + out = out + src_reg * ((src_x - x1) * (y2 - src_y)); + + src_reg = src(y2_read, x1); + out = out + src_reg * ((x2 - src_x) * (src_y - y1)); + + src_reg = src(y2_read, x2_read); + out = out + src_reg * ((src_x - x1) * (src_y - y1)); + + dst(dst_y, dst_x) = saturate_cast(out); + } + } + + template __global__ void resize(const Ptr2D src, PtrStepSz dst, const float fy, const float fx) + { + const int dst_x = blockDim.x * blockIdx.x + threadIdx.x; + const int dst_y = blockDim.y * blockIdx.y + threadIdx.y; + + if (dst_x < dst.cols && dst_y < dst.rows) + { + const float src_x = dst_x * fx; + const float src_y = dst_y * fy; + + dst(dst_y, dst_x) = src(src_y, src_x); + } + } + + template __global__ void resize_area(const Ptr2D src, PtrStepSz dst) + { + const int x = blockDim.x * blockIdx.x + threadIdx.x; + const int y = blockDim.y * blockIdx.y + threadIdx.y; + + if (x < dst.cols && y < dst.rows) + { + dst(y, x) = src(y, x); + } + } + + // textures + + template struct TextureAccessor; + + #define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \ + texture tex_resize_##type (0, cudaFilterModePoint, cudaAddressModeClamp); \ + template <> struct TextureAccessor \ + { \ + typedef type elem_type; \ + typedef int index_type; \ + int xoff; \ + int yoff; \ + __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \ + { \ + return tex2D(tex_resize_##type, x + xoff, y + yoff); \ + } \ + __host__ static void bind(const PtrStepSz& mat) \ + { \ + bindTexture(&tex_resize_##type, mat); \ + } \ + }; + + OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar) + OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4) + + OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort) + OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4) + + OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short) + OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4) + + OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float) + OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4) + + #undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX + + template + TextureAccessor texAccessor(const PtrStepSz& mat, int yoff, int xoff) + { + TextureAccessor::bind(mat); + + TextureAccessor t; + t.xoff = xoff; + t.yoff = yoff; + + return t; + } + + // callers for nearest interpolation + + template + void call_resize_nearest_glob(const PtrStepSz& src, const PtrStepSz& dst, float fy, float fx, cudaStream_t stream) + { + const dim3 block(32, 8); + const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + + resize_nearest<<>>(src, dst, fy, fx); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + template + void call_resize_nearest_tex(const PtrStepSz& src, const PtrStepSz& srcWhole, int yoff, int xoff, const PtrStepSz& dst, float fy, float fx) + { + const dim3 block(32, 8); + const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + + resize<<>>(texAccessor(srcWhole, yoff, xoff), dst, fy, fx); + cudaSafeCall( cudaGetLastError() ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + + // callers for linear interpolation + + template + void call_resize_linear_glob(const PtrStepSz& src, const PtrStepSz& dst, float fy, float fx, cudaStream_t stream) + { + const dim3 block(32, 8); + const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + + resize_linear<<>>(src, dst, fy, fx); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + template + void call_resize_linear_tex(const PtrStepSz& src, const PtrStepSz& srcWhole, int yoff, int xoff, const PtrStepSz& dst, float fy, float fx) + { + const dim3 block(32, 8); + const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + + if (srcWhole.data == src.data) + { + TextureAccessor texSrc = texAccessor(src, 0, 0); + LinearFilter< TextureAccessor > filteredSrc(texSrc); + + resize<<>>(filteredSrc, dst, fy, fx); + } + else + { + TextureAccessor texSrc = texAccessor(srcWhole, yoff, xoff); + + BrdReplicate brd(src.rows, src.cols); + BorderReader, BrdReplicate > brdSrc(texSrc, brd); + LinearFilter< BorderReader, BrdReplicate > > filteredSrc(brdSrc); + + resize<<>>(filteredSrc, dst, fy, fx); } - template __global__ void resize_linear(const PtrStepSz src, const float fx, const float fy, PtrStepSz dst) + cudaSafeCall( cudaGetLastError() ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + + // callers for cubic interpolation + + template + void call_resize_cubic_glob(const PtrStepSz& src, const PtrStepSz& dst, float fy, float fx, cudaStream_t stream) + { + const dim3 block(32, 8); + const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + + BrdReplicate brd(src.rows, src.cols); + BorderReader< PtrStep, BrdReplicate > brdSrc(src, brd); + CubicFilter< BorderReader< PtrStep, BrdReplicate > > filteredSrc(brdSrc); + + resize<<>>(filteredSrc, dst, fy, fx); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + template + void call_resize_cubic_tex(const PtrStepSz& src, const PtrStepSz& srcWhole, int yoff, int xoff, const PtrStepSz& dst, float fy, float fx) + { + const dim3 block(32, 8); + const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + + if (srcWhole.data == src.data) { - typedef typename TypeVec::cn>::vec_type work_type; + TextureAccessor texSrc = texAccessor(src, 0, 0); + CubicFilter< TextureAccessor > filteredSrc(texSrc); - const int dst_x = blockDim.x * blockIdx.x + threadIdx.x; - const int dst_y = blockDim.y * blockIdx.y + threadIdx.y; + resize<<>>(filteredSrc, dst, fy, fx); + } + else + { + TextureAccessor texSrc = texAccessor(srcWhole, yoff, xoff); - if (dst_x < dst.cols && dst_y < dst.rows) - { - const float src_x = dst_x * fx; - const float src_y = dst_y * fy; + BrdReplicate brd(src.rows, src.cols); + BorderReader, BrdReplicate > brdSrc(texSrc, brd); + CubicFilter< BorderReader, BrdReplicate > > filteredSrc(brdSrc); - work_type out = VecTraits::all(0); - - const int x1 = __float2int_rd(src_x); - const int y1 = __float2int_rd(src_y); - const int x2 = x1 + 1; - const int y2 = y1 + 1; - const int x2_read = ::min(x2, src.cols - 1); - const int y2_read = ::min(y2, src.rows - 1); - - T src_reg = src(y1, x1); - out = out + src_reg * ((x2 - src_x) * (y2 - src_y)); - - src_reg = src(y1, x2_read); - out = out + src_reg * ((src_x - x1) * (y2 - src_y)); - - src_reg = src(y2_read, x1); - out = out + src_reg * ((x2 - src_x) * (src_y - y1)); - - src_reg = src(y2_read, x2_read); - out = out + src_reg * ((src_x - x1) * (src_y - y1)); - - dst(dst_y, dst_x) = saturate_cast(out); - } + resize<<>>(filteredSrc, dst, fy, fx); } - template __global__ void resize(const Ptr2D src, const float fx, const float fy, PtrStepSz dst) + cudaSafeCall( cudaGetLastError() ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + + // ResizeNearestDispatcher + + template struct ResizeNearestDispatcher + { + static void call(const PtrStepSz& src, const PtrStepSz& srcWhole, int yoff, int xoff, const PtrStepSz& dst, float fy, float fx, cudaStream_t stream) { - const int dst_x = blockDim.x * blockIdx.x + threadIdx.x; - const int dst_y = blockDim.y * blockIdx.y + threadIdx.y; - - if (dst_x < dst.cols && dst_y < dst.rows) - { - const float src_x = dst_x * fx; - const float src_y = dst_y * fy; - - dst(dst_y, dst_x) = src(src_y, src_x); - } + call_resize_nearest_glob(src, dst, fy, fx, stream); } + }; - template