From 2ec1d1c986a139d399698bb9511d6033fa79798c Mon Sep 17 00:00:00 2001 From: kallaballa Date: Mon, 13 Nov 2023 09:48:48 +0100 Subject: [PATCH] use precomputed coefficients --- modules/imgproc/src/opencl/resize.cl | 106 ++++++++++++++++++--------- modules/imgproc/src/resize.cpp | 42 ++++++++++- 2 files changed, 112 insertions(+), 36 deletions(-) diff --git a/modules/imgproc/src/opencl/resize.cl b/modules/imgproc/src/opencl/resize.cl index 3cbd83db5b..197af5bc63 100644 --- a/modules/imgproc/src/opencl/resize.cl +++ b/modules/imgproc/src/opencl/resize.cl @@ -128,6 +128,21 @@ __kernel void resizeSampler(__read_only image2d_t srcImage, #elif defined INTER_LINEAR_INTEGER +#define FIXED_POINT_BITS 16 +#define FIXED_POINT_SCALE (1 << FIXED_POINT_BITS) + +// Fixed-point multiply +#define FIXED_MUL(a, b) (((a) * (b)) >> FIXED_POINT_BITS) + +// Rounding methods +#define ROUND_NEAREST_EVEN 0 +#define ROUND_DOWN 1 +#define ROUND_UP 2 +#define TRUNCATE 3 + +// Choose rounding method +#define ROUNDING_METHOD ROUND_NEAREST_EVEN + __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols, __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, __global const uchar * buffer) @@ -162,6 +177,9 @@ __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offs } } + + + #elif defined INTER_LINEAR __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols, @@ -222,55 +240,77 @@ __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offs #elif defined INTER_LINEAR_EXACT -#define FIXED_POINT_BITS 8 +#define FIXED_POINT_BITS 16 #define FIXED_POINT_SCALE (1 << FIXED_POINT_BITS) // Fixed-point multiply #define FIXED_MUL(a, b) (((a) * (b)) >> FIXED_POINT_BITS) +// Rounding methods +#define ROUND_NEAREST_EVEN 0 +#define ROUND_DOWN 1 +#define ROUND_UP 2 +#define TRUNCATE 3 + +// Choose rounding method +#define ROUNDING_METHOD 
ROUND_NEAREST_EVEN + __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols, __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, - int ifx, int ify) + __global const int * xofs, __global const int * yofs, + __global const short * ialpha, __global const short * ibeta) { int dx = get_global_id(0); int dy = get_global_id(1); - if (dx < dst_cols && dy < dst_rows) + if (dx >= dst_cols || dy >= dst_rows) { - // Calculate source coordinates - int sx = (dx * ifx) >> 16; - int sy = (dy * ify) >> 16; - - // Perform boundary checks - sx = clamp(sx, 0, src_cols - 1); - sy = clamp(sy, 0, src_rows - 1); - - // Calculate interpolation coefficients - int u = (dx * ifx) & 0xFFFF; - int v = (dy * ify) & 0xFFFF; - - int U = (0x10000 - u) >> 8; - int V = (0x10000 - v) >> 8; - int U1 = u >> 8; - int V1 = v >> 8; - - // Load pixel values - WT data0 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx, TSIZE, src_offset)))); - WT data1 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(INC(sx, src_cols), TSIZE, src_offset)))); - WT data2 = convertToWT(loadpix(srcptr + mad24(INC(sy, src_rows), src_step, mad24(sx, TSIZE, src_offset)))); - WT data3 = convertToWT(loadpix(srcptr + mad24(INC(sy, src_rows), src_step, mad24(INC(sx, src_cols), TSIZE, src_offset)))); - - // Perform fixed-point interpolation - WT val = mul24((WT)mul24(U1, V1), data0) + mul24((WT)mul24(U, V1), data1) + - mul24((WT)mul24(U1, V), data2) + mul24((WT)mul24(U, V), data3); - - // Convert and store the result - T uval = convertToDT((val + 2) >> 2); - storepix(uval, dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset))); + return; // Exit if dx or dy is out of bounds } + + // Calculate source coordinates + int sx = (dx * ifx) >> 16; + int sy = (dy * ify) >> 16; + + // Perform boundary checks + sx = clamp(sx, 0, src_cols - 1); + sy = clamp(sy, 0, src_rows - 1); + + // Calculate interpolation coefficients + int u = (dx * 
ifx) & 0xFFFF; + int v = (dy * ify) & 0xFFFF; + + int U = (0x10000 - u) >> 8; + int V = (0x10000 - v) >> 8; + int U1 = u >> 8; + int V1 = v >> 8; + + // Load pixel values + WT data0 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx, TSIZE, src_offset)))); + WT data1 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(INC(sx, src_cols), TSIZE, src_offset)))); + WT data2 = convertToWT(loadpix(srcptr + mad24(INC(sy, src_rows), src_step, mad24(sx, TSIZE, src_offset)))); + WT data3 = convertToWT(loadpix(srcptr + mad24(INC(sy, src_rows), src_step, mad24(INC(sx, src_cols), TSIZE, src_offset)))); + + // Perform fixed-point interpolation + WT val = mul24((WT)mul24(U1, V1), data0) + mul24((WT)mul24(U, V1), data1) + + mul24((WT)mul24(U1, V), data2) + mul24((WT)mul24(U, V), data3); + + // Convert and store the result +#if ROUNDING_METHOD == ROUND_NEAREST_EVEN + T uval = convertToDT((val + FIXED_POINT_SCALE / 2) >> FIXED_POINT_BITS); +#elif ROUNDING_METHOD == ROUND_DOWN + T uval = convertToDT(val >> FIXED_POINT_BITS); +#elif ROUNDING_METHOD == ROUND_UP + T uval = convertToDT((val + FIXED_POINT_SCALE - 1) >> FIXED_POINT_BITS); +#elif ROUNDING_METHOD == TRUNCATE + T uval = convertToDT(val >> FIXED_POINT_BITS); +#endif + storepix(uval, dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset))); } + + #elif defined INTER_NEAREST __kernel void resizeNN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols, diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp index 1226831c21..ac68a76c37 100644 --- a/modules/imgproc/src/resize.cpp +++ b/modules/imgproc/src/resize.cpp @@ -3495,9 +3495,45 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize, } } else if (interpolation == INTER_LINEAR_EXACT) { + AutoBuffer _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2)); + int* xofs = (int*)_buffer.data(), * yofs = xofs + dsize.width; + short* ialpha = (short*)(yofs + dsize.height), * 
ibeta = ialpha + dsize.width*2; + float fxx, fyy; + int sx, sy; + + for (int dx = 0; dx < dsize.width; dx++) + { + fxx = (float)((dx+0.5)*inv_fx - 0.5); + sx = cvFloor(fxx); + fxx -= sx; + + if (sx < 0) + fxx = 0, sx = 0; + + if (sx >= ssize.width-1) + fxx = 0, sx = ssize.width-1; + + xofs[dx] = sx; + ialpha[dx*2 + 0] = saturate_cast((1.f - fxx) * INTER_RESIZE_COEF_SCALE); + ialpha[dx*2 + 1] = saturate_cast(fxx * INTER_RESIZE_COEF_SCALE); + } + + for (int dy = 0; dy < dsize.height; dy++) + { + fyy = (float)((dy+0.5)*inv_fy - 0.5); + sy = cvFloor(fyy); + fyy -= sy; + + yofs[dy] = sy; + ibeta[dy*2 + 0] = saturate_cast((1.f - fyy) * INTER_RESIZE_COEF_SCALE); + ibeta[dy*2 + 1] = saturate_cast(fyy * INTER_RESIZE_COEF_SCALE); + } + + int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn); + UMat coeffs; + Mat(1, static_cast(_buffer.size()), CV_8UC1, _buffer.data()).copyTo(coeffs); + char buf[2][50]; - int wdepth = depth <= CV_8S ? CV_32S : std::max(depth, CV_32F); - int wtype = CV_MAKETYPE(wdepth, cn); k.create("resizeLN", ocl::imgproc::resize_oclsrc, format("-D INTER_LINEAR_EXACT -D depth=%d -D T=%s -D T1=%s " "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d " @@ -3510,7 +3546,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize, return false; k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), - (float)inv_fx, (float)inv_fy); + ocl::KernelArg::PtrReadOnly(coeffs)); } else if (interpolation == INTER_NEAREST) {