ocl_resize: move coefficient calculation for INTER_LINEAR_EXACT from C++ to OpenCL

This commit is contained in:
kallaballa 2023-11-17 10:15:31 +01:00
parent 6dd905eb53
commit 672e06c408
2 changed files with 83 additions and 65 deletions

View File

@ -222,40 +222,75 @@ __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offs
#elif defined INTER_LINEAR_EXACT
__kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
__global const uchar * buffer)
{
// precomputeCoeffs: for work-item (dx, dy), derives a source index and a
// fixed-point weight for destination column dx and row dy, and stores them in
// coeffsTex.
//   src        - only queried for its width/height in this kernel
//   coeffsTex  - receives (index, weight, 0, 0) per column and per row
//   inv_fx/fy  - scale factors applied to the destination coordinates
// NOTE(review): this span looks like an unmarked diff rendering. Statements
// using buffer, xofs/yofs, srcptr, dstptr, src_step, src_rows and
// dst_cols/dst_rows reference names that are not parameters of this kernel and
// presumably belong to the previous buffer-based resizeLN body -- confirm
// against the actual file before relying on this listing.
__kernel void precomputeCoeffs(__read_only image2d_t src, __write_only image2d_t coeffsTex, float inv_fx, float inv_fy) {
int dx = get_global_id(0);
int dy = get_global_id(1);
int sx, sy;
float fxx, fyy;
short ialpha, ibeta;
if (dx < dst_cols && dy < dst_rows)
{
__global const int * xofs = (__global const int *)(buffer), * yofs = xofs + dst_cols;
__global const short * ialpha = (__global const short *)(yofs + dst_rows);
__global const short * ibeta = ialpha + ((dst_cols + dy) << 1);
ialpha += dx << 1;
// Destination pixel centre mapped to source coordinates. The 0.5 literals are
// double-precision. NOTE(review): inv_fx/inv_fy are halved here, and the host
// call shown in the same commit already passes inv_fx * 0.5 / inv_fy * 0.5 --
// confirm the factor is not applied twice.
fxx = (float)((dx + 0.5) * inv_fx * 0.5 - 0.5);
fyy = (float)((dy + 0.5) * inv_fy * 0.5 - 0.5);
int sx0 = xofs[dx], sy0 = clamp(yofs[dy], 0, src_rows - 1),
sy1 = clamp(yofs[dy] + 1, 0, src_rows - 1);
short a0 = ialpha[0], a1 = ialpha[1];
short b0 = ibeta[0], b1 = ibeta[1];
// Integer source column and fractional remainder; out-of-range columns are
// clamped to [0, width - 1] and their fraction is zeroed.
sx = floor(fxx);
fxx -= sx;
if (sx < 0)
fxx = 0, sx = 0;
if (sx >= get_image_width(src) - 1)
fxx = 0, sx = get_image_width(src) - 1;
int src_index0 = mad24(sy0, src_step, mad24(sx0, TSIZE, src_offset)),
src_index1 = mad24(sy1, src_step, mad24(sx0, TSIZE, src_offset));
WT data0 = convertToWT(loadpix(srcptr + src_index0));
WT data1 = convertToWT(loadpix(srcptr + src_index0 + TSIZE));
WT data2 = convertToWT(loadpix(srcptr + src_index1));
WT data3 = convertToWT(loadpix(srcptr + src_index1 + TSIZE));
// Same split and clamp for the source row, against the image height.
sy = floor(fyy);
fyy -= sy;
if (sy < 0)
fyy = 0, sy = 0;
if (sy >= get_image_height(src) - 1)
fyy = 0, sy = get_image_height(src) - 1;
WT val = ( (((data0 * a0 + data1 * a1) >> 4) * b0) >> 16) +
( (((data2 * a0 + data3 * a1) >> 4) * b1) >> 16);
// Weight (1 - fraction) scaled by INTER_RESIZE_COEF_SCALE, converted to short
// with saturation and round-to-nearest-even.
ialpha = convert_short_sat_rte((1.f - fxx) * INTER_RESIZE_COEF_SCALE);
ibeta = convert_short_sat_rte((1.f - fyy) * INTER_RESIZE_COEF_SCALE);
storepix(convertToDT((val + 2) >> 2),
dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));
}
// Column entry goes to texel (dx, 0), row entry to texel (0, dy).
// NOTE(review): both writes target texel (0, 0) when dx == 0 and dy == 0
// respectively, so the column-0 and row-0 entries share one location.
write_imagei(coeffsTex, (int2)(dx, 0), (int4)(sx, ialpha, 0, 0));
write_imagei(coeffsTex, (int2)(0, dy), (int4)(sy, ibeta, 0, 0));
}
/*
 * resizeLN (image-based INTER_LINEAR_EXACT path): blends four texels of src
 * into one texel of dst per work-item, using an index and a fixed-point
 * weight per column and per row read from coeffsTex.
 *
 *   src       - image the four neighbouring texels are sampled from
 *   dst       - image that receives one blended texel per work-item
 *   coeffsTex - texel (dx, 0): .x = source column index, .y = first column weight
 *               texel (0, dy): .x = source row index,    .y = first row weight
 *
 * Work-item (get_global_id(0), get_global_id(1)) is used as (dx, dy); the
 * kernel has no bounds guard of its own.
 *
 * NOTE(review): cn, src_stride, dst_offset and INTER_RESIZE_COEF_SCALE are not
 * declared inside this kernel; they must come from build options or from
 * definitions outside this span -- confirm they exist.
 * NOTE(review): the column and row entries are both read from texel (0, 0)
 * when dx == 0 / dy == 0 -- confirm the coefficient layout is intended.
 */
__kernel void resizeLN(__read_only image2d_t src, __write_only image2d_t dst, __read_only image2d_t coeffsTex) {
    const int dx = get_global_id(0);
    const int dy = get_global_id(1);

    /* One fetch per coefficient texel; .x is the source index, .y the weight. */
    const int4 colEntry = read_imagei(coeffsTex, (int2)(dx, 0));
    const int4 rowEntry = read_imagei(coeffsTex, (int2)(0, dy));

    /* The second weight of each pair is the complement to the fixed-point scale. */
    const short wx0 = colEntry.y;
    const short wx1 = INTER_RESIZE_COEF_SCALE - wx0;
    const short wy0 = rowEntry.y;
    const short wy1 = INTER_RESIZE_COEF_SCALE - wy0;

    /* NOTE(review): indices are scaled by cn / src_stride before being used as
     * image coordinates -- confirm this matches how src is laid out. */
    const int xLeft = colEntry.x * cn;
    const int xRight = xLeft + cn;
    const int yTop = rowEntry.x * src_stride;
    const int yBottom = yTop + src_stride;

    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

    const float4 topLeft = convert_float4(read_imagei(src, sampler, (int2)(xLeft, yTop)));
    const float4 topRight = convert_float4(read_imagei(src, sampler, (int2)(xRight, yTop)));
    const float4 bottomLeft = convert_float4(read_imagei(src, sampler, (int2)(xLeft, yBottom)));
    const float4 bottomRight = convert_float4(read_imagei(src, sampler, (int2)(xRight, yBottom)));

    /* Horizontal blend of each row pair, then vertical blend of the two rows. */
    float4 blended = (topLeft * wx0 + topRight * wx1) * wy0 + (bottomLeft * wx0 + bottomRight * wx1) * wy1;

    /* Remove the two fixed-point scale factors introduced by the weights. */
    blended = blended * (1.0f / (INTER_RESIZE_COEF_SCALE * INTER_RESIZE_COEF_SCALE));

    write_imagei(dst, (int2)(dx + dst_offset, dy + dst_offset), convert_int4_sat_rte(blended));
}
#elif defined INTER_NEAREST

View File

@ -3494,47 +3494,14 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
}
}
else if (interpolation == INTER_LINEAR_EXACT) {
AutoBuffer<uchar> _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2));
int* xofs = (int*)_buffer.data(), * yofs = xofs + dsize.width;
short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2;
float fxx, fyy;
int sx, sy;
for (int dx = 0; dx < dsize.width; dx++)
{
fxx = (float)((dx+0.5)*inv_fx - 0.5);
sx = cvFloor(fxx);
fxx -= sx;
if (sx < 0)
fxx = 0, sx = 0;
if (sx >= ssize.width-1)
fxx = 0, sx = ssize.width-1;
xofs[dx] = sx;
ialpha[dx*2 + 0] = saturate_cast<short>((1.f - fxx) * INTER_RESIZE_COEF_SCALE);
ialpha[dx*2 + 1] = saturate_cast<short>(fxx * INTER_RESIZE_COEF_SCALE);
}
for (int dy = 0; dy < dsize.height; dy++)
{
fyy = (float)((dy+0.5)*inv_fy - 0.5);
sy = cvFloor(fyy);
fyy -= sy;
yofs[dy] = sy;
ibeta[dy*2 + 0] = saturate_cast<short>((1.f - fyy) * INTER_RESIZE_COEF_SCALE);
ibeta[dy*2 + 1] = saturate_cast<short>(fyy * INTER_RESIZE_COEF_SCALE);
}
int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
UMat coeffs;
Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, _buffer.data()).copyTo(coeffs);
char buf[2][50];
k.create("resizeLN", ocl::imgproc::resize_oclsrc,
format("-D INTER_LINEAR_EXACT -D depth=%d -D T=%s -D T1=%s "
// Precompute the coefficients and store them in a texture
UMat coeffsTex(dsize.height + dsize.width, 2, CV_16SC1);
ocl::KernelArg coeffsTexArg = ocl::KernelArg::WriteOnlyNoSize(coeffsTex);
k.create("precomputeCoeffs", ocl::imgproc::resize_oclsrc,
format("-D PRECOMPUTE_COEFFS -D depth=%d -D T=%s -D T1=%s "
"-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
"-D INTER_RESIZE_COEF_BITS=%d",
depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
@ -3543,9 +3510,25 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
cn, INTER_RESIZE_COEF_BITS));
if (k.empty())
return false;
k.args(ocl::KernelArg::ReadOnlyNoSize(src), coeffsTexArg, (float)inv_fx * 0.5, (float)inv_fy * 0.5);
size_t globalThreads[2] = { (size_t)dsize.width, (size_t)dsize.height };
if (!k.run(2, globalThreads, NULL, false))
return false;
k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
ocl::KernelArg::PtrReadOnly(coeffs));
// Use the texture in the interpolation kernel
k.create("resizeLN", ocl::imgproc::resize_oclsrc,
format("-D INTER_LINEAR_EXACT -D depth=%d -D T=%s -D T1=%s "
"-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
"-D INTER_RESIZE_COEF_BITS=%d -D USE_TEXTURE",
depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
ocl::convertTypeStr(depth, wdepth, cn, buf[0], sizeof(buf[0])),
ocl::convertTypeStr(wdepth, depth, cn, buf[1], sizeof(buf[1])),
cn, INTER_RESIZE_COEF_BITS));
if (k.empty())
return false;
k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), coeffsTexArg);
if (!k.run(2, globalThreads, NULL, false))
return false;
}
else if (interpolation == INTER_NEAREST)
{