mirror of
https://github.com/opencv/opencv.git
synced 2024-11-28 21:20:18 +08:00
increased number of rows per work-item
This commit is contained in:
parent
45f21e4f12
commit
c41a134394
@ -3582,7 +3582,9 @@ private:
|
||||
static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, InputArray _map2,
|
||||
int interpolation, int borderType, const Scalar& borderValue)
|
||||
{
|
||||
int cn = _src.channels(), type = _src.type(), depth = _src.depth();
|
||||
const ocl::Device & dev = ocl::Device::getDefault();
|
||||
int cn = _src.channels(), type = _src.type(), depth = _src.depth(),
|
||||
rowsPerWI = dev.isIntel() ? 4 : 1;
|
||||
|
||||
if (borderType == BORDER_TRANSPARENT || !(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST)
|
||||
|| _map1.type() == CV_16SC1 || _map2.type() == CV_16SC1)
|
||||
@ -3619,12 +3621,14 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input
|
||||
static const char * const interMap[] = { "INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC", "INTER_LINEAR", "INTER_LANCZOS" };
|
||||
static const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
|
||||
"BORDER_REFLECT_101", "BORDER_TRANSPARENT" };
|
||||
String buildOptions = format("-D %s -D %s -D T=%s", interMap[interpolation], borderMap[borderType], ocl::typeToStr(type));
|
||||
String buildOptions = format("-D %s -D %s -D T=%s -D rowsPerWI=%d",
|
||||
interMap[interpolation], borderMap[borderType],
|
||||
ocl::typeToStr(type), rowsPerWI);
|
||||
|
||||
if (interpolation != INTER_NEAREST)
|
||||
{
|
||||
char cvt[3][40];
|
||||
int wdepth = std::max(CV_32F, dst.depth());
|
||||
int wdepth = std::max(CV_32F, depth);
|
||||
buildOptions = buildOptions
|
||||
+ format(" -D WT=%s -D convertToT=%s -D convertToWT=%s"
|
||||
" -D convertToWT2=%s -D WT2=%s",
|
||||
@ -3653,7 +3657,7 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input
|
||||
else
|
||||
k.args(srcarg, dstarg, map1arg, ocl::KernelArg::ReadOnlyNoSize(map2), scalararg);
|
||||
|
||||
size_t globalThreads[2] = { dst.cols, dst.rows };
|
||||
size_t globalThreads[2] = { dst.cols, (dst.rows + rowsPerWI - 1) / rowsPerWI };
|
||||
return k.run(2, globalThreads, NULL, false);
|
||||
}
|
||||
|
||||
|
@ -147,16 +147,21 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
|
||||
ST nVal)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
int y = get_global_id(1) * rowsPerWI;
|
||||
|
||||
if (x < dst_cols)
|
||||
{
|
||||
T scalar = convertScalar(nVal);
|
||||
|
||||
if (x < dst_cols && y < dst_rows)
|
||||
{
|
||||
int map1_index = mad24(y, map1_step, x * (int)sizeof(float) + map1_offset);
|
||||
int map2_index = mad24(y, map2_step, x * (int)sizeof(float) + map2_offset);
|
||||
int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset);
|
||||
int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(float), map1_offset));
|
||||
int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(float), map2_offset));
|
||||
int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < rowsPerWI; ++i, ++y,
|
||||
map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
|
||||
if (y < dst_rows)
|
||||
{
|
||||
__global const float * map1 = (__global const float *)(map1ptr + map1_index);
|
||||
__global const float * map2 = (__global const float *)(map2ptr + map2_index);
|
||||
__global T * dst = (__global T *)(dstptr + dst_index);
|
||||
@ -175,10 +180,11 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
|
||||
}
|
||||
else
|
||||
{
|
||||
int src_index = mad24(gy, src_step, gx * TSIZE + src_offset);
|
||||
int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));
|
||||
storepix(loadpix((__global const T*)(srcptr + src_index)), dst);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
|
||||
@ -187,15 +193,19 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o
|
||||
ST nVal)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
int y = get_global_id(1) * rowsPerWI;
|
||||
|
||||
T scalar = convertScalar(nVal);
|
||||
|
||||
if (x < dst_cols && y < dst_rows)
|
||||
if (x < dst_cols)
|
||||
{
|
||||
int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset);
|
||||
int map_index = mad24(y, map_step, x * (int)sizeof(float2) + map_offset);
|
||||
T scalar = convertScalar(nVal);
|
||||
int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
|
||||
int map_index = mad24(y, map_step, mad24(x, (int)sizeof(float2), map_offset));
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < rowsPerWI; ++i, ++y,
|
||||
map_index += map_step, dst_index += dst_step)
|
||||
if (y < dst_rows)
|
||||
{
|
||||
__global const float2 * map = (__global const float2 *)(mapptr + map_index);
|
||||
__global T * dst = (__global T *)(dstptr + dst_index);
|
||||
|
||||
@ -210,10 +220,11 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o
|
||||
}
|
||||
else
|
||||
{
|
||||
int src_index = mad24(gy, src_step, gx * TSIZE + src_offset);
|
||||
int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));
|
||||
storepix(loadpix((__global const T *)(srcptr + src_index)), dst);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void remap_16SC2(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
|
||||
@ -222,15 +233,19 @@ __kernel void remap_16SC2(__global const uchar * srcptr, int src_step, int src_o
|
||||
ST nVal)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
int y = get_global_id(1) * rowsPerWI;
|
||||
|
||||
T scalar = convertScalar(nVal);
|
||||
|
||||
if (x < dst_cols && y < dst_rows)
|
||||
if (x < dst_cols)
|
||||
{
|
||||
int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset);
|
||||
int map_index = mad24(y, map_step, x * (int)sizeof(short2) + map_offset);
|
||||
T scalar = convertScalar(nVal);
|
||||
int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
|
||||
int map_index = mad24(y, map_step, mad24(x, (int)sizeof(short2), map_offset));
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < rowsPerWI; ++i, ++y,
|
||||
map_index += map_step, dst_index += dst_step)
|
||||
if (y < dst_rows)
|
||||
{
|
||||
__global const short2 * map = (__global const short2 *)(mapptr + map_index);
|
||||
__global T * dst = (__global T *)(dstptr + dst_index);
|
||||
|
||||
@ -245,10 +260,11 @@ __kernel void remap_16SC2(__global const uchar * srcptr, int src_step, int src_o
|
||||
}
|
||||
else
|
||||
{
|
||||
int src_index = mad24(gy, src_step, gx * TSIZE + src_offset);
|
||||
int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));
|
||||
storepix(loadpix((__global const T *)(srcptr + src_index)), dst);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
|
||||
@ -260,14 +276,18 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
T scalar = convertScalar(nVal);
|
||||
|
||||
if (x < dst_cols && y < dst_rows)
|
||||
if (x < dst_cols)
|
||||
{
|
||||
int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset);
|
||||
int map1_index = mad24(y, map1_step, x * (int)sizeof(short2) + map1_offset);
|
||||
int map2_index = mad24(y, map2_step, x * (int)sizeof(ushort) + map2_offset);
|
||||
T scalar = convertScalar(nVal);
|
||||
int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
|
||||
int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(short2), map1_offset));
|
||||
int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(ushort), map2_offset));
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < rowsPerWI; ++i, ++y,
|
||||
map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
|
||||
if (y < dst_rows)
|
||||
{
|
||||
__global const short2 * map1 = (__global const short2 *)(map1ptr + map1_index);
|
||||
__global const ushort * map2 = (__global const ushort *)(map2ptr + map2_index);
|
||||
__global T * dst = (__global T *)(dstptr + dst_index);
|
||||
@ -286,10 +306,11 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int
|
||||
}
|
||||
else
|
||||
{
|
||||
int src_index = mad24(gy, src_step, gx * TSIZE + src_offset);
|
||||
int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));
|
||||
storepix(loadpix((__global const T *)(srcptr + src_index)), dst);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#elif INTER_LINEAR
|
||||
@ -301,14 +322,19 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int
|
||||
ST nVal)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
int y = get_global_id(1) * rowsPerWI;
|
||||
|
||||
if (x < dst_cols && y < dst_rows)
|
||||
if (x < dst_cols)
|
||||
{
|
||||
int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset);
|
||||
int map1_index = mad24(y, map1_step, x * (int)sizeof(short2) + map1_offset);
|
||||
int map2_index = mad24(y, map2_step, x * (int)sizeof(ushort) + map2_offset);
|
||||
int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
|
||||
int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(short2), map1_offset));
|
||||
int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(ushort), map2_offset));
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < rowsPerWI; ++i, ++y,
|
||||
map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
|
||||
if (y < dst_rows)
|
||||
{
|
||||
__global const short2 * map1 = (__global const short2 *)(map1ptr + map1_index);
|
||||
__global const ushort * map2 = (__global const ushort *)(map2ptr + map2_index);
|
||||
__global T * dst = (__global T *)(dstptr + dst_index);
|
||||
@ -350,6 +376,7 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int
|
||||
d * (u.x) * (u.y);
|
||||
storepix(convertToT(dst_data), dst);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
|
||||
@ -359,14 +386,19 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
|
||||
ST nVal)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
int y = get_global_id(1) * rowsPerWI;
|
||||
|
||||
if (x < dst_cols && y < dst_rows)
|
||||
if (x < dst_cols)
|
||||
{
|
||||
int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset);
|
||||
int map1_index = mad24(y, map1_step, x * (int)sizeof(float) + map1_offset);
|
||||
int map2_index = mad24(y, map2_step, x * (int)sizeof(float) + map2_offset);
|
||||
int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
|
||||
int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(float), map1_offset));
|
||||
int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(float), map2_offset));
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < rowsPerWI; ++i, ++y,
|
||||
map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
|
||||
if (y < dst_rows)
|
||||
{
|
||||
__global const float * map1 = (__global const float *)(map1ptr + map1_index);
|
||||
__global const float * map2 = (__global const float *)(map2ptr + map2_index);
|
||||
__global T * dst = (__global T *)(dstptr + dst_index);
|
||||
@ -409,6 +441,7 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
|
||||
d * (u.x) * (u.y);
|
||||
storepix(convertToT(dst_data), dst);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
|
||||
@ -417,13 +450,18 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o
|
||||
ST nVal)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
int y = get_global_id(1) * rowsPerWI;
|
||||
|
||||
if (x < dst_cols && y < dst_rows)
|
||||
if (x < dst_cols)
|
||||
{
|
||||
int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset);
|
||||
int map_index = mad24(y, map_step, x * (int)sizeof(float2) + map_offset);
|
||||
int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
|
||||
int map_index = mad24(y, map_step, mad24(x, (int)sizeof(float2), map_offset));
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < rowsPerWI; ++i, ++y,
|
||||
map_index += map_step, dst_index += dst_step)
|
||||
if (y < dst_rows)
|
||||
{
|
||||
__global const float2 * map = (__global const float2 *)(mapptr + map_index);
|
||||
__global T * dst = (__global T *)(dstptr + dst_index);
|
||||
|
||||
@ -464,6 +502,7 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o
|
||||
d * (u.x) * (u.y);
|
||||
storepix(convertToT(dst_data), dst);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user