mirror of
https://github.com/opencv/opencv.git
synced 2024-11-28 21:20:18 +08:00
Optimize OpenCL version of sepFilter2D
This commit is contained in:
parent
d4f938ed56
commit
730ead44fe
@ -3492,7 +3492,7 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
|
||||
return false;
|
||||
|
||||
size_t lt2[2] = { optimizedSepFilterLocalSize, optimizedSepFilterLocalSize };
|
||||
size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1] * (1 + (size.height - 1) / lt2[1]) };
|
||||
size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), optimizedSepFilterLocalSize};
|
||||
|
||||
char cvt[2][40];
|
||||
const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
|
||||
|
@ -119,17 +119,15 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int
|
||||
int liy = get_local_id(1);
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
// calculate pixel position in source image taking image offset into account
|
||||
int srcX = x + srcOffsetX - RADIUSX;
|
||||
int srcY = y + srcOffsetY - RADIUSY;
|
||||
|
||||
// extrapolate coordinates, if needed
|
||||
// and read my own source pixel into local memory
|
||||
// with account for extra border pixels, which will be read by starting workitems
|
||||
int clocY = liy;
|
||||
int cSrcY = srcY;
|
||||
int cSrcY = liy + srcOffsetY - RADIUSY;
|
||||
do
|
||||
{
|
||||
int yb = cSrcY;
|
||||
@ -154,48 +152,76 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int
|
||||
while (clocY < BLK_Y+(RADIUSY*2));
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// do vertical filter pass
|
||||
// and store intermediate results to second local memory array
|
||||
int i, clocX = lix;
|
||||
WT sum = (WT) 0;
|
||||
do
|
||||
for (int y = 0; y < dst_rows; y+=BLK_Y)
|
||||
{
|
||||
sum = (WT) 0;
|
||||
for (i=0; i<=2*RADIUSY; i++)
|
||||
// do vertical filter pass
|
||||
// and store intermediate results to second local memory array
|
||||
int i, clocX = lix;
|
||||
WT sum = (WT) 0;
|
||||
do
|
||||
{
|
||||
sum = (WT) 0;
|
||||
for (i=0; i<=2*RADIUSY; i++)
|
||||
#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
|
||||
sum = mad24(lsmem[liy+i][clocX], mat_kernelY[i], sum);
|
||||
sum = mad24(lsmem[liy + i][clocX], mat_kernelY[i], sum);
|
||||
#else
|
||||
sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum);
|
||||
sum = mad(lsmem[liy + i][clocX], mat_kernelY[i], sum);
|
||||
#endif
|
||||
lsmemDy[liy][clocX] = sum;
|
||||
clocX += BLK_X;
|
||||
}
|
||||
while(clocX < BLK_X+(RADIUSX*2));
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
lsmemDy[liy][clocX] = sum;
|
||||
clocX += BLK_X;
|
||||
}
|
||||
while(clocX < BLK_X+(RADIUSX*2));
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// if this pixel happened to be out of image borders because of global size rounding,
|
||||
// then just return
|
||||
if( x >= dst_cols || y >=dst_rows )
|
||||
return;
|
||||
|
||||
// do second horizontal filter pass
|
||||
// and calculate final result
|
||||
sum = 0.0f;
|
||||
for (i=0; i<=2*RADIUSX; i++)
|
||||
// pixels that fall outside the image because of global size rounding are not written,
|
||||
// but the work-item does not return: it still has to reach the barriers below
|
||||
if ((x < dst_cols) && (y + liy < dst_rows))
|
||||
{
|
||||
// do second horizontal filter pass
|
||||
// and calculate final result
|
||||
sum = 0.0f;
|
||||
for (i=0; i<=2*RADIUSX; i++)
|
||||
#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
|
||||
sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
|
||||
sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
|
||||
#else
|
||||
sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
|
||||
sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
|
||||
#endif
|
||||
|
||||
#ifdef INTEGER_ARITHMETIC
|
||||
#ifdef INTEL_DEVICE
|
||||
sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS);
|
||||
sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS);
|
||||
#else
|
||||
sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
|
||||
sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
|
||||
#endif
|
||||
#endif
|
||||
// store result into destination image
|
||||
storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset)));
|
||||
}
|
||||
|
||||
for (int i = liy * BLK_X + lix; i < (RADIUSY*2) * (BLK_X+(RADIUSX*2)); i += BLK_X * BLK_Y)
|
||||
{
|
||||
int clocX = i % (BLK_X+(RADIUSX*2));
|
||||
int clocY = i / (BLK_X+(RADIUSX*2));
|
||||
lsmem[clocY][clocX] = lsmem[clocY + BLK_Y][clocX];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int cSrcY = y + BLK_Y + liy + srcOffsetY + RADIUSY;
|
||||
EXTRAPOLATE(cSrcY, (height));
|
||||
|
||||
clocX = lix;
|
||||
int cSrcX = x + srcOffsetX - RADIUSX;
|
||||
do
|
||||
{
|
||||
int xb = cSrcX;
|
||||
EXTRAPOLATE(xb,(width));
|
||||
lsmem[liy + 2*RADIUSY][clocX] = ELEM(xb, cSrcY, (width), (height), 0 );
|
||||
|
||||
clocX += BLK_X;
|
||||
cSrcX += BLK_X;
|
||||
}
|
||||
while(clocX < BLK_X+(RADIUSX*2));
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
// store result into destination image
|
||||
storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset)));
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user