mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-12 15:39:04 +08:00
07be522e43
https://code.google.com/p/tesseract-ocr/issues/detail?id=1351 What steps will reproduce the problem? 1.Use tesseract build with OpenCL. 2.Pass full color image with width which is not multiple of 32. 3.Recognition is way too slow and does not recognize anything. I read the article on http://www.sk-spell.sk.cx/tesseract-meets-the-opencl-first-test and decided to give OCL a try. The initial result was as per point 3 above. After some debugging I figured the problem is that the OCL version of threshold rect generation does not account for padding bits in the output pix lines. To prove my discovery I made a quick fix in oclkernels.h replacing the definition of kernel_ThresholdRectToPix Just a reminder: it is necessary to force OCL kernel recompilation after changing this source (e.g. delete “kernel - <device>.bin” from the exec folder). The fix is working but I am not sure about it since the original source apparently works for other people (as per the article). If I am right the OS/GPU are irrelevant since the bug is algorithmic, but mine are Windows/AMD. Also similar fix is applicable to kernel_ThresholdRectToPix_OneChan(), but there the input array might have some padding bytes as well, so its indexing will need further adjustments. I can come with some prove/fix for it either - I have not played with it yet. Disclaimer: I have no prior experience with image processing and tesseract source or with GPU computing and OpenCL (but please do explain if I am wrong).
1231 lines
35 KiB
C
1231 lines
35 KiB
C
|
|
#ifndef _OCL_KERNEL_H_
|
|
#define _OCL_KERNEL_H_
|
|
#ifndef USE_EXTERNAL_KERNEL
|
|
#define KERNEL( ... )# __VA_ARGS__ "\n"
|
|
// Double precision is a default of spreadsheets
|
|
// cl_khr_fp64: Khronos extension
|
|
// cl_amd_fp64: AMD extension
|
|
// use build option outside to define fp_t
|
|
/////////////////////////////////////////////
|
|
const char *kernel_src = KERNEL(
|
|
\n#ifdef KHR_DP_EXTENSION\n
|
|
\n#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n
|
|
\n#elif AMD_DP_EXTENSION\n
|
|
\n#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n
|
|
\n#else\n
|
|
\n#endif\n
|
|
__kernel void composeRGBPixel(__global uint *tiffdata, int w, int h,int wpl, __global uint *output)
|
|
{
|
|
int i = get_global_id(1);
|
|
int j = get_global_id(0);
|
|
int tiffword,rval,gval,bval;
|
|
|
|
//Ignore the excess
|
|
if ((i >= h) || (j >= w))
|
|
return;
|
|
|
|
tiffword = tiffdata[i * w + j];
|
|
rval = ((tiffword) & 0xff);
|
|
gval = (((tiffword) >> 8) & 0xff);
|
|
bval = (((tiffword) >> 16) & 0xff);
|
|
output[i*wpl+j] = (rval << (8 * (sizeof(uint) - 1 - 0))) | (gval << (8 * (sizeof(uint) - 1 - 1))) | (bval << (8 * (sizeof(uint) - 1 - 2)));
|
|
}
|
|
)
|
|
|
|
KERNEL(
|
|
\n__kernel void pixSubtract_inplace(__global int *dword, __global int *sword,
|
|
const int wpl, const int h)
|
|
{
|
|
const unsigned int row = get_global_id(1);
|
|
const unsigned int col = get_global_id(0);
|
|
const unsigned int pos = row * wpl + col;
|
|
|
|
//Ignore the execss
|
|
if (row >= h || col >= wpl)
|
|
return;
|
|
|
|
*(dword + pos) &= ~(*(sword + pos));
|
|
}\n
|
|
)
|
|
|
|
KERNEL(
|
|
\n__kernel void pixSubtract(__global int *dword, __global int *sword,
|
|
const int wpl, const int h, __global int *outword)
|
|
{
|
|
const unsigned int row = get_global_id(1);
|
|
const unsigned int col = get_global_id(0);
|
|
const unsigned int pos = row * wpl + col;
|
|
|
|
//Ignore the execss
|
|
if (row >= h || col >= wpl)
|
|
return;
|
|
|
|
*(outword + pos) = *(dword + pos) & ~(*(sword + pos));
|
|
}\n
|
|
)
|
|
|
|
KERNEL(
|
|
\n__kernel void pixAND(__global int *dword, __global int *sword, __global int *outword,
|
|
const int wpl, const int h)
|
|
{
|
|
const unsigned int row = get_global_id(1);
|
|
const unsigned int col = get_global_id(0);
|
|
const unsigned int pos = row * wpl + col;
|
|
|
|
//Ignore the execss
|
|
if (row >= h || col >= wpl)
|
|
return;
|
|
|
|
*(outword + pos) = *(dword + pos) & (*(sword + pos));
|
|
}\n
|
|
)
|
|
|
|
KERNEL(
|
|
\n__kernel void pixOR(__global int *dword, __global int *sword, __global int *outword,
|
|
const int wpl, const int h)
|
|
{
|
|
const unsigned int row = get_global_id(1);
|
|
const unsigned int col = get_global_id(0);
|
|
const unsigned int pos = row * wpl + col;
|
|
|
|
//Ignore the execss
|
|
if (row >= h || col >= wpl)
|
|
return;
|
|
|
|
*(outword + pos) = *(dword + pos) | (*(sword + pos));
|
|
}\n
|
|
)
|
|
|
|
KERNEL(
|
|
\n__kernel void morphoDilateHor_5x5(__global int *sword,__global int *dword,
|
|
const int wpl, const int h)
|
|
{
|
|
const unsigned int pos = get_global_id(0);
|
|
unsigned int prevword, nextword, currword,tempword;
|
|
unsigned int destword;
|
|
const int col = pos % wpl;
|
|
|
|
//Ignore the execss
|
|
if (pos >= (wpl * h))
|
|
return;
|
|
|
|
|
|
currword = *(sword + pos);
|
|
destword = currword;
|
|
|
|
//Handle boundary conditions
|
|
if(col==0)
|
|
prevword=0;
|
|
else
|
|
prevword = *(sword + pos - 1);
|
|
|
|
if(col==(wpl - 1))
|
|
nextword=0;
|
|
else
|
|
nextword = *(sword + pos + 1);
|
|
|
|
//Loop unrolled
|
|
|
|
//1 bit to left and 1 bit to right
|
|
//Get the max value on LHS of every pixel
|
|
tempword = (prevword << (31)) | ((currword >> 1));
|
|
destword |= tempword;
|
|
//Get max value on RHS of every pixel
|
|
tempword = (currword << 1) | (nextword >> (31));
|
|
destword |= tempword;
|
|
|
|
//2 bit to left and 2 bit to right
|
|
//Get the max value on LHS of every pixel
|
|
tempword = (prevword << (30)) | ((currword >> 2));
|
|
destword |= tempword;
|
|
//Get max value on RHS of every pixel
|
|
tempword = (currword << 2) | (nextword >> (30));
|
|
destword |= tempword;
|
|
|
|
|
|
*(dword + pos) = destword;
|
|
|
|
}\n
|
|
)
|
|
|
|
KERNEL(
|
|
\n__kernel void morphoDilateVer_5x5(__global int *sword,__global int *dword,
|
|
const int wpl, const int h)
|
|
{
|
|
const int col = get_global_id(0);
|
|
const int row = get_global_id(1);
|
|
const unsigned int pos = row * wpl + col;
|
|
unsigned int tempword;
|
|
unsigned int destword;
|
|
int i;
|
|
|
|
//Ignore the execss
|
|
if (row >= h || col >= wpl)
|
|
return;
|
|
|
|
destword = *(sword + pos);
|
|
|
|
//2 words above
|
|
i = (row - 2) < 0 ? row : (row - 2);
|
|
tempword = *(sword + i*wpl + col);
|
|
destword |= tempword;
|
|
|
|
//1 word above
|
|
i = (row - 1) < 0 ? row : (row - 1);
|
|
tempword = *(sword + i*wpl + col);
|
|
destword |= tempword;
|
|
|
|
//1 word below
|
|
i = (row >= (h - 1)) ? row : (row + 1);
|
|
tempword = *(sword + i*wpl + col);
|
|
destword |= tempword;
|
|
|
|
//2 words below
|
|
i = (row >= (h - 2)) ? row : (row + 2);
|
|
tempword = *(sword + i*wpl + col);
|
|
destword |= tempword;
|
|
|
|
*(dword + pos) = destword;
|
|
}\n
|
|
)
|
|
|
|
KERNEL(
|
|
\n__kernel void morphoDilateHor(__global int *sword,__global int *dword,const int xp, const int xn, const int wpl, const int h)
|
|
{
|
|
const int col = get_global_id(0);
|
|
const int row = get_global_id(1);
|
|
const unsigned int pos = row * wpl + col;
|
|
unsigned int parbitsxp, parbitsxn, nwords;
|
|
unsigned int destword, tempword, lastword, currword;
|
|
unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
|
|
int i, j, siter, eiter;
|
|
|
|
//Ignore the execss
|
|
if (pos >= (wpl*h) || (xn < 1 && xp < 1))
|
|
return;
|
|
|
|
currword = *(sword + pos);
|
|
destword = currword;
|
|
|
|
parbitsxp = xp & 31;
|
|
parbitsxn = xn & 31;
|
|
nwords = xp >> 5;
|
|
|
|
if (parbitsxp > 0)
|
|
nwords += 1;
|
|
else
|
|
parbitsxp = 31;
|
|
|
|
siter = (col - nwords);
|
|
eiter = (col + nwords);
|
|
|
|
//Get prev word
|
|
if (col==0)
|
|
firstword = 0x0;
|
|
else
|
|
firstword = *(sword + pos - 1);
|
|
|
|
//Get next word
|
|
if (col == (wpl - 1))
|
|
secondword = 0x0;
|
|
else
|
|
secondword = *(sword + pos + 1);
|
|
|
|
//Last partial bits on either side
|
|
for (i = 1; i <= parbitsxp; i++)
|
|
{
|
|
//Get the max value on LHS of every pixel
|
|
tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0x0 : (firstword << (32-i)) | ((currword >> i));
|
|
|
|
destword |= tempword;
|
|
|
|
//Get max value on RHS of every pixel
|
|
tempword = (currword << i) | (secondword >> (32 - i));
|
|
destword |= tempword;
|
|
}
|
|
|
|
//Return if halfwidth <= 1 word
|
|
if (nwords == 1)
|
|
{
|
|
if (xn == 32)
|
|
{
|
|
destword |= firstword;
|
|
}
|
|
if (xp == 32)
|
|
{
|
|
destword |= secondword;
|
|
}
|
|
|
|
*(dword + pos) = destword;
|
|
return;
|
|
}
|
|
|
|
if (siter < 0)
|
|
firstword = 0x0;
|
|
else
|
|
firstword = *(sword + row*wpl + siter);
|
|
|
|
if (eiter >= wpl)
|
|
lastword = 0x0;
|
|
else
|
|
lastword = *(sword + row*wpl + eiter);
|
|
|
|
for ( i = 1; i < nwords; i++)
|
|
{
|
|
//Gets LHS words
|
|
if ((siter + i) < 0)
|
|
secondword = 0x0;
|
|
else
|
|
secondword = *(sword + row*wpl + siter + i);
|
|
|
|
lprevword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
|
|
|
|
firstword = secondword;
|
|
|
|
if ((siter + i + 1) < 0)
|
|
secondword = 0x0;
|
|
else
|
|
secondword = *(sword + row*wpl + siter + i + 1);
|
|
|
|
lnextword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
|
|
|
|
//Gets RHS words
|
|
if ((eiter - i) >= wpl)
|
|
firstword = 0x0;
|
|
else
|
|
firstword = *(sword + row*wpl + eiter - i);
|
|
|
|
rnextword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
|
|
|
|
lastword = firstword;
|
|
if ((eiter - i - 1) >= wpl)
|
|
firstword = 0x0;
|
|
else
|
|
firstword = *(sword + row*wpl + eiter - i - 1);
|
|
|
|
rprevword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
|
|
|
|
for (j = 1; j < 32; j++)
|
|
{
|
|
//OR LHS full words
|
|
tempword = (lprevword << j) | (lnextword >> (32 - j));
|
|
destword |= tempword;
|
|
|
|
//OR RHS full words
|
|
tempword = (rprevword << j) | (rnextword >> (32 - j));
|
|
destword |= tempword;
|
|
}
|
|
|
|
destword |= lprevword;
|
|
destword |= lnextword;
|
|
destword |= rprevword;
|
|
destword |= rnextword;
|
|
|
|
lastword = firstword;
|
|
firstword = secondword;
|
|
}
|
|
|
|
*(dword + pos) = destword;
|
|
}\n
|
|
)
|
|
|
|
KERNEL(
|
|
\n__kernel void morphoDilateHor_32word(__global int *sword,__global int *dword,
|
|
const int halfwidth,
|
|
const int wpl, const int h,
|
|
const char isEven)
|
|
{
|
|
const int col = get_global_id(0);
|
|
const int row = get_global_id(1);
|
|
const unsigned int pos = row * wpl + col;
|
|
unsigned int prevword, nextword, currword,tempword;
|
|
unsigned int destword;
|
|
int i;
|
|
|
|
//Ignore the execss
|
|
if (pos >= (wpl * h))
|
|
return;
|
|
|
|
currword = *(sword + pos);
|
|
destword = currword;
|
|
|
|
//Handle boundary conditions
|
|
if(col==0)
|
|
prevword=0;
|
|
else
|
|
prevword = *(sword + pos - 1);
|
|
|
|
if(col==(wpl - 1))
|
|
nextword=0;
|
|
else
|
|
nextword = *(sword + pos + 1);
|
|
|
|
for (i = 1; i <= halfwidth; i++)
|
|
{
|
|
//Get the max value on LHS of every pixel
|
|
if (i == halfwidth && isEven)
|
|
{
|
|
tempword = 0x0;
|
|
}
|
|
else
|
|
{
|
|
tempword = (prevword << (32-i)) | ((currword >> i));
|
|
}
|
|
|
|
destword |= tempword;
|
|
|
|
//Get max value on RHS of every pixel
|
|
tempword = (currword << i) | (nextword >> (32 - i));
|
|
|
|
destword |= tempword;
|
|
}
|
|
|
|
*(dword + pos) = destword;
|
|
}\n
|
|
)
|
|
|
|
KERNEL(
|
|
\n__kernel void morphoDilateVer(__global int *sword,__global int *dword,
|
|
const int yp,
|
|
const int wpl, const int h,
|
|
const int yn)
|
|
{
|
|
const int col = get_global_id(0);
|
|
const int row = get_global_id(1);
|
|
const unsigned int pos = row * wpl + col;
|
|
unsigned int tempword;
|
|
unsigned int destword;
|
|
int i, siter, eiter;
|
|
|
|
//Ignore the execss
|
|
if (row >= h || col >= wpl)
|
|
return;
|
|
|
|
destword = *(sword + pos);
|
|
|
|
//Set start position and end position considering the boundary conditions
|
|
siter = (row - yn) < 0 ? 0 : (row - yn);
|
|
eiter = (row >= (h - yp)) ? (h - 1) : (row + yp);
|
|
|
|
for (i = siter; i <= eiter; i++)
|
|
{
|
|
tempword = *(sword + i*wpl + col);
|
|
|
|
destword |= tempword;
|
|
}
|
|
|
|
*(dword + pos) = destword;
|
|
}\n
|
|
)
|
|
|
|
KERNEL(
|
|
\n__kernel void morphoErodeHor_5x5(__global int *sword,__global int *dword,
|
|
const int wpl, const int h)
|
|
{
|
|
const unsigned int pos = get_global_id(0);
|
|
unsigned int prevword, nextword, currword,tempword;
|
|
unsigned int destword;
|
|
const int col = pos % wpl;
|
|
|
|
//Ignore the execss
|
|
if (pos >= (wpl * h))
|
|
return;
|
|
|
|
currword = *(sword + pos);
|
|
destword = currword;
|
|
|
|
//Handle boundary conditions
|
|
if(col==0)
|
|
prevword=0xffffffff;
|
|
else
|
|
prevword = *(sword + pos - 1);
|
|
|
|
if(col==(wpl - 1))
|
|
nextword=0xffffffff;
|
|
else
|
|
nextword = *(sword + pos + 1);
|
|
|
|
//Loop unrolled
|
|
|
|
//1 bit to left and 1 bit to right
|
|
//Get the min value on LHS of every pixel
|
|
tempword = (prevword << (31)) | ((currword >> 1));
|
|
destword &= tempword;
|
|
//Get min value on RHS of every pixel
|
|
tempword = (currword << 1) | (nextword >> (31));
|
|
destword &= tempword;
|
|
|
|
//2 bit to left and 2 bit to right
|
|
//Get the min value on LHS of every pixel
|
|
tempword = (prevword << (30)) | ((currword >> 2));
|
|
destword &= tempword;
|
|
//Get min value on RHS of every pixel
|
|
tempword = (currword << 2) | (nextword >> (30));
|
|
destword &= tempword;
|
|
|
|
|
|
*(dword + pos) = destword;
|
|
|
|
}\n
|
|
)
|
|
|
|
KERNEL(
|
|
\n__kernel void morphoErodeVer_5x5(__global int *sword,__global int *dword,
|
|
const int wpl, const int h,
|
|
const int fwmask, const int lwmask)
|
|
{
|
|
const int col = get_global_id(0);
|
|
const int row = get_global_id(1);
|
|
const unsigned int pos = row * wpl + col;
|
|
unsigned int tempword;
|
|
unsigned int destword;
|
|
int i;
|
|
|
|
//Ignore the execss
|
|
if (row >= h || col >= wpl)
|
|
return;
|
|
|
|
destword = *(sword + pos);
|
|
|
|
if (row < 2 || row >= (h - 2))
|
|
{
|
|
destword = 0x0;
|
|
}
|
|
else
|
|
{
|
|
//2 words above
|
|
//i = (row - 2) < 0 ? row : (row - 2);
|
|
i = (row - 2);
|
|
tempword = *(sword + i*wpl + col);
|
|
destword &= tempword;
|
|
|
|
//1 word above
|
|
//i = (row - 1) < 0 ? row : (row - 1);
|
|
i = (row - 1);
|
|
tempword = *(sword + i*wpl + col);
|
|
destword &= tempword;
|
|
|
|
//1 word below
|
|
//i = (row >= (h - 1)) ? row : (row + 1);
|
|
i = (row + 1);
|
|
tempword = *(sword + i*wpl + col);
|
|
destword &= tempword;
|
|
|
|
//2 words below
|
|
//i = (row >= (h - 2)) ? row : (row + 2);
|
|
i = (row + 2);
|
|
tempword = *(sword + i*wpl + col);
|
|
destword &= tempword;
|
|
|
|
if (col == 0)
|
|
{
|
|
destword &= fwmask;
|
|
}
|
|
if (col == (wpl - 1))
|
|
{
|
|
destword &= lwmask;
|
|
}
|
|
}
|
|
|
|
|
|
*(dword + pos) = destword;
|
|
}\n
|
|
)
|
|
|
|
KERNEL(
|
|
\n__kernel void morphoErodeHor(__global int *sword,__global int *dword, const int xp, const int xn, const int wpl,
|
|
const int h, const char isAsymmetric, const int rwmask, const int lwmask)
|
|
{
|
|
const int col = get_global_id(0);
|
|
const int row = get_global_id(1);
|
|
const unsigned int pos = row * wpl + col;
|
|
unsigned int parbitsxp, parbitsxn, nwords;
|
|
unsigned int destword, tempword, lastword, currword;
|
|
unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
|
|
int i, j, siter, eiter;
|
|
|
|
//Ignore the execss
|
|
if (pos >= (wpl*h) || (xn < 1 && xp < 1))
|
|
return;
|
|
|
|
currword = *(sword + pos);
|
|
destword = currword;
|
|
|
|
parbitsxp = xp & 31;
|
|
parbitsxn = xn & 31;
|
|
nwords = xp >> 5;
|
|
|
|
if (parbitsxp > 0)
|
|
nwords += 1;
|
|
else
|
|
parbitsxp = 31;
|
|
|
|
siter = (col - nwords);
|
|
eiter = (col + nwords);
|
|
|
|
//Get prev word
|
|
if (col==0)
|
|
firstword = 0xffffffff;
|
|
else
|
|
firstword = *(sword + pos - 1);
|
|
|
|
//Get next word
|
|
if (col == (wpl - 1))
|
|
secondword = 0xffffffff;
|
|
else
|
|
secondword = *(sword + pos + 1);
|
|
|
|
//Last partial bits on either side
|
|
for (i = 1; i <= parbitsxp; i++)
|
|
{
|
|
//Get the max value on LHS of every pixel
|
|
tempword = (firstword << (32-i)) | ((currword >> i));
|
|
destword &= tempword;
|
|
|
|
//Get max value on RHS of every pixel
|
|
tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0xffffffff : (currword << i) | (secondword >> (32 - i));
|
|
|
|
//tempword = (currword << i) | (secondword >> (32 - i));
|
|
destword &= tempword;
|
|
}
|
|
|
|
//Return if halfwidth <= 1 word
|
|
if (nwords == 1)
|
|
{
|
|
if (xp == 32)
|
|
{
|
|
destword &= firstword;
|
|
}
|
|
if (xn == 32)
|
|
{
|
|
destword &= secondword;
|
|
}
|
|
|
|
//Clear boundary pixels
|
|
if (isAsymmetric)
|
|
{
|
|
if (col == 0)
|
|
destword &= rwmask;
|
|
if (col == (wpl - 1))
|
|
destword &= lwmask;
|
|
}
|
|
|
|
*(dword + pos) = destword;
|
|
return;
|
|
}
|
|
|
|
if (siter < 0)
|
|
firstword = 0xffffffff;
|
|
else
|
|
firstword = *(sword + row*wpl + siter);
|
|
|
|
if (eiter >= wpl)
|
|
lastword = 0xffffffff;
|
|
else
|
|
lastword = *(sword + row*wpl + eiter);
|
|
|
|
|
|
for ( i = 1; i < nwords; i++)
|
|
{
|
|
//Gets LHS words
|
|
if ((siter + i) < 0)
|
|
secondword = 0xffffffff;
|
|
else
|
|
secondword = *(sword + row*wpl + siter + i);
|
|
|
|
lprevword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);
|
|
|
|
firstword = secondword;
|
|
|
|
if ((siter + i + 1) < 0)
|
|
secondword = 0xffffffff;
|
|
else
|
|
secondword = *(sword + row*wpl + siter + i + 1);
|
|
|
|
lnextword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);
|
|
|
|
//Gets RHS words
|
|
if ((eiter - i) >= wpl)
|
|
firstword = 0xffffffff;
|
|
else
|
|
firstword = *(sword + row*wpl + eiter - i);
|
|
|
|
rnextword = firstword << parbitsxn | lastword >> (32 - parbitsxn);
|
|
|
|
lastword = firstword;
|
|
if ((eiter - i - 1) >= wpl)
|
|
firstword = 0xffffffff;
|
|
else
|
|
firstword = *(sword + row*wpl + eiter - i - 1);
|
|
|
|
rprevword = firstword << parbitsxn | lastword >> (32 - parbitsxn);
|
|
|
|
for (j = 0; j < 32; j++)
|
|
{
|
|
//OR LHS full words
|
|
tempword = (lprevword << j) | (lnextword >> (32 - j));
|
|
destword &= tempword;
|
|
|
|
//OR RHS full words
|
|
tempword = (rprevword << j) | (rnextword >> (32 - j));
|
|
destword &= tempword;
|
|
}
|
|
|
|
destword &= lprevword;
|
|
destword &= lnextword;
|
|
destword &= rprevword;
|
|
destword &= rnextword;
|
|
|
|
lastword = firstword;
|
|
firstword = secondword;
|
|
}
|
|
|
|
if (isAsymmetric)
|
|
{
|
|
//Clear boundary pixels
|
|
if (col < (nwords - 1))
|
|
destword = 0x0;
|
|
else if (col == (nwords - 1))
|
|
destword &= rwmask;
|
|
else if (col > (wpl - nwords))
|
|
destword = 0x0;
|
|
else if (col == (wpl - nwords))
|
|
destword &= lwmask;
|
|
}
|
|
|
|
*(dword + pos) = destword;
|
|
}\n
|
|
)
|
|
|
|
KERNEL(
|
|
\n__kernel void morphoErodeHor_32word(__global int *sword,__global int *dword,
|
|
const int halfwidth, const int wpl,
|
|
const int h, const char clearBoundPixH,
|
|
const int rwmask, const int lwmask,
|
|
const char isEven)
|
|
{
|
|
const int col = get_global_id(0);
|
|
const int row = get_global_id(1);
|
|
const unsigned int pos = row * wpl + col;
|
|
unsigned int prevword, nextword, currword,tempword, destword;
|
|
int i;
|
|
|
|
//Ignore the execss
|
|
if (pos >= (wpl * h))
|
|
return;
|
|
|
|
currword = *(sword + pos);
|
|
destword = currword;
|
|
|
|
//Handle boundary conditions
|
|
if(col==0)
|
|
prevword=0xffffffff;
|
|
else
|
|
prevword = *(sword + pos - 1);
|
|
|
|
if(col==(wpl - 1))
|
|
nextword=0xffffffff;
|
|
else
|
|
nextword = *(sword + pos + 1);
|
|
|
|
for (i = 1; i <= halfwidth; i++)
|
|
{
|
|
//Get the min value on LHS of every pixel
|
|
tempword = (prevword << (32-i)) | ((currword >> i));
|
|
|
|
destword &= tempword;
|
|
|
|
//Get min value on RHS of every pixel
|
|
if (i == halfwidth && isEven)
|
|
{
|
|
tempword = 0xffffffff;
|
|
}
|
|
else
|
|
{
|
|
tempword = (currword << i) | (nextword >> (32 - i));
|
|
}
|
|
|
|
destword &= tempword;
|
|
}
|
|
|
|
if (clearBoundPixH)
|
|
{
|
|
if (col == 0)
|
|
{
|
|
destword &= rwmask;
|
|
}
|
|
else if (col == (wpl - 1))
|
|
{
|
|
destword &= lwmask;
|
|
}
|
|
}
|
|
|
|
*(dword + pos) = destword;
|
|
}\n
|
|
)
|
|
|
|
KERNEL(
|
|
\n__kernel void morphoErodeVer(__global int *sword,__global int *dword,
|
|
const int yp,
|
|
const int wpl, const int h,
|
|
const char clearBoundPixV, const int yn)
|
|
{
|
|
const int col = get_global_id(0);
|
|
const int row = get_global_id(1);
|
|
const unsigned int pos = row * wpl + col;
|
|
unsigned int tempword, destword;
|
|
int i, siter, eiter;
|
|
|
|
//Ignore the execss
|
|
if (row >= h || col >= wpl)
|
|
return;
|
|
|
|
destword = *(sword + pos);
|
|
|
|
//Set start position and end position considering the boundary conditions
|
|
siter = (row - yp) < 0 ? 0 : (row - yp);
|
|
eiter = (row >= (h - yn)) ? (h - 1) : (row + yn);
|
|
|
|
for (i = siter; i <= eiter; i++)
|
|
{
|
|
tempword = *(sword + i*wpl + col);
|
|
|
|
destword &= tempword;
|
|
}
|
|
|
|
//Clear boundary pixels
|
|
if (clearBoundPixV && ((row < yp) || ((h - row) <= yn)))
|
|
{
|
|
destword = 0x0;
|
|
}
|
|
|
|
*(dword + pos) = destword;
|
|
}\n
|
|
)
|
|
|
|
// HistogramRect Kernel: Accumulate
|
|
// assumes 4 channels, i.e., bytes_per_pixel = 4
|
|
// assumes number of pixels is multiple of 8
|
|
// data is layed out as
|
|
// ch0 ch1 ...
|
|
// bin0 bin1 bin2... bin0...
|
|
// rpt0,1,2...256 rpt0,1,2...
|
|
KERNEL(
|
|
\n#define HIST_REDUNDANCY 256\n
|
|
\n#define GROUP_SIZE 256\n
|
|
\n#define HIST_SIZE 256\n
|
|
\n#define NUM_CHANNELS 4\n
|
|
\n#define HR_UNROLL_SIZE 8 \n
|
|
\n#define HR_UNROLL_TYPE uchar8 \n
|
|
|
|
__attribute__((reqd_work_group_size(256, 1, 1)))
|
|
__kernel
|
|
void kernel_HistogramRectAllChannels(
|
|
__global const uchar8 *data,
|
|
uint numPixels,
|
|
__global uint *histBuffer) {
|
|
|
|
// declare variables
|
|
uchar8 pixels;
|
|
int threadOffset = get_global_id(0)%HIST_REDUNDANCY;
|
|
|
|
// for each pixel/channel, accumulate in global memory
|
|
for ( uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS/HR_UNROLL_SIZE; pc += get_global_size(0) ) {
|
|
pixels = data[pc];
|
|
// channel bin thread
|
|
atomic_inc( &histBuffer[ 0*HIST_SIZE*HIST_REDUNDANCY + pixels.s0*HIST_REDUNDANCY + threadOffset ]); // ch0
|
|
atomic_inc( &histBuffer[ 0*HIST_SIZE*HIST_REDUNDANCY + pixels.s4*HIST_REDUNDANCY + threadOffset ]); // ch0
|
|
atomic_inc( &histBuffer[ 1*HIST_SIZE*HIST_REDUNDANCY + pixels.s1*HIST_REDUNDANCY + threadOffset ]); // ch1
|
|
atomic_inc( &histBuffer[ 1*HIST_SIZE*HIST_REDUNDANCY + pixels.s5*HIST_REDUNDANCY + threadOffset ]); // ch1
|
|
atomic_inc( &histBuffer[ 2*HIST_SIZE*HIST_REDUNDANCY + pixels.s2*HIST_REDUNDANCY + threadOffset ]); // ch2
|
|
atomic_inc( &histBuffer[ 2*HIST_SIZE*HIST_REDUNDANCY + pixels.s6*HIST_REDUNDANCY + threadOffset ]); // ch2
|
|
atomic_inc( &histBuffer[ 3*HIST_SIZE*HIST_REDUNDANCY + pixels.s3*HIST_REDUNDANCY + threadOffset ]); // ch3
|
|
atomic_inc( &histBuffer[ 3*HIST_SIZE*HIST_REDUNDANCY + pixels.s7*HIST_REDUNDANCY + threadOffset ]); // ch3
|
|
}
|
|
}
|
|
)
|
|
|
|
KERNEL(
|
|
// NUM_CHANNELS = 1
|
|
__attribute__((reqd_work_group_size(256, 1, 1)))
|
|
__kernel
|
|
void kernel_HistogramRectOneChannel(
|
|
__global const uchar8 *data,
|
|
uint numPixels,
|
|
__global uint *histBuffer) {
|
|
|
|
// declare variables
|
|
uchar8 pixels;
|
|
int threadOffset = get_global_id(0)%HIST_REDUNDANCY;
|
|
|
|
// for each pixel/channel, accumulate in global memory
|
|
for ( uint pc = get_global_id(0); pc < numPixels/HR_UNROLL_SIZE; pc += get_global_size(0) ) {
|
|
pixels = data[pc];
|
|
// bin thread
|
|
atomic_inc( &histBuffer[ pixels.s0*HIST_REDUNDANCY + threadOffset ]);
|
|
atomic_inc( &histBuffer[ pixels.s1*HIST_REDUNDANCY + threadOffset ]);
|
|
atomic_inc( &histBuffer[ pixels.s2*HIST_REDUNDANCY + threadOffset ]);
|
|
atomic_inc( &histBuffer[ pixels.s3*HIST_REDUNDANCY + threadOffset ]);
|
|
atomic_inc( &histBuffer[ pixels.s4*HIST_REDUNDANCY + threadOffset ]);
|
|
atomic_inc( &histBuffer[ pixels.s5*HIST_REDUNDANCY + threadOffset ]);
|
|
atomic_inc( &histBuffer[ pixels.s6*HIST_REDUNDANCY + threadOffset ]);
|
|
atomic_inc( &histBuffer[ pixels.s7*HIST_REDUNDANCY + threadOffset ]);
|
|
}
|
|
}
|
|
)
|
|
|
|
|
|
KERNEL(
|
|
// unused
|
|
\n __attribute__((reqd_work_group_size(256, 1, 1)))
|
|
\n __kernel
|
|
\n void kernel_HistogramRectAllChannels_Grey(
|
|
\n __global const uchar* data,
|
|
\n uint numPixels,
|
|
\n __global uint *histBuffer) { // each wg will write HIST_SIZE*NUM_CHANNELS into this result; cpu will accumulate across wg's
|
|
\n
|
|
\n /* declare variables */
|
|
\n
|
|
\n // work indices
|
|
\n size_t groupId = get_group_id(0);
|
|
\n size_t localId = get_local_id(0); // 0 -> 256-1
|
|
\n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
|
|
\n uint numThreads = get_global_size(0);
|
|
\n
|
|
\n /* accumulate in global memory */
|
|
\n for ( uint pc = get_global_id(0); pc < numPixels; pc += get_global_size(0) ) {
|
|
\n uchar value = data[ pc ];
|
|
\n int idx = value * get_global_size(0) + get_global_id(0);
|
|
\n histBuffer[ idx ]++;
|
|
\n
|
|
\n }
|
|
\n
|
|
\n } // kernel_HistogramRectAllChannels_Grey
|
|
|
|
)
|
|
|
|
// HistogramRect Kernel: Reduction
|
|
// only supports 4 channels
|
|
// each work group handles a single channel of a single histogram bin
|
|
KERNEL(
|
|
__attribute__((reqd_work_group_size(256, 1, 1)))
|
|
__kernel
|
|
void kernel_HistogramRectAllChannelsReduction(
|
|
int n, // unused pixel redundancy
|
|
__global uint *histBuffer,
|
|
__global int* histResult) {
|
|
|
|
// declare variables
|
|
int channel = get_group_id(0)/HIST_SIZE;
|
|
int bin = get_group_id(0)%HIST_SIZE;
|
|
int value = 0;
|
|
|
|
// accumulate in register
|
|
for ( uint i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) {
|
|
value += histBuffer[ channel*HIST_SIZE*HIST_REDUNDANCY+bin*HIST_REDUNDANCY+i];
|
|
}
|
|
|
|
// reduction in local memory
|
|
__local int localHist[GROUP_SIZE];
|
|
localHist[get_local_id(0)] = value;
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
|
|
if (get_local_id(0) < stride) {
|
|
value = localHist[ get_local_id(0)+stride];
|
|
}
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
if (get_local_id(0) < stride) {
|
|
localHist[ get_local_id(0)] += value;
|
|
}
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
}
|
|
|
|
// write reduction to final result
|
|
if (get_local_id(0) == 0) {
|
|
histResult[get_group_id(0)] = localHist[0];
|
|
}
|
|
} // kernel_HistogramRectAllChannels
|
|
)
|
|
|
|
|
|
KERNEL(
|
|
// NUM_CHANNELS = 1
|
|
__attribute__((reqd_work_group_size(256, 1, 1)))
|
|
__kernel
|
|
void kernel_HistogramRectOneChannelReduction(
|
|
int n, // unused pixel redundancy
|
|
__global uint *histBuffer,
|
|
__global int* histResult) {
|
|
|
|
// declare variables
|
|
// int channel = get_group_id(0)/HIST_SIZE;
|
|
int bin = get_group_id(0)%HIST_SIZE;
|
|
int value = 0;
|
|
|
|
// accumulate in register
|
|
for ( int i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) {
|
|
value += histBuffer[ bin*HIST_REDUNDANCY+i];
|
|
}
|
|
|
|
// reduction in local memory
|
|
__local int localHist[GROUP_SIZE];
|
|
localHist[get_local_id(0)] = value;
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
|
|
if (get_local_id(0) < stride) {
|
|
value = localHist[ get_local_id(0)+stride];
|
|
}
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
if (get_local_id(0) < stride) {
|
|
localHist[ get_local_id(0)] += value;
|
|
}
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
}
|
|
|
|
// write reduction to final result
|
|
if (get_local_id(0) == 0) {
|
|
histResult[get_group_id(0)] = localHist[0];
|
|
}
|
|
} // kernel_HistogramRectOneChannelReduction
|
|
)
|
|
|
|
|
|
KERNEL(
|
|
// unused
|
|
// each work group (x256) handles a histogram bin
|
|
\n __attribute__((reqd_work_group_size(256, 1, 1)))
|
|
\n __kernel
|
|
\n void kernel_HistogramRectAllChannelsReduction_Grey(
|
|
\n int n, // pixel redundancy that needs to be accumulated
|
|
\n __global uint *histBuffer,
|
|
\n __global uint* histResult) { // each wg accumulates 1 bin
|
|
\n
|
|
\n /* declare variables */
|
|
\n
|
|
\n // work indices
|
|
\n size_t groupId = get_group_id(0);
|
|
\n size_t localId = get_local_id(0); // 0 -> 256-1
|
|
\n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
|
|
\n uint numThreads = get_global_size(0);
|
|
\n unsigned int hist = 0;
|
|
\n
|
|
\n /* accumulate in global memory */
|
|
\n for ( uint p = 0; p < n; p+=GROUP_SIZE) {
|
|
\n hist += histBuffer[ (get_group_id(0)*n + p)];
|
|
\n }
|
|
\n
|
|
\n /* reduction in local memory */
|
|
\n // populate local memory
|
|
\n __local unsigned int localHist[GROUP_SIZE];
|
|
|
|
\n localHist[localId] = hist;
|
|
\n barrier(CLK_LOCAL_MEM_FENCE);
|
|
\n
|
|
\n for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
|
|
\n if (localId < stride) {
|
|
\n hist = localHist[ (localId+stride)];
|
|
\n }
|
|
\n barrier(CLK_LOCAL_MEM_FENCE);
|
|
\n if (localId < stride) {
|
|
\n localHist[ localId] += hist;
|
|
\n }
|
|
\n barrier(CLK_LOCAL_MEM_FENCE);
|
|
\n }
|
|
\n
|
|
\n if (localId == 0)
|
|
\n histResult[get_group_id(0)] = localHist[0];
|
|
\n
|
|
\n } // kernel_HistogramRectAllChannelsReduction_Grey
|
|
|
|
)
|
|
|
|
// ThresholdRectToPix Kernel
|
|
// only supports 4 channels
|
|
// imageData is input image (24-bits/pixel)
|
|
// pix is output image (1-bit/pixel)
|
|
KERNEL(
|
|
\n#define CHAR_VEC_WIDTH 4 \n
|
|
\n#define PIXELS_PER_WORD 32 \n
|
|
\n#define PIXELS_PER_BURST 8 \n
|
|
\n#define BURSTS_PER_WORD (PIXELS_PER_WORD/PIXELS_PER_BURST) \n
|
|
typedef union {
|
|
uchar s[PIXELS_PER_BURST*NUM_CHANNELS];
|
|
uchar4 v[(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH];
|
|
} charVec;
|
|
|
|
__attribute__((reqd_work_group_size(256, 1, 1)))
|
|
__kernel
|
|
void kernel_ThresholdRectToPix(
|
|
__global const uchar4 *imageData,
|
|
int height,
|
|
int width,
|
|
int wpl, // words per line
|
|
__global int *thresholds,
|
|
__global int *hi_values,
|
|
__global int *pix) {
|
|
|
|
// declare variables
|
|
uint pad = PIXELS_PER_WORD * wpl - width;//number of padding bits at the end of each output line
|
|
int pThresholds[NUM_CHANNELS];
|
|
int pHi_Values[NUM_CHANNELS];
|
|
for ( int i = 0; i < NUM_CHANNELS; i++) {
|
|
pThresholds[i] = thresholds[i];
|
|
pHi_Values[i] = hi_values[i];
|
|
}
|
|
|
|
// for each word (32 pixels) in output image
|
|
for ( uint w = get_global_id(0); w < wpl*height; w += get_global_size(0) ) {
|
|
unsigned int word = 0; // all bits start at zero
|
|
//decrease the pixel index for the padding at the end of each output line (=number of lines * padding)
|
|
uint pxIdxOffset = ( w / wpl) * pad;// = ( ( PIXELS_PER_WORD * w) / ( width + pad)) * pad;
|
|
// for each burst in word
|
|
for ( int b = 0; b < BURSTS_PER_WORD; b++) {
|
|
// load burst
|
|
charVec pixels;
|
|
for ( int i = 0; i < (PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH; i++ ) {
|
|
pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH) + i - pxIdxOffset];
|
|
}
|
|
|
|
// for each pixel in burst
|
|
for ( int p = 0; p < PIXELS_PER_BURST; p++) {
|
|
for ( int c = 0; c < NUM_CHANNELS; c++) {
|
|
unsigned char pixChan = pixels.s[p*NUM_CHANNELS + c];
|
|
if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
|
|
word |= (((uint)0x80000000) >> ((b*PIXELS_PER_BURST+p)&31));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
pix[w] = word;
|
|
}
|
|
}
|
|
|
|
// only supports 1 channel
|
|
typedef union {
|
|
uchar s[PIXELS_PER_BURST];
|
|
uchar8 v[(PIXELS_PER_BURST)/CHAR_VEC_WIDTH];
|
|
} charVec1;
|
|
|
|
__attribute__((reqd_work_group_size(256, 1, 1)))
|
|
__kernel
|
|
void kernel_ThresholdRectToPix_OneChan(
|
|
__global const uchar8 *imageData,
|
|
int height,
|
|
int width,
|
|
int wpl, // words per line
|
|
__global int *thresholds,
|
|
__global int *hi_values,
|
|
__global int *pix) {
|
|
|
|
// declare variables
|
|
int pThresholds[1];
|
|
int pHi_Values[1];
|
|
for ( int i = 0; i < 1; i++) {
|
|
pThresholds[i] = thresholds[i];
|
|
pHi_Values[i] = hi_values[i];
|
|
}
|
|
|
|
// for each word (32 pixels) in output image
|
|
for ( uint w = get_global_id(0); w < wpl*height; w += get_global_size(0) ) {
|
|
unsigned int word = 0; // all bits start at zero
|
|
|
|
// for each burst in word
|
|
for ( int b = 0; b < BURSTS_PER_WORD; b++) {
|
|
|
|
// load burst
|
|
charVec1 pixels;
|
|
for ( int i = 0; i < (PIXELS_PER_BURST)/CHAR_VEC_WIDTH; i++ ) {
|
|
pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + i];
|
|
}
|
|
|
|
// for each pixel in burst
|
|
for ( int p = 0; p < PIXELS_PER_BURST; p++) {
|
|
for ( int c = 0; c < 1; c++) {
|
|
unsigned char pixChan = pixels.s[p + c];
|
|
if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
|
|
word |= (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
pix[w] = word;
|
|
}
|
|
}
|
|
)
|
|
|
|
; // close char*
|
|
|
|
#endif // USE_EXTERNAL_KERNEL
|
|
#endif //_OCL_KERNEL_H_
|
|
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|
|
|
|
// Alternative histogram kernel written to use uchar and different global memory scattered write
|
|
// was a little better for intel platforms but still not faster then native serial code
|
|
#if 0
|
|
/* data layed out as
|
|
bin0 bin1 bin2...
|
|
r,g,b,a,r,g,b,a,r,g,b,a nthreads/4 copies
|
|
*/
|
|
\n__attribute__((reqd_work_group_size(256, 1, 1)))
|
|
\n __kernel
|
|
\n void kernel_HistogramRectAllChannels_uchar(
|
|
\n volatile __global const uchar *data,
|
|
\n uint numPixels,
|
|
\n volatile __global uint *histBuffer) {
|
|
\n
|
|
\n // for each pixel/channel, accumulate in global memory
|
|
\n for ( uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS; pc += get_global_size(0) ) {
|
|
\n uchar value = data[pc];
|
|
\n int idx = value*get_global_size(0) + get_global_id(0);
|
|
\n histBuffer[ idx ]++; // coalesced if same value
|
|
\n }
|
|
\n } // kernel_HistogramRectAllChannels
|
|
\n
|
|
\n __attribute__((reqd_work_group_size(256, 1, 1)))
|
|
\n __kernel
|
|
\n void kernel_HistogramRectAllChannelsReduction_uchar(
|
|
\n int n, // pixel redundancy that needs to be accumulated = nthreads/4
|
|
\n __global uint4 *histBuffer,
|
|
\n __global uint* histResult) { // each wg accumulates 1 bin (all channels within it
|
|
\n
|
|
\n // declare variables
|
|
\n int binIdx = get_group_id(0);
|
|
\n size_t groupId = get_group_id(0);
|
|
\n size_t localId = get_local_id(0); // 0 -> 256-1
|
|
\n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
|
|
\n uint numThreads = get_global_size(0);
|
|
\n uint4 hist = {0, 0, 0, 0};
|
|
\n
|
|
\n // accumulate in register
|
|
\n for ( uint p = get_local_id(0); p < n; p+=GROUP_SIZE) {
|
|
\n hist += histBuffer[binIdx*n+p];
|
|
\n }
|
|
\n
|
|
\n // reduction in local memory
|
|
\n __local uint4 localHist[GROUP_SIZE];
|
|
\n localHist[localId] = hist;
|
|
\n barrier(CLK_LOCAL_MEM_FENCE);
|
|
\n
|
|
\n for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
|
|
\n if (localId < stride) {
|
|
\n hist = localHist[ localId+stride];
|
|
\n }
|
|
\n barrier(CLK_LOCAL_MEM_FENCE);
|
|
\n if (localId < stride) {
|
|
\n localHist[ localId] += hist;
|
|
\n }
|
|
\n barrier(CLK_LOCAL_MEM_FENCE);
|
|
\n }
|
|
\n
|
|
\n // write reduction to final result
|
|
\n if (localId == 0) {
|
|
\n histResult[0*HIST_SIZE+binIdx] = localHist[0].s0;
|
|
\n histResult[1*HIST_SIZE+binIdx] = localHist[0].s1;
|
|
\n histResult[2*HIST_SIZE+binIdx] = localHist[0].s2;
|
|
\n histResult[3*HIST_SIZE+binIdx] = localHist[0].s3;
|
|
\n }
|
|
\n
|
|
\n } // kernel_HistogramRectAllChannels
|
|
#endif
|