tesseract/opencl/oclkernels.h

1209 lines
34 KiB
C
Raw Normal View History

#ifndef _OCL_KERNEL_H_
#define _OCL_KERNEL_H_
#ifndef USE_EXTERNAL_KERNEL
#define KERNEL( ... )# __VA_ARGS__ "\n"
// Double precision is a default of spreadsheets
// cl_khr_fp64: Khronos extension
// cl_amd_fp64: AMD extension
// use build option outside to define fp_t
/////////////////////////////////////////////
const char *kernel_src = KERNEL(
\n#ifdef KHR_DP_EXTENSION\n
\n#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n
\n#elif AMD_DP_EXTENSION\n
\n#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n
\n#else\n
\n#endif\n
__kernel void composeRGBPixel(__global uint *tiffdata, int w, int h,int wpl, __global uint *output)
{
int i = get_global_id(1);
int j = get_global_id(0);
int tiffword,rval,gval,bval;
//Ignore the excess
if ((i >= h) || (j >= w))
return;
tiffword = tiffdata[i * w + j];
rval = ((tiffword) & 0xff);
gval = (((tiffword) >> 8) & 0xff);
bval = (((tiffword) >> 16) & 0xff);
output[i*wpl+j] = (rval << (8 * (sizeof(uint) - 1 - 0))) | (gval << (8 * (sizeof(uint) - 1 - 1))) | (bval << (8 * (sizeof(uint) - 1 - 2)));
}
)
KERNEL(
\n__kernel void pixSubtract_inplace(__global int *dword, __global int *sword,
const int wpl, const int h)
{
const unsigned int row = get_global_id(1);
const unsigned int col = get_global_id(0);
const unsigned int pos = row * wpl + col;
//Ignore the execss
if (row >= h || col >= wpl)
return;
*(dword + pos) &= ~(*(sword + pos));
}\n
)
KERNEL(
\n__kernel void pixSubtract(__global int *dword, __global int *sword,
const int wpl, const int h, __global int *outword)
{
const unsigned int row = get_global_id(1);
const unsigned int col = get_global_id(0);
const unsigned int pos = row * wpl + col;
//Ignore the execss
if (row >= h || col >= wpl)
return;
*(outword + pos) = *(dword + pos) & ~(*(sword + pos));
}\n
)
KERNEL(
\n__kernel void pixAND(__global int *dword, __global int *sword, __global int *outword,
const int wpl, const int h)
{
const unsigned int row = get_global_id(1);
const unsigned int col = get_global_id(0);
const unsigned int pos = row * wpl + col;
//Ignore the execss
if (row >= h || col >= wpl)
return;
*(outword + pos) = *(dword + pos) & (*(sword + pos));
}\n
)
KERNEL(
\n__kernel void pixOR(__global int *dword, __global int *sword, __global int *outword,
const int wpl, const int h)
{
const unsigned int row = get_global_id(1);
const unsigned int col = get_global_id(0);
const unsigned int pos = row * wpl + col;
//Ignore the execss
if (row >= h || col >= wpl)
return;
*(outword + pos) = *(dword + pos) | (*(sword + pos));
}\n
)
KERNEL(
\n__kernel void morphoDilateHor_5x5(__global int *sword,__global int *dword,
const int wpl, const int h)
{
const unsigned int pos = get_global_id(0);
unsigned int prevword, nextword, currword,tempword;
unsigned int destword;
const int col = pos % wpl;
//Ignore the execss
if (pos >= (wpl * h))
return;
currword = *(sword + pos);
destword = currword;
//Handle boundary conditions
if(col==0)
prevword=0;
else
prevword = *(sword + pos - 1);
if(col==(wpl - 1))
nextword=0;
else
nextword = *(sword + pos + 1);
//Loop unrolled
//1 bit to left and 1 bit to right
//Get the max value on LHS of every pixel
tempword = (prevword << (31)) | ((currword >> 1));
destword |= tempword;
//Get max value on RHS of every pixel
tempword = (currword << 1) | (nextword >> (31));
destword |= tempword;
//2 bit to left and 2 bit to right
//Get the max value on LHS of every pixel
tempword = (prevword << (30)) | ((currword >> 2));
destword |= tempword;
//Get max value on RHS of every pixel
tempword = (currword << 2) | (nextword >> (30));
destword |= tempword;
*(dword + pos) = destword;
}\n
)
KERNEL(
\n__kernel void morphoDilateVer_5x5(__global int *sword,__global int *dword,
const int wpl, const int h)
{
const int col = get_global_id(0);
const int row = get_global_id(1);
const unsigned int pos = row * wpl + col;
unsigned int tempword;
unsigned int destword;
int i;
//Ignore the execss
if (row >= h || col >= wpl)
return;
destword = *(sword + pos);
//2 words above
i = (row - 2) < 0 ? row : (row - 2);
tempword = *(sword + i*wpl + col);
destword |= tempword;
//1 word above
i = (row - 1) < 0 ? row : (row - 1);
tempword = *(sword + i*wpl + col);
destword |= tempword;
//1 word below
i = (row >= (h - 1)) ? row : (row + 1);
tempword = *(sword + i*wpl + col);
destword |= tempword;
//2 words below
i = (row >= (h - 2)) ? row : (row + 2);
tempword = *(sword + i*wpl + col);
destword |= tempword;
*(dword + pos) = destword;
}\n
)
KERNEL(
\n__kernel void morphoDilateHor(__global int *sword,__global int *dword,const int xp, const int xn, const int wpl, const int h)
{
const int col = get_global_id(0);
const int row = get_global_id(1);
const unsigned int pos = row * wpl + col;
unsigned int parbitsxp, parbitsxn, nwords;
unsigned int destword, tempword, lastword, currword;
unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
int i, j, siter, eiter;
//Ignore the execss
if (pos >= (wpl*h) || (xn < 1 && xp < 1))
return;
currword = *(sword + pos);
destword = currword;
parbitsxp = xp & 31;
parbitsxn = xn & 31;
nwords = xp >> 5;
if (parbitsxp > 0)
nwords += 1;
else
parbitsxp = 31;
siter = (col - nwords);
eiter = (col + nwords);
//Get prev word
if (col==0)
firstword = 0x0;
else
firstword = *(sword + pos - 1);
//Get next word
if (col == (wpl - 1))
secondword = 0x0;
else
secondword = *(sword + pos + 1);
//Last partial bits on either side
for (i = 1; i <= parbitsxp; i++)
{
//Get the max value on LHS of every pixel
tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0x0 : (firstword << (32-i)) | ((currword >> i));
destword |= tempword;
//Get max value on RHS of every pixel
tempword = (currword << i) | (secondword >> (32 - i));
destword |= tempword;
}
//Return if halfwidth <= 1 word
if (nwords == 1)
{
if (xn == 32)
{
destword |= firstword;
}
if (xp == 32)
{
destword |= secondword;
}
*(dword + pos) = destword;
return;
}
if (siter < 0)
firstword = 0x0;
else
firstword = *(sword + row*wpl + siter);
if (eiter >= wpl)
lastword = 0x0;
else
lastword = *(sword + row*wpl + eiter);
for ( i = 1; i < nwords; i++)
{
//Gets LHS words
if ((siter + i) < 0)
secondword = 0x0;
else
secondword = *(sword + row*wpl + siter + i);
lprevword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
firstword = secondword;
if ((siter + i + 1) < 0)
secondword = 0x0;
else
secondword = *(sword + row*wpl + siter + i + 1);
lnextword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
//Gets RHS words
if ((eiter - i) >= wpl)
firstword = 0x0;
else
firstword = *(sword + row*wpl + eiter - i);
rnextword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
lastword = firstword;
if ((eiter - i - 1) >= wpl)
firstword = 0x0;
else
firstword = *(sword + row*wpl + eiter - i - 1);
rprevword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
for (j = 1; j < 32; j++)
{
//OR LHS full words
tempword = (lprevword << j) | (lnextword >> (32 - j));
destword |= tempword;
//OR RHS full words
tempword = (rprevword << j) | (rnextword >> (32 - j));
destword |= tempword;
}
destword |= lprevword;
destword |= lnextword;
destword |= rprevword;
destword |= rnextword;
lastword = firstword;
firstword = secondword;
}
*(dword + pos) = destword;
}\n
)
KERNEL(
\n__kernel void morphoDilateHor_32word(__global int *sword,__global int *dword,
const int halfwidth,
const int wpl, const int h,
const char isEven)
{
const int col = get_global_id(0);
const int row = get_global_id(1);
const unsigned int pos = row * wpl + col;
unsigned int prevword, nextword, currword,tempword;
unsigned int destword;
int i;
//Ignore the execss
if (pos >= (wpl * h))
return;
currword = *(sword + pos);
destword = currword;
//Handle boundary conditions
if(col==0)
prevword=0;
else
prevword = *(sword + pos - 1);
if(col==(wpl - 1))
nextword=0;
else
nextword = *(sword + pos + 1);
for (i = 1; i <= halfwidth; i++)
{
//Get the max value on LHS of every pixel
if (i == halfwidth && isEven)
{
tempword = 0x0;
}
else
{
tempword = (prevword << (32-i)) | ((currword >> i));
}
destword |= tempword;
//Get max value on RHS of every pixel
tempword = (currword << i) | (nextword >> (32 - i));
destword |= tempword;
}
*(dword + pos) = destword;
}\n
)
KERNEL(
\n__kernel void morphoDilateVer(__global int *sword,__global int *dword,
const int yp,
const int wpl, const int h,
const int yn)
{
const int col = get_global_id(0);
const int row = get_global_id(1);
const unsigned int pos = row * wpl + col;
unsigned int tempword;
unsigned int destword;
int i, siter, eiter;
//Ignore the execss
if (row >= h || col >= wpl)
return;
destword = *(sword + pos);
//Set start position and end position considering the boundary conditions
siter = (row - yn) < 0 ? 0 : (row - yn);
eiter = (row >= (h - yp)) ? (h - 1) : (row + yp);
for (i = siter; i <= eiter; i++)
{
tempword = *(sword + i*wpl + col);
destword |= tempword;
}
*(dword + pos) = destword;
}\n
)
KERNEL(
\n__kernel void morphoErodeHor_5x5(__global int *sword,__global int *dword,
const int wpl, const int h)
{
const unsigned int pos = get_global_id(0);
unsigned int prevword, nextword, currword,tempword;
unsigned int destword;
const int col = pos % wpl;
//Ignore the execss
if (pos >= (wpl * h))
return;
currword = *(sword + pos);
destword = currword;
//Handle boundary conditions
if(col==0)
prevword=0xffffffff;
else
prevword = *(sword + pos - 1);
if(col==(wpl - 1))
nextword=0xffffffff;
else
nextword = *(sword + pos + 1);
//Loop unrolled
//1 bit to left and 1 bit to right
//Get the min value on LHS of every pixel
tempword = (prevword << (31)) | ((currword >> 1));
destword &= tempword;
//Get min value on RHS of every pixel
tempword = (currword << 1) | (nextword >> (31));
destword &= tempword;
//2 bit to left and 2 bit to right
//Get the min value on LHS of every pixel
tempword = (prevword << (30)) | ((currword >> 2));
destword &= tempword;
//Get min value on RHS of every pixel
tempword = (currword << 2) | (nextword >> (30));
destword &= tempword;
*(dword + pos) = destword;
}\n
)
KERNEL(
\n__kernel void morphoErodeVer_5x5(__global int *sword,__global int *dword,
const int wpl, const int h,
const int fwmask, const int lwmask)
{
const int col = get_global_id(0);
const int row = get_global_id(1);
const unsigned int pos = row * wpl + col;
unsigned int tempword;
unsigned int destword;
int i;
//Ignore the execss
if (row >= h || col >= wpl)
return;
destword = *(sword + pos);
if (row < 2 || row >= (h - 2))
{
destword = 0x0;
}
else
{
//2 words above
//i = (row - 2) < 0 ? row : (row - 2);
i = (row - 2);
tempword = *(sword + i*wpl + col);
destword &= tempword;
//1 word above
//i = (row - 1) < 0 ? row : (row - 1);
i = (row - 1);
tempword = *(sword + i*wpl + col);
destword &= tempword;
//1 word below
//i = (row >= (h - 1)) ? row : (row + 1);
i = (row + 1);
tempword = *(sword + i*wpl + col);
destword &= tempword;
//2 words below
//i = (row >= (h - 2)) ? row : (row + 2);
i = (row + 2);
tempword = *(sword + i*wpl + col);
destword &= tempword;
if (col == 0)
{
destword &= fwmask;
}
if (col == (wpl - 1))
{
destword &= lwmask;
}
}
*(dword + pos) = destword;
}\n
)
KERNEL(
\n__kernel void morphoErodeHor(__global int *sword,__global int *dword, const int xp, const int xn, const int wpl,
const int h, const char isAsymmetric, const int rwmask, const int lwmask)
{
const int col = get_global_id(0);
const int row = get_global_id(1);
const unsigned int pos = row * wpl + col;
unsigned int parbitsxp, parbitsxn, nwords;
unsigned int destword, tempword, lastword, currword;
unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
int i, j, siter, eiter;
//Ignore the execss
if (pos >= (wpl*h) || (xn < 1 && xp < 1))
return;
currword = *(sword + pos);
destword = currword;
parbitsxp = xp & 31;
parbitsxn = xn & 31;
nwords = xp >> 5;
if (parbitsxp > 0)
nwords += 1;
else
parbitsxp = 31;
siter = (col - nwords);
eiter = (col + nwords);
//Get prev word
if (col==0)
firstword = 0xffffffff;
else
firstword = *(sword + pos - 1);
//Get next word
if (col == (wpl - 1))
secondword = 0xffffffff;
else
secondword = *(sword + pos + 1);
//Last partial bits on either side
for (i = 1; i <= parbitsxp; i++)
{
//Get the max value on LHS of every pixel
tempword = (firstword << (32-i)) | ((currword >> i));
destword &= tempword;
//Get max value on RHS of every pixel
tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0xffffffff : (currword << i) | (secondword >> (32 - i));
//tempword = (currword << i) | (secondword >> (32 - i));
destword &= tempword;
}
//Return if halfwidth <= 1 word
if (nwords == 1)
{
if (xp == 32)
{
destword &= firstword;
}
if (xn == 32)
{
destword &= secondword;
}
//Clear boundary pixels
if (isAsymmetric)
{
if (col == 0)
destword &= rwmask;
if (col == (wpl - 1))
destword &= lwmask;
}
*(dword + pos) = destword;
return;
}
if (siter < 0)
firstword = 0xffffffff;
else
firstword = *(sword + row*wpl + siter);
if (eiter >= wpl)
lastword = 0xffffffff;
else
lastword = *(sword + row*wpl + eiter);
for ( i = 1; i < nwords; i++)
{
//Gets LHS words
if ((siter + i) < 0)
secondword = 0xffffffff;
else
secondword = *(sword + row*wpl + siter + i);
lprevword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);
firstword = secondword;
if ((siter + i + 1) < 0)
secondword = 0xffffffff;
else
secondword = *(sword + row*wpl + siter + i + 1);
lnextword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);
//Gets RHS words
if ((eiter - i) >= wpl)
firstword = 0xffffffff;
else
firstword = *(sword + row*wpl + eiter - i);
rnextword = firstword << parbitsxn | lastword >> (32 - parbitsxn);
lastword = firstword;
if ((eiter - i - 1) >= wpl)
firstword = 0xffffffff;
else
firstword = *(sword + row*wpl + eiter - i - 1);
rprevword = firstword << parbitsxn | lastword >> (32 - parbitsxn);
for (j = 0; j < 32; j++)
{
//OR LHS full words
tempword = (lprevword << j) | (lnextword >> (32 - j));
destword &= tempword;
//OR RHS full words
tempword = (rprevword << j) | (rnextword >> (32 - j));
destword &= tempword;
}
destword &= lprevword;
destword &= lnextword;
destword &= rprevword;
destword &= rnextword;
lastword = firstword;
firstword = secondword;
}
if (isAsymmetric)
{
//Clear boundary pixels
if (col < (nwords - 1))
destword = 0x0;
else if (col == (nwords - 1))
destword &= rwmask;
else if (col > (wpl - nwords))
destword = 0x0;
else if (col == (wpl - nwords))
destword &= lwmask;
}
*(dword + pos) = destword;
}\n
)
KERNEL(
\n__kernel void morphoErodeHor_32word(__global int *sword,__global int *dword,
const int halfwidth, const int wpl,
const int h, const char clearBoundPixH,
const int rwmask, const int lwmask,
const char isEven)
{
const int col = get_global_id(0);
const int row = get_global_id(1);
const unsigned int pos = row * wpl + col;
unsigned int prevword, nextword, currword,tempword, destword;
int i;
//Ignore the execss
if (pos >= (wpl * h))
return;
currword = *(sword + pos);
destword = currword;
//Handle boundary conditions
if(col==0)
prevword=0xffffffff;
else
prevword = *(sword + pos - 1);
if(col==(wpl - 1))
nextword=0xffffffff;
else
nextword = *(sword + pos + 1);
for (i = 1; i <= halfwidth; i++)
{
//Get the min value on LHS of every pixel
tempword = (prevword << (32-i)) | ((currword >> i));
destword &= tempword;
//Get min value on RHS of every pixel
if (i == halfwidth && isEven)
{
tempword = 0xffffffff;
}
else
{
tempword = (currword << i) | (nextword >> (32 - i));
}
destword &= tempword;
}
if (clearBoundPixH)
{
if (col == 0)
{
destword &= rwmask;
}
else if (col == (wpl - 1))
{
destword &= lwmask;
}
}
*(dword + pos) = destword;
}\n
)
KERNEL(
\n__kernel void morphoErodeVer(__global int *sword,__global int *dword,
const int yp,
const int wpl, const int h,
const char clearBoundPixV, const int yn)
{
const int col = get_global_id(0);
const int row = get_global_id(1);
const unsigned int pos = row * wpl + col;
unsigned int tempword, destword;
int i, siter, eiter;
//Ignore the execss
if (row >= h || col >= wpl)
return;
destword = *(sword + pos);
//Set start position and end position considering the boundary conditions
siter = (row - yp) < 0 ? 0 : (row - yp);
eiter = (row >= (h - yn)) ? (h - 1) : (row + yn);
for (i = siter; i <= eiter; i++)
{
tempword = *(sword + i*wpl + col);
destword &= tempword;
}
//Clear boundary pixels
if (clearBoundPixV && ((row < yp) || ((h - row) <= yn)))
{
destword = 0x0;
}
*(dword + pos) = destword;
}\n
)
// HistogramRect Kernel: Accumulate
// assumes 4 channels, i.e., bytes_per_pixel = 4
// assumes number of pixels is multiple of 8
// data is laid out as
// ch0 ch1 ...
// bin0 bin1 bin2... bin0...
// rpt0,1,2...256 rpt0,1,2...
KERNEL(
\n#define HIST_REDUNDANCY 256\n
\n#define GROUP_SIZE 256\n
\n#define HIST_SIZE 256\n
\n#define NUM_CHANNELS 4\n
\n#define HR_UNROLL_SIZE 8 \n
\n#define HR_UNROLL_TYPE uchar8 \n
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_HistogramRectAllChannels(
__global const uchar8 *data,
uint numPixels,
__global uint *histBuffer) {
// declare variables
uchar8 pixels;
int threadOffset = get_global_id(0)%HIST_REDUNDANCY;
// for each pixel/channel, accumulate in global memory
for ( uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS/HR_UNROLL_SIZE; pc += get_global_size(0) ) {
pixels = data[pc];
// channel bin thread
atomic_inc( &histBuffer[ 0*HIST_SIZE*HIST_REDUNDANCY + pixels.s0*HIST_REDUNDANCY + threadOffset ]); // ch0
atomic_inc( &histBuffer[ 0*HIST_SIZE*HIST_REDUNDANCY + pixels.s4*HIST_REDUNDANCY + threadOffset ]); // ch0
atomic_inc( &histBuffer[ 1*HIST_SIZE*HIST_REDUNDANCY + pixels.s1*HIST_REDUNDANCY + threadOffset ]); // ch1
atomic_inc( &histBuffer[ 1*HIST_SIZE*HIST_REDUNDANCY + pixels.s5*HIST_REDUNDANCY + threadOffset ]); // ch1
atomic_inc( &histBuffer[ 2*HIST_SIZE*HIST_REDUNDANCY + pixels.s2*HIST_REDUNDANCY + threadOffset ]); // ch2
atomic_inc( &histBuffer[ 2*HIST_SIZE*HIST_REDUNDANCY + pixels.s6*HIST_REDUNDANCY + threadOffset ]); // ch2
atomic_inc( &histBuffer[ 3*HIST_SIZE*HIST_REDUNDANCY + pixels.s3*HIST_REDUNDANCY + threadOffset ]); // ch3
atomic_inc( &histBuffer[ 3*HIST_SIZE*HIST_REDUNDANCY + pixels.s7*HIST_REDUNDANCY + threadOffset ]); // ch3
}
}
)
KERNEL(
// NUM_CHANNELS = 1
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_HistogramRectOneChannel(
__global const uchar8 *data,
uint numPixels,
__global uint *histBuffer) {
// declare variables
uchar8 pixels;
int threadOffset = get_global_id(0)%HIST_REDUNDANCY;
// for each pixel/channel, accumulate in global memory
for ( uint pc = get_global_id(0); pc < numPixels/HR_UNROLL_SIZE; pc += get_global_size(0) ) {
pixels = data[pc];
// bin thread
atomic_inc( &histBuffer[ pixels.s0*HIST_REDUNDANCY + threadOffset ]);
atomic_inc( &histBuffer[ pixels.s1*HIST_REDUNDANCY + threadOffset ]);
atomic_inc( &histBuffer[ pixels.s2*HIST_REDUNDANCY + threadOffset ]);
atomic_inc( &histBuffer[ pixels.s3*HIST_REDUNDANCY + threadOffset ]);
atomic_inc( &histBuffer[ pixels.s4*HIST_REDUNDANCY + threadOffset ]);
atomic_inc( &histBuffer[ pixels.s5*HIST_REDUNDANCY + threadOffset ]);
atomic_inc( &histBuffer[ pixels.s6*HIST_REDUNDANCY + threadOffset ]);
atomic_inc( &histBuffer[ pixels.s7*HIST_REDUNDANCY + threadOffset ]);
}
}
)
KERNEL(
// unused
\n __attribute__((reqd_work_group_size(256, 1, 1)))
\n __kernel
\n void kernel_HistogramRectAllChannels_Grey(
\n __global const uchar* data,
\n uint numPixels,
\n __global uint *histBuffer) { // each wg will write HIST_SIZE*NUM_CHANNELS into this result; cpu will accumulate across wg's
\n
\n /* declare variables */
\n
\n // work indices
\n size_t groupId = get_group_id(0);
\n size_t localId = get_local_id(0); // 0 -> 256-1
\n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
\n uint numThreads = get_global_size(0);
\n
\n /* accumulate in global memory */
\n for ( uint pc = get_global_id(0); pc < numPixels; pc += get_global_size(0) ) {
\n uchar value = data[ pc ];
\n int idx = value * get_global_size(0) + get_global_id(0);
\n histBuffer[ idx ]++;
\n
\n }
\n
\n } // kernel_HistogramRectAllChannels_Grey
)
// HistogramRect Kernel: Reduction
// only supports 4 channels
// each work group handles a single channel of a single histogram bin
KERNEL(
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_HistogramRectAllChannelsReduction(
int n, // unused pixel redundancy
__global uint *histBuffer,
__global int* histResult) {
// declare variables
int channel = get_group_id(0)/HIST_SIZE;
int bin = get_group_id(0)%HIST_SIZE;
int value = 0;
// accumulate in register
for ( uint i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) {
value += histBuffer[ channel*HIST_SIZE*HIST_REDUNDANCY+bin*HIST_REDUNDANCY+i];
}
// reduction in local memory
__local int localHist[GROUP_SIZE];
localHist[get_local_id(0)] = value;
barrier(CLK_LOCAL_MEM_FENCE);
for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
if (get_local_id(0) < stride) {
value = localHist[ get_local_id(0)+stride];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < stride) {
localHist[ get_local_id(0)] += value;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// write reduction to final result
if (get_local_id(0) == 0) {
histResult[get_group_id(0)] = localHist[0];
}
} // kernel_HistogramRectAllChannels
)
KERNEL(
// NUM_CHANNELS = 1
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_HistogramRectOneChannelReduction(
int n, // unused pixel redundancy
__global uint *histBuffer,
__global int* histResult) {
// declare variables
// int channel = get_group_id(0)/HIST_SIZE;
int bin = get_group_id(0)%HIST_SIZE;
int value = 0;
// accumulate in register
for ( int i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) {
value += histBuffer[ bin*HIST_REDUNDANCY+i];
}
// reduction in local memory
__local int localHist[GROUP_SIZE];
localHist[get_local_id(0)] = value;
barrier(CLK_LOCAL_MEM_FENCE);
for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
if (get_local_id(0) < stride) {
value = localHist[ get_local_id(0)+stride];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < stride) {
localHist[ get_local_id(0)] += value;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// write reduction to final result
if (get_local_id(0) == 0) {
histResult[get_group_id(0)] = localHist[0];
}
} // kernel_HistogramRectOneChannelReduction
)
KERNEL(
// unused
// each work group (x256) handles a histogram bin
\n __attribute__((reqd_work_group_size(256, 1, 1)))
\n __kernel
\n void kernel_HistogramRectAllChannelsReduction_Grey(
\n int n, // pixel redundancy that needs to be accumulated
\n __global uint *histBuffer,
\n __global uint* histResult) { // each wg accumulates 1 bin
\n
\n /* declare variables */
\n
\n // work indices
\n size_t groupId = get_group_id(0);
\n size_t localId = get_local_id(0); // 0 -> 256-1
\n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
\n uint numThreads = get_global_size(0);
\n unsigned int hist = 0;
\n
\n /* accumulate in global memory */
\n for ( uint p = 0; p < n; p+=GROUP_SIZE) {
\n hist += histBuffer[ (get_group_id(0)*n + p)];
\n }
\n
\n /* reduction in local memory */
\n // populate local memory
\n __local unsigned int localHist[GROUP_SIZE];
\n localHist[localId] = hist;
\n barrier(CLK_LOCAL_MEM_FENCE);
\n
\n for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
\n if (localId < stride) {
\n hist = localHist[ (localId+stride)];
\n }
\n barrier(CLK_LOCAL_MEM_FENCE);
\n if (localId < stride) {
\n localHist[ localId] += hist;
\n }
\n barrier(CLK_LOCAL_MEM_FENCE);
\n }
\n
\n if (localId == 0)
\n histResult[get_group_id(0)] = localHist[0];
\n
\n } // kernel_HistogramRectAllChannelsReduction_Grey
)
// ThresholdRectToPix Kernel
// only supports 4 channels
// imageData is input image (24-bits/pixel)
// pix is output image (1-bit/pixel)
KERNEL(
Issue 1351: OpenCL build - kernel_ThresholdRectToPix() not accounting for padding bits in the output pix?! https://code.google.com/p/tesseract-ocr/issues/detail?id=1351 What steps will reproduce the problem? 1.Use tesseract build with OpenCL. 2.Pass full color image with width which is not multiple of 32. 3.Recognition is way too slow and does not recognize anything. I read the article on http://www.sk-spell.sk.cx/tesseract-meets-the-opencl-first-test and decided to give OCL a try. The initial result was as per point 3 above. After some debugging I figured the problem is that the OCL version of threshold rect generation does not account for padding bits in the output pix lines. To prove my discovery I made a quick fix in oclkernels.h replacing the definition of kernel_ThresholdRectToPix Just a reminder: it is necessary to force OCL kernel recompilation after changing this source (e.g. delete “kernel - <device>.bin” from the exec folder). The fix is working but I am not sure about it since the original source apparently works for other people (as per the article). If I am right the OS/GPU are irrelevant since the bug is algorithmic, but mine are Windows/AMD. Also similar fix is applicable to kernel_ThresholdRectToPix_OneChan(), but there the input array might have some padding bytes as well, so its indexing will need further adjustments. I can come with some prove/fix for it either - I have not played with it yet. Disclaimer: I have no prior experience with image processing and tesseract source or with GPU computing and OpenCL (but please do explain if I am wrong).
2014-10-23 00:28:50 +08:00
\n#define CHAR_VEC_WIDTH 4 \n
\n#define PIXELS_PER_WORD 32 \n
\n#define PIXELS_PER_BURST 8 \n
\n#define BURSTS_PER_WORD (PIXELS_PER_WORD/PIXELS_PER_BURST) \n
typedef union {
uchar s[PIXELS_PER_BURST*NUM_CHANNELS];
Issue 1351: OpenCL build - kernel_ThresholdRectToPix() not accounting for padding bits in the output pix?! https://code.google.com/p/tesseract-ocr/issues/detail?id=1351 What steps will reproduce the problem? 1.Use tesseract build with OpenCL. 2.Pass full color image with width which is not multiple of 32. 3.Recognition is way too slow and does not recognize anything. I read the article on http://www.sk-spell.sk.cx/tesseract-meets-the-opencl-first-test and decided to give OCL a try. The initial result was as per point 3 above. After some debugging I figured the problem is that the OCL version of threshold rect generation does not account for padding bits in the output pix lines. To prove my discovery I made a quick fix in oclkernels.h replacing the definition of kernel_ThresholdRectToPix Just a reminder: it is necessary to force OCL kernel recompilation after changing this source (e.g. delete “kernel - <device>.bin” from the exec folder). The fix is working but I am not sure about it since the original source apparently works for other people (as per the article). If I am right the OS/GPU are irrelevant since the bug is algorithmic, but mine are Windows/AMD. Also similar fix is applicable to kernel_ThresholdRectToPix_OneChan(), but there the input array might have some padding bytes as well, so its indexing will need further adjustments. I can come with some prove/fix for it either - I have not played with it yet. Disclaimer: I have no prior experience with image processing and tesseract source or with GPU computing and OpenCL (but please do explain if I am wrong).
2014-10-23 00:28:50 +08:00
uchar4 v[(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH];
} charVec;
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_ThresholdRectToPix(
Issue 1351: OpenCL build - kernel_ThresholdRectToPix() not accounting for padding bits in the output pix?! https://code.google.com/p/tesseract-ocr/issues/detail?id=1351 What steps will reproduce the problem? 1.Use tesseract build with OpenCL. 2.Pass full color image with width which is not multiple of 32. 3.Recognition is way too slow and does not recognize anything. I read the article on http://www.sk-spell.sk.cx/tesseract-meets-the-opencl-first-test and decided to give OCL a try. The initial result was as per point 3 above. After some debugging I figured the problem is that the OCL version of threshold rect generation does not account for padding bits in the output pix lines. To prove my discovery I made a quick fix in oclkernels.h replacing the definition of kernel_ThresholdRectToPix Just a reminder: it is necessary to force OCL kernel recompilation after changing this source (e.g. delete “kernel - <device>.bin” from the exec folder). The fix is working but I am not sure about it since the original source apparently works for other people (as per the article). If I am right the OS/GPU are irrelevant since the bug is algorithmic, but mine are Windows/AMD. Also similar fix is applicable to kernel_ThresholdRectToPix_OneChan(), but there the input array might have some padding bytes as well, so its indexing will need further adjustments. I can come with some prove/fix for it either - I have not played with it yet. Disclaimer: I have no prior experience with image processing and tesseract source or with GPU computing and OpenCL (but please do explain if I am wrong).
2014-10-23 00:28:50 +08:00
__global const uchar4 *imageData,
int height,
int width,
int wpl, // words per line
__global int *thresholds,
__global int *hi_values,
__global int *pix) {
// declare variables
Issue 1351: OpenCL build - kernel_ThresholdRectToPix() not accounting for padding bits in the output pix?! https://code.google.com/p/tesseract-ocr/issues/detail?id=1351 What steps will reproduce the problem? 1.Use tesseract build with OpenCL. 2.Pass full color image with width which is not multiple of 32. 3.Recognition is way too slow and does not recognize anything. I read the article on http://www.sk-spell.sk.cx/tesseract-meets-the-opencl-first-test and decided to give OCL a try. The initial result was as per point 3 above. After some debugging I figured the problem is that the OCL version of threshold rect generation does not account for padding bits in the output pix lines. To prove my discovery I made a quick fix in oclkernels.h replacing the definition of kernel_ThresholdRectToPix Just a reminder: it is necessary to force OCL kernel recompilation after changing this source (e.g. delete “kernel - <device>.bin” from the exec folder). The fix is working but I am not sure about it since the original source apparently works for other people (as per the article). If I am right the OS/GPU are irrelevant since the bug is algorithmic, but mine are Windows/AMD. Also similar fix is applicable to kernel_ThresholdRectToPix_OneChan(), but there the input array might have some padding bytes as well, so its indexing will need further adjustments. I can come with some prove/fix for it either - I have not played with it yet. Disclaimer: I have no prior experience with image processing and tesseract source or with GPU computing and OpenCL (but please do explain if I am wrong).
2014-10-23 00:28:50 +08:00
uint pad = PIXELS_PER_WORD * wpl - width;//number of padding bits at the end of each output line
int pThresholds[NUM_CHANNELS];
int pHi_Values[NUM_CHANNELS];
for ( int i = 0; i < NUM_CHANNELS; i++) {
pThresholds[i] = thresholds[i];
pHi_Values[i] = hi_values[i];
}
// for each word (32 pixels) in output image
for ( uint w = get_global_id(0); w < wpl*height; w += get_global_size(0) ) {
unsigned int word = 0; // all bits start at zero
Issue 1351: OpenCL build - kernel_ThresholdRectToPix() not accounting for padding bits in the output pix?! https://code.google.com/p/tesseract-ocr/issues/detail?id=1351 What steps will reproduce the problem? 1.Use tesseract build with OpenCL. 2.Pass full color image with width which is not multiple of 32. 3.Recognition is way too slow and does not recognize anything. I read the article on http://www.sk-spell.sk.cx/tesseract-meets-the-opencl-first-test and decided to give OCL a try. The initial result was as per point 3 above. After some debugging I figured the problem is that the OCL version of threshold rect generation does not account for padding bits in the output pix lines. To prove my discovery I made a quick fix in oclkernels.h replacing the definition of kernel_ThresholdRectToPix Just a reminder: it is necessary to force OCL kernel recompilation after changing this source (e.g. delete “kernel - <device>.bin” from the exec folder). The fix is working but I am not sure about it since the original source apparently works for other people (as per the article). If I am right the OS/GPU are irrelevant since the bug is algorithmic, but mine are Windows/AMD. Also similar fix is applicable to kernel_ThresholdRectToPix_OneChan(), but there the input array might have some padding bytes as well, so its indexing will need further adjustments. I can come with some prove/fix for it either - I have not played with it yet. Disclaimer: I have no prior experience with image processing and tesseract source or with GPU computing and OpenCL (but please do explain if I am wrong).
2014-10-23 00:28:50 +08:00
//decrease the pixel index for the padding at the end of each output line (=number of lines * padding)
uint pxIdxOffset = ( w / wpl) * pad;// = ( ( PIXELS_PER_WORD * w) / ( width + pad)) * pad;
// for each burst in word
for ( int b = 0; b < BURSTS_PER_WORD; b++) {
// load burst
charVec pixels;
for ( int i = 0; i < (PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH; i++ ) {
Issue 1351: OpenCL build - kernel_ThresholdRectToPix() not accounting for padding bits in the output pix?! https://code.google.com/p/tesseract-ocr/issues/detail?id=1351 What steps will reproduce the problem? 1.Use tesseract build with OpenCL. 2.Pass full color image with width which is not multiple of 32. 3.Recognition is way too slow and does not recognize anything. I read the article on http://www.sk-spell.sk.cx/tesseract-meets-the-opencl-first-test and decided to give OCL a try. The initial result was as per point 3 above. After some debugging I figured the problem is that the OCL version of threshold rect generation does not account for padding bits in the output pix lines. To prove my discovery I made a quick fix in oclkernels.h replacing the definition of kernel_ThresholdRectToPix Just a reminder: it is necessary to force OCL kernel recompilation after changing this source (e.g. delete “kernel - <device>.bin” from the exec folder). The fix is working but I am not sure about it since the original source apparently works for other people (as per the article). If I am right the OS/GPU are irrelevant since the bug is algorithmic, but mine are Windows/AMD. Also similar fix is applicable to kernel_ThresholdRectToPix_OneChan(), but there the input array might have some padding bytes as well, so its indexing will need further adjustments. I can come with some prove/fix for it either - I have not played with it yet. Disclaimer: I have no prior experience with image processing and tesseract source or with GPU computing and OpenCL (but please do explain if I am wrong).
2014-10-23 00:28:50 +08:00
pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH) + i - pxIdxOffset];
}
// for each pixel in burst
for ( int p = 0; p < PIXELS_PER_BURST; p++) {
for ( int c = 0; c < NUM_CHANNELS; c++) {
unsigned char pixChan = pixels.s[p*NUM_CHANNELS + c];
if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
Issue 1351: OpenCL build - kernel_ThresholdRectToPix() not accounting for padding bits in the output pix?! https://code.google.com/p/tesseract-ocr/issues/detail?id=1351 What steps will reproduce the problem? 1.Use tesseract build with OpenCL. 2.Pass full color image with width which is not multiple of 32. 3.Recognition is way too slow and does not recognize anything. I read the article on http://www.sk-spell.sk.cx/tesseract-meets-the-opencl-first-test and decided to give OCL a try. The initial result was as per point 3 above. After some debugging I figured the problem is that the OCL version of threshold rect generation does not account for padding bits in the output pix lines. To prove my discovery I made a quick fix in oclkernels.h replacing the definition of kernel_ThresholdRectToPix Just a reminder: it is necessary to force OCL kernel recompilation after changing this source (e.g. delete “kernel - <device>.bin” from the exec folder). The fix is working but I am not sure about it since the original source apparently works for other people (as per the article). If I am right the OS/GPU are irrelevant since the bug is algorithmic, but mine are Windows/AMD. Also similar fix is applicable to kernel_ThresholdRectToPix_OneChan(), but there the input array might have some padding bytes as well, so its indexing will need further adjustments. I can come with some prove/fix for it either - I have not played with it yet. Disclaimer: I have no prior experience with image processing and tesseract source or with GPU computing and OpenCL (but please do explain if I am wrong).
2014-10-23 00:28:50 +08:00
word |= (((uint)0x80000000) >> ((b*PIXELS_PER_BURST+p)&31));
}
}
}
}
pix[w] = word;
}
}
\n#define CHAR_VEC_WIDTH 8 \n
\n#define PIXELS_PER_WORD 32 \n
\n#define PIXELS_PER_BURST 8 \n
\n#define BURSTS_PER_WORD (PIXELS_PER_WORD/PIXELS_PER_BURST) \n
typedef union {
uchar s[PIXELS_PER_BURST*1];
uchar8 v[(PIXELS_PER_BURST*1)/CHAR_VEC_WIDTH];
} charVec1;
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_ThresholdRectToPix_OneChan(
__global const uchar8 *imageData,
int height,
int width,
int wpl, // words per line of output image
__global int *thresholds,
__global int *hi_values,
__global int *pix) {
// declare variables
int pThresholds[1];
int pHi_Values[1];
for ( int i = 0; i < 1; i++) {
pThresholds[i] = thresholds[i];
pHi_Values[i] = hi_values[i];
}
// for each word (32 pixels) in output image
for ( uint w = get_global_id(0); w < wpl*height; w += get_global_size(0) ) {
unsigned int word = 0; // all bits start at zero
// for each burst in word
for ( int b = 0; b < BURSTS_PER_WORD; b++) {
// load burst
charVec1 pixels;
// for each char8 in burst
pixels.v[0] = imageData[
w*BURSTS_PER_WORD
+ b
+ 0 ];
// for each pixel in burst
for ( int p = 0; p < PIXELS_PER_BURST; p++) {
//int littleEndianIdx = p ^ 3;
//int bigEndianIdx = p;
int idx =
\n#ifdef __ENDIAN_LITTLE__\n
p ^ 3;
\n#else\n
p;
\n#endif\n
unsigned char pixChan = pixels.s[idx];
if (pHi_Values[0] >= 0 && (pixChan > pThresholds[0]) == (pHi_Values[0] == 0)) {
word |= (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
}
}
}
pix[w] = word;
}
}
)
KERNEL(
\n#define RED_SHIFT 24\n
\n#define GREEN_SHIFT 16\n
\n#define BLUE_SHIFT 8\n
\n#define SET_DATA_BYTE( pdata, n, val ) (*(l_uint8 *)((l_uintptr_t)((l_uint8 *)(pdata) + (n)) ^ 3) = (val))\n
\n
\n__attribute__((reqd_work_group_size(256, 1, 1)))\n
\n__kernel\n
\nvoid kernel_RGBToGray(
__global const unsigned int *srcData,
__global unsigned char *dstData,
int srcWPL,
int dstWPL,
int height,
int width,
float rwt,
float gwt,
float bwt ) {
// pixel index
int pixelIdx = get_global_id(0);
if (pixelIdx >= height*width) return;
unsigned int word = srcData[pixelIdx];
int output = (rwt * ((word >> RED_SHIFT) & 0xff) +
gwt * ((word >> GREEN_SHIFT) & 0xff) +
bwt * ((word >> BLUE_SHIFT) & 0xff) + 0.5);
// SET_DATA_BYTE
dstData[pixelIdx] = output;
}
)
#endif
; // close char*
#endif // USE_EXTERNAL_KERNEL
//#endif //_OCL_KERNEL_H_
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */