tesseract/opencl/oclkernels.h
2014-01-17 18:51:27 +00:00

1230 lines
35 KiB
C

#ifndef _OCL_KERNEL_H_
#define _OCL_KERNEL_H_
#ifndef USE_EXTERNAL_KERNEL
#define KERNEL( ... )# __VA_ARGS__ "\n"
// Double precision is a default of spreadsheets
// cl_khr_fp64: Khronos extension
// cl_amd_fp64: AMD extension
// use build option outside to define fp_t
/////////////////////////////////////////////
const char *kernel_src = KERNEL(
\n#ifdef KHR_DP_EXTENSION\n
\n#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n
\n#elif AMD_DP_EXTENSION\n
\n#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n
\n#else\n
\n#endif\n
__kernel void composeRGBPixel(__global uint *tiffdata, int w, int h,int wpl, __global uint *output)
{
int i = get_global_id(1);
int j = get_global_id(0);
int tiffword,rval,gval,bval;
//Ignore the excess
if ((i >= h) || (j >= w))
return;
tiffword = tiffdata[i * w + j];
rval = ((tiffword) & 0xff);
gval = (((tiffword) >> 8) & 0xff);
bval = (((tiffword) >> 16) & 0xff);
output[i*wpl+j] = (rval << (8 * (sizeof(uint) - 1 - 0))) | (gval << (8 * (sizeof(uint) - 1 - 1))) | (bval << (8 * (sizeof(uint) - 1 - 2)));
}
)
KERNEL(
\n__kernel void pixSubtract_inplace(__global int *dword, __global int *sword,
const int wpl, const int h)
{
const unsigned int row = get_global_id(1);
const unsigned int col = get_global_id(0);
const unsigned int pos = row * wpl + col;
//Ignore the execss
if (row >= h || col >= wpl)
return;
*(dword + pos) &= ~(*(sword + pos));
}\n
)
KERNEL(
\n__kernel void pixSubtract(__global int *dword, __global int *sword,
const int wpl, const int h, __global int *outword)
{
const unsigned int row = get_global_id(1);
const unsigned int col = get_global_id(0);
const unsigned int pos = row * wpl + col;
//Ignore the execss
if (row >= h || col >= wpl)
return;
*(outword + pos) = *(dword + pos) & ~(*(sword + pos));
}\n
)
KERNEL(
\n__kernel void pixAND(__global int *dword, __global int *sword, __global int *outword,
const int wpl, const int h)
{
const unsigned int row = get_global_id(1);
const unsigned int col = get_global_id(0);
const unsigned int pos = row * wpl + col;
//Ignore the execss
if (row >= h || col >= wpl)
return;
*(outword + pos) = *(dword + pos) & (*(sword + pos));
}\n
)
KERNEL(
\n__kernel void pixOR(__global int *dword, __global int *sword, __global int *outword,
const int wpl, const int h)
{
const unsigned int row = get_global_id(1);
const unsigned int col = get_global_id(0);
const unsigned int pos = row * wpl + col;
//Ignore the execss
if (row >= h || col >= wpl)
return;
*(outword + pos) = *(dword + pos) | (*(sword + pos));
}\n
)
KERNEL(
\n__kernel void morphoDilateHor_5x5(__global int *sword,__global int *dword,
const int wpl, const int h)
{
const unsigned int pos = get_global_id(0);
unsigned int prevword, nextword, currword,tempword;
unsigned int destword;
const int col = pos % wpl;
//Ignore the execss
if (pos >= (wpl * h))
return;
currword = *(sword + pos);
destword = currword;
//Handle boundary conditions
if(col==0)
prevword=0;
else
prevword = *(sword + pos - 1);
if(col==(wpl - 1))
nextword=0;
else
nextword = *(sword + pos + 1);
//Loop unrolled
//1 bit to left and 1 bit to right
//Get the max value on LHS of every pixel
tempword = (prevword << (31)) | ((currword >> 1));
destword |= tempword;
//Get max value on RHS of every pixel
tempword = (currword << 1) | (nextword >> (31));
destword |= tempword;
//2 bit to left and 2 bit to right
//Get the max value on LHS of every pixel
tempword = (prevword << (30)) | ((currword >> 2));
destword |= tempword;
//Get max value on RHS of every pixel
tempword = (currword << 2) | (nextword >> (30));
destword |= tempword;
*(dword + pos) = destword;
}\n
)
KERNEL(
\n__kernel void morphoDilateVer_5x5(__global int *sword,__global int *dword,
const int wpl, const int h)
{
const int col = get_global_id(0);
const int row = get_global_id(1);
const unsigned int pos = row * wpl + col;
unsigned int tempword;
unsigned int destword;
int i;
//Ignore the execss
if (row >= h || col >= wpl)
return;
destword = *(sword + pos);
//2 words above
i = (row - 2) < 0 ? row : (row - 2);
tempword = *(sword + i*wpl + col);
destword |= tempword;
//1 word above
i = (row - 1) < 0 ? row : (row - 1);
tempword = *(sword + i*wpl + col);
destword |= tempword;
//1 word below
i = (row >= (h - 1)) ? row : (row + 1);
tempword = *(sword + i*wpl + col);
destword |= tempword;
//2 words below
i = (row >= (h - 2)) ? row : (row + 2);
tempword = *(sword + i*wpl + col);
destword |= tempword;
*(dword + pos) = destword;
}\n
)
KERNEL(
\n__kernel void morphoDilateHor(__global int *sword,__global int *dword,const int xp, const int xn, const int wpl, const int h)
{
const int col = get_global_id(0);
const int row = get_global_id(1);
const unsigned int pos = row * wpl + col;
unsigned int parbitsxp, parbitsxn, nwords;
unsigned int destword, tempword, lastword, currword;
unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
int i, j, siter, eiter;
//Ignore the execss
if (pos >= (wpl*h) || (xn < 1 && xp < 1))
return;
currword = *(sword + pos);
destword = currword;
parbitsxp = xp & 31;
parbitsxn = xn & 31;
nwords = xp >> 5;
if (parbitsxp > 0)
nwords += 1;
else
parbitsxp = 31;
siter = (col - nwords);
eiter = (col + nwords);
//Get prev word
if (col==0)
firstword = 0x0;
else
firstword = *(sword + pos - 1);
//Get next word
if (col == (wpl - 1))
secondword = 0x0;
else
secondword = *(sword + pos + 1);
//Last partial bits on either side
for (i = 1; i <= parbitsxp; i++)
{
//Get the max value on LHS of every pixel
tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0x0 : (firstword << (32-i)) | ((currword >> i));
destword |= tempword;
//Get max value on RHS of every pixel
tempword = (currword << i) | (secondword >> (32 - i));
destword |= tempword;
}
//Return if halfwidth <= 1 word
if (nwords == 1)
{
if (xn == 32)
{
destword |= firstword;
}
if (xp == 32)
{
destword |= secondword;
}
*(dword + pos) = destword;
return;
}
if (siter < 0)
firstword = 0x0;
else
firstword = *(sword + row*wpl + siter);
if (eiter >= wpl)
lastword = 0x0;
else
lastword = *(sword + row*wpl + eiter);
for ( i = 1; i < nwords; i++)
{
//Gets LHS words
if ((siter + i) < 0)
secondword = 0x0;
else
secondword = *(sword + row*wpl + siter + i);
lprevword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
firstword = secondword;
if ((siter + i + 1) < 0)
secondword = 0x0;
else
secondword = *(sword + row*wpl + siter + i + 1);
lnextword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
//Gets RHS words
if ((eiter - i) >= wpl)
firstword = 0x0;
else
firstword = *(sword + row*wpl + eiter - i);
rnextword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
lastword = firstword;
if ((eiter - i - 1) >= wpl)
firstword = 0x0;
else
firstword = *(sword + row*wpl + eiter - i - 1);
rprevword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
for (j = 1; j < 32; j++)
{
//OR LHS full words
tempword = (lprevword << j) | (lnextword >> (32 - j));
destword |= tempword;
//OR RHS full words
tempword = (rprevword << j) | (rnextword >> (32 - j));
destword |= tempword;
}
destword |= lprevword;
destword |= lnextword;
destword |= rprevword;
destword |= rnextword;
lastword = firstword;
firstword = secondword;
}
*(dword + pos) = destword;
}\n
)
KERNEL(
\n__kernel void morphoDilateHor_32word(__global int *sword,__global int *dword,
const int halfwidth,
const int wpl, const int h,
const char isEven)
{
const int col = get_global_id(0);
const int row = get_global_id(1);
const unsigned int pos = row * wpl + col;
unsigned int prevword, nextword, currword,tempword;
unsigned int destword;
int i;
//Ignore the execss
if (pos >= (wpl * h))
return;
currword = *(sword + pos);
destword = currword;
//Handle boundary conditions
if(col==0)
prevword=0;
else
prevword = *(sword + pos - 1);
if(col==(wpl - 1))
nextword=0;
else
nextword = *(sword + pos + 1);
for (i = 1; i <= halfwidth; i++)
{
//Get the max value on LHS of every pixel
if (i == halfwidth && isEven)
{
tempword = 0x0;
}
else
{
tempword = (prevword << (32-i)) | ((currword >> i));
}
destword |= tempword;
//Get max value on RHS of every pixel
tempword = (currword << i) | (nextword >> (32 - i));
destword |= tempword;
}
*(dword + pos) = destword;
}\n
)
KERNEL(
\n__kernel void morphoDilateVer(__global int *sword,__global int *dword,
const int yp,
const int wpl, const int h,
const int yn)
{
const int col = get_global_id(0);
const int row = get_global_id(1);
const unsigned int pos = row * wpl + col;
unsigned int tempword;
unsigned int destword;
int i, siter, eiter;
//Ignore the execss
if (row >= h || col >= wpl)
return;
destword = *(sword + pos);
//Set start position and end position considering the boundary conditions
siter = (row - yn) < 0 ? 0 : (row - yn);
eiter = (row >= (h - yp)) ? (h - 1) : (row + yp);
for (i = siter; i <= eiter; i++)
{
tempword = *(sword + i*wpl + col);
destword |= tempword;
}
*(dword + pos) = destword;
}\n
)
KERNEL(
\n__kernel void morphoErodeHor_5x5(__global int *sword,__global int *dword,
const int wpl, const int h)
{
const unsigned int pos = get_global_id(0);
unsigned int prevword, nextword, currword,tempword;
unsigned int destword;
const int col = pos % wpl;
//Ignore the execss
if (pos >= (wpl * h))
return;
currword = *(sword + pos);
destword = currword;
//Handle boundary conditions
if(col==0)
prevword=0xffffffff;
else
prevword = *(sword + pos - 1);
if(col==(wpl - 1))
nextword=0xffffffff;
else
nextword = *(sword + pos + 1);
//Loop unrolled
//1 bit to left and 1 bit to right
//Get the min value on LHS of every pixel
tempword = (prevword << (31)) | ((currword >> 1));
destword &= tempword;
//Get min value on RHS of every pixel
tempword = (currword << 1) | (nextword >> (31));
destword &= tempword;
//2 bit to left and 2 bit to right
//Get the min value on LHS of every pixel
tempword = (prevword << (30)) | ((currword >> 2));
destword &= tempword;
//Get min value on RHS of every pixel
tempword = (currword << 2) | (nextword >> (30));
destword &= tempword;
*(dword + pos) = destword;
}\n
)
KERNEL(
\n__kernel void morphoErodeVer_5x5(__global int *sword,__global int *dword,
const int wpl, const int h,
const int fwmask, const int lwmask)
{
const int col = get_global_id(0);
const int row = get_global_id(1);
const unsigned int pos = row * wpl + col;
unsigned int tempword;
unsigned int destword;
int i;
//Ignore the execss
if (row >= h || col >= wpl)
return;
destword = *(sword + pos);
if (row < 2 || row >= (h - 2))
{
destword = 0x0;
}
else
{
//2 words above
//i = (row - 2) < 0 ? row : (row - 2);
i = (row - 2);
tempword = *(sword + i*wpl + col);
destword &= tempword;
//1 word above
//i = (row - 1) < 0 ? row : (row - 1);
i = (row - 1);
tempword = *(sword + i*wpl + col);
destword &= tempword;
//1 word below
//i = (row >= (h - 1)) ? row : (row + 1);
i = (row + 1);
tempword = *(sword + i*wpl + col);
destword &= tempword;
//2 words below
//i = (row >= (h - 2)) ? row : (row + 2);
i = (row + 2);
tempword = *(sword + i*wpl + col);
destword &= tempword;
if (col == 0)
{
destword &= fwmask;
}
if (col == (wpl - 1))
{
destword &= lwmask;
}
}
*(dword + pos) = destword;
}\n
)
KERNEL(
\n__kernel void morphoErodeHor(__global int *sword,__global int *dword, const int xp, const int xn, const int wpl,
const int h, const char isAsymmetric, const int rwmask, const int lwmask)
{
const int col = get_global_id(0);
const int row = get_global_id(1);
const unsigned int pos = row * wpl + col;
unsigned int parbitsxp, parbitsxn, nwords;
unsigned int destword, tempword, lastword, currword;
unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
int i, j, siter, eiter;
//Ignore the execss
if (pos >= (wpl*h) || (xn < 1 && xp < 1))
return;
currword = *(sword + pos);
destword = currword;
parbitsxp = xp & 31;
parbitsxn = xn & 31;
nwords = xp >> 5;
if (parbitsxp > 0)
nwords += 1;
else
parbitsxp = 31;
siter = (col - nwords);
eiter = (col + nwords);
//Get prev word
if (col==0)
firstword = 0xffffffff;
else
firstword = *(sword + pos - 1);
//Get next word
if (col == (wpl - 1))
secondword = 0xffffffff;
else
secondword = *(sword + pos + 1);
//Last partial bits on either side
for (i = 1; i <= parbitsxp; i++)
{
//Get the max value on LHS of every pixel
tempword = (firstword << (32-i)) | ((currword >> i));
destword &= tempword;
//Get max value on RHS of every pixel
tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0xffffffff : (currword << i) | (secondword >> (32 - i));
//tempword = (currword << i) | (secondword >> (32 - i));
destword &= tempword;
}
//Return if halfwidth <= 1 word
if (nwords == 1)
{
if (xp == 32)
{
destword &= firstword;
}
if (xn == 32)
{
destword &= secondword;
}
//Clear boundary pixels
if (isAsymmetric)
{
if (col == 0)
destword &= rwmask;
if (col == (wpl - 1))
destword &= lwmask;
}
*(dword + pos) = destword;
return;
}
if (siter < 0)
firstword = 0xffffffff;
else
firstword = *(sword + row*wpl + siter);
if (eiter >= wpl)
lastword = 0xffffffff;
else
lastword = *(sword + row*wpl + eiter);
for ( i = 1; i < nwords; i++)
{
//Gets LHS words
if ((siter + i) < 0)
secondword = 0xffffffff;
else
secondword = *(sword + row*wpl + siter + i);
lprevword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);
firstword = secondword;
if ((siter + i + 1) < 0)
secondword = 0xffffffff;
else
secondword = *(sword + row*wpl + siter + i + 1);
lnextword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);
//Gets RHS words
if ((eiter - i) >= wpl)
firstword = 0xffffffff;
else
firstword = *(sword + row*wpl + eiter - i);
rnextword = firstword << parbitsxn | lastword >> (32 - parbitsxn);
lastword = firstword;
if ((eiter - i - 1) >= wpl)
firstword = 0xffffffff;
else
firstword = *(sword + row*wpl + eiter - i - 1);
rprevword = firstword << parbitsxn | lastword >> (32 - parbitsxn);
for (j = 0; j < 32; j++)
{
//OR LHS full words
tempword = (lprevword << j) | (lnextword >> (32 - j));
destword &= tempword;
//OR RHS full words
tempword = (rprevword << j) | (rnextword >> (32 - j));
destword &= tempword;
}
destword &= lprevword;
destword &= lnextword;
destword &= rprevword;
destword &= rnextword;
lastword = firstword;
firstword = secondword;
}
if (isAsymmetric)
{
//Clear boundary pixels
if (col < (nwords - 1))
destword = 0x0;
else if (col == (nwords - 1))
destword &= rwmask;
else if (col > (wpl - nwords))
destword = 0x0;
else if (col == (wpl - nwords))
destword &= lwmask;
}
*(dword + pos) = destword;
}\n
)
KERNEL(
\n__kernel void morphoErodeHor_32word(__global int *sword,__global int *dword,
const int halfwidth, const int wpl,
const int h, const char clearBoundPixH,
const int rwmask, const int lwmask,
const char isEven)
{
const int col = get_global_id(0);
const int row = get_global_id(1);
const unsigned int pos = row * wpl + col;
unsigned int prevword, nextword, currword,tempword, destword;
int i;
//Ignore the execss
if (pos >= (wpl * h))
return;
currword = *(sword + pos);
destword = currword;
//Handle boundary conditions
if(col==0)
prevword=0xffffffff;
else
prevword = *(sword + pos - 1);
if(col==(wpl - 1))
nextword=0xffffffff;
else
nextword = *(sword + pos + 1);
for (i = 1; i <= halfwidth; i++)
{
//Get the min value on LHS of every pixel
tempword = (prevword << (32-i)) | ((currword >> i));
destword &= tempword;
//Get min value on RHS of every pixel
if (i == halfwidth && isEven)
{
tempword = 0xffffffff;
}
else
{
tempword = (currword << i) | (nextword >> (32 - i));
}
destword &= tempword;
}
if (clearBoundPixH)
{
if (col == 0)
{
destword &= rwmask;
}
else if (col == (wpl - 1))
{
destword &= lwmask;
}
}
*(dword + pos) = destword;
}\n
)
KERNEL(
\n__kernel void morphoErodeVer(__global int *sword,__global int *dword,
const int yp,
const int wpl, const int h,
const char clearBoundPixV, const int yn)
{
const int col = get_global_id(0);
const int row = get_global_id(1);
const unsigned int pos = row * wpl + col;
unsigned int tempword, destword;
int i, siter, eiter;
//Ignore the execss
if (row >= h || col >= wpl)
return;
destword = *(sword + pos);
//Set start position and end position considering the boundary conditions
siter = (row - yp) < 0 ? 0 : (row - yp);
eiter = (row >= (h - yn)) ? (h - 1) : (row + yn);
for (i = siter; i <= eiter; i++)
{
tempword = *(sword + i*wpl + col);
destword &= tempword;
}
//Clear boundary pixels
if (clearBoundPixV && ((row < yp) || ((h - row) <= yn)))
{
destword = 0x0;
}
*(dword + pos) = destword;
}\n
)
// HistogramRect Kernel: Accumulate
// assumes 4 channels, i.e., bytes_per_pixel = 4
// assumes number of pixels is multiple of 8
// data is layed out as
// ch0 ch1 ...
// bin0 bin1 bin2... bin0...
// rpt0,1,2...256 rpt0,1,2...
KERNEL(
\n#define HIST_REDUNDANCY 256\n
\n#define GROUP_SIZE 256\n
\n#define HIST_SIZE 256\n
\n#define NUM_CHANNELS 4\n
\n#define HR_UNROLL_SIZE 8 \n
\n#define HR_UNROLL_TYPE uchar8 \n
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_HistogramRectAllChannels(
__global const uchar8 *data,
uint numPixels,
__global uint *histBuffer) {
// declare variables
uchar8 pixels;
int threadOffset = get_global_id(0)%HIST_REDUNDANCY;
// for each pixel/channel, accumulate in global memory
for ( uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS/HR_UNROLL_SIZE; pc += get_global_size(0) ) {
pixels = data[pc];
// channel bin thread
atomic_inc( &histBuffer[ 0*HIST_SIZE*HIST_REDUNDANCY + pixels.s0*HIST_REDUNDANCY + threadOffset ]); // ch0
atomic_inc( &histBuffer[ 0*HIST_SIZE*HIST_REDUNDANCY + pixels.s4*HIST_REDUNDANCY + threadOffset ]); // ch0
atomic_inc( &histBuffer[ 1*HIST_SIZE*HIST_REDUNDANCY + pixels.s1*HIST_REDUNDANCY + threadOffset ]); // ch1
atomic_inc( &histBuffer[ 1*HIST_SIZE*HIST_REDUNDANCY + pixels.s5*HIST_REDUNDANCY + threadOffset ]); // ch1
atomic_inc( &histBuffer[ 2*HIST_SIZE*HIST_REDUNDANCY + pixels.s2*HIST_REDUNDANCY + threadOffset ]); // ch2
atomic_inc( &histBuffer[ 2*HIST_SIZE*HIST_REDUNDANCY + pixels.s6*HIST_REDUNDANCY + threadOffset ]); // ch2
atomic_inc( &histBuffer[ 3*HIST_SIZE*HIST_REDUNDANCY + pixels.s3*HIST_REDUNDANCY + threadOffset ]); // ch3
atomic_inc( &histBuffer[ 3*HIST_SIZE*HIST_REDUNDANCY + pixels.s7*HIST_REDUNDANCY + threadOffset ]); // ch3
}
}
)
KERNEL(
// NUM_CHANNELS = 1
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_HistogramRectOneChannel(
__global const uchar8 *data,
uint numPixels,
__global uint *histBuffer) {
// declare variables
uchar8 pixels;
int threadOffset = get_global_id(0)%HIST_REDUNDANCY;
// for each pixel/channel, accumulate in global memory
for ( uint pc = get_global_id(0); pc < numPixels/HR_UNROLL_SIZE; pc += get_global_size(0) ) {
pixels = data[pc];
// bin thread
atomic_inc( &histBuffer[ pixels.s0*HIST_REDUNDANCY + threadOffset ]);
atomic_inc( &histBuffer[ pixels.s1*HIST_REDUNDANCY + threadOffset ]);
atomic_inc( &histBuffer[ pixels.s2*HIST_REDUNDANCY + threadOffset ]);
atomic_inc( &histBuffer[ pixels.s3*HIST_REDUNDANCY + threadOffset ]);
atomic_inc( &histBuffer[ pixels.s4*HIST_REDUNDANCY + threadOffset ]);
atomic_inc( &histBuffer[ pixels.s5*HIST_REDUNDANCY + threadOffset ]);
atomic_inc( &histBuffer[ pixels.s6*HIST_REDUNDANCY + threadOffset ]);
atomic_inc( &histBuffer[ pixels.s7*HIST_REDUNDANCY + threadOffset ]);
}
}
)
KERNEL(
// unused
\n __attribute__((reqd_work_group_size(256, 1, 1)))
\n __kernel
\n void kernel_HistogramRectAllChannels_Grey(
\n __global const uchar* data,
\n uint numPixels,
\n __global uint *histBuffer) { // each wg will write HIST_SIZE*NUM_CHANNELS into this result; cpu will accumulate across wg's
\n
\n /* declare variables */
\n
\n // work indices
\n size_t groupId = get_group_id(0);
\n size_t localId = get_local_id(0); // 0 -> 256-1
\n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
\n uint numThreads = get_global_size(0);
\n
\n /* accumulate in global memory */
\n for ( uint pc = get_global_id(0); pc < numPixels; pc += get_global_size(0) ) {
\n uchar value = data[ pc ];
\n int idx = value * get_global_size(0) + get_global_id(0);
\n histBuffer[ idx ]++;
\n
\n }
\n
\n } // kernel_HistogramRectAllChannels_Grey
)
// HistogramRect Kernel: Reduction
// only supports 4 channels
// each work group handles a single channel of a single histogram bin
KERNEL(
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_HistogramRectAllChannelsReduction(
int n, // unused pixel redundancy
__global uint *histBuffer,
__global int* histResult) {
// declare variables
int channel = get_group_id(0)/HIST_SIZE;
int bin = get_group_id(0)%HIST_SIZE;
int value = 0;
// accumulate in register
for ( uint i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) {
value += histBuffer[ channel*HIST_SIZE*HIST_REDUNDANCY+bin*HIST_REDUNDANCY+i];
}
// reduction in local memory
__local int localHist[GROUP_SIZE];
localHist[get_local_id(0)] = value;
barrier(CLK_LOCAL_MEM_FENCE);
for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
if (get_local_id(0) < stride) {
value = localHist[ get_local_id(0)+stride];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < stride) {
localHist[ get_local_id(0)] += value;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// write reduction to final result
if (get_local_id(0) == 0) {
histResult[get_group_id(0)] = localHist[0];
}
} // kernel_HistogramRectAllChannels
)
KERNEL(
// NUM_CHANNELS = 1
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_HistogramRectOneChannelReduction(
int n, // unused pixel redundancy
__global uint *histBuffer,
__global int* histResult) {
// declare variables
// int channel = get_group_id(0)/HIST_SIZE;
int bin = get_group_id(0)%HIST_SIZE;
int value = 0;
// accumulate in register
for ( int i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) {
value += histBuffer[ bin*HIST_REDUNDANCY+i];
}
// reduction in local memory
__local int localHist[GROUP_SIZE];
localHist[get_local_id(0)] = value;
barrier(CLK_LOCAL_MEM_FENCE);
for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
if (get_local_id(0) < stride) {
value = localHist[ get_local_id(0)+stride];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < stride) {
localHist[ get_local_id(0)] += value;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// write reduction to final result
if (get_local_id(0) == 0) {
histResult[get_group_id(0)] = localHist[0];
}
} // kernel_HistogramRectOneChannelReduction
)
KERNEL(
// unused
// each work group (x256) handles a histogram bin
\n __attribute__((reqd_work_group_size(256, 1, 1)))
\n __kernel
\n void kernel_HistogramRectAllChannelsReduction_Grey(
\n int n, // pixel redundancy that needs to be accumulated
\n __global uint *histBuffer,
\n __global uint* histResult) { // each wg accumulates 1 bin
\n
\n /* declare variables */
\n
\n // work indices
\n size_t groupId = get_group_id(0);
\n size_t localId = get_local_id(0); // 0 -> 256-1
\n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
\n uint numThreads = get_global_size(0);
\n unsigned int hist = 0;
\n
\n /* accumulate in global memory */
\n for ( uint p = 0; p < n; p+=GROUP_SIZE) {
\n hist += histBuffer[ (get_group_id(0)*n + p)];
\n }
\n
\n /* reduction in local memory */
\n // populate local memory
\n __local unsigned int localHist[GROUP_SIZE];
\n localHist[localId] = hist;
\n barrier(CLK_LOCAL_MEM_FENCE);
\n
\n for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
\n if (localId < stride) {
\n hist = localHist[ (localId+stride)];
\n }
\n barrier(CLK_LOCAL_MEM_FENCE);
\n if (localId < stride) {
\n localHist[ localId] += hist;
\n }
\n barrier(CLK_LOCAL_MEM_FENCE);
\n }
\n
\n if (localId == 0)
\n histResult[get_group_id(0)] = localHist[0];
\n
\n } // kernel_HistogramRectAllChannelsReduction_Grey
)
// ThresholdRectToPix Kernel
// only supports 4 channels
// imageData is input image (24-bits/pixel)
// pix is output image (1-bit/pixel)
KERNEL(
\n#define CHAR_VEC_WIDTH 8 \n
\n#define PIXELS_PER_WORD 32 \n
\n#define PIXELS_PER_BURST 8 \n
\n#define BURSTS_PER_WORD (PIXELS_PER_WORD/PIXELS_PER_BURST) \n
typedef union {
uchar s[PIXELS_PER_BURST*NUM_CHANNELS];
uchar8 v[(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH];
} charVec;
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_ThresholdRectToPix(
__global const uchar8 *imageData,
int height,
int width,
int wpl, // words per line
__global int *thresholds,
__global int *hi_values,
__global int *pix) {
// declare variables
int pThresholds[NUM_CHANNELS];
int pHi_Values[NUM_CHANNELS];
for ( int i = 0; i < NUM_CHANNELS; i++) {
pThresholds[i] = thresholds[i];
pHi_Values[i] = hi_values[i];
}
// for each word (32 pixels) in output image
for ( uint w = get_global_id(0); w < wpl*height; w += get_global_size(0) ) {
unsigned int word = 0; // all bits start at zero
// for each burst in word
for ( int b = 0; b < BURSTS_PER_WORD; b++) {
// load burst
charVec pixels;
for ( int i = 0; i < (PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH; i++ ) {
pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH) + i];
}
// for each pixel in burst
for ( int p = 0; p < PIXELS_PER_BURST; p++) {
for ( int c = 0; c < NUM_CHANNELS; c++) {
unsigned char pixChan = pixels.s[p*NUM_CHANNELS + c];
if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
word |= (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
}
}
}
}
pix[w] = word;
}
}
// only supports 1 channel
typedef union {
uchar s[PIXELS_PER_BURST];
uchar8 v[(PIXELS_PER_BURST)/CHAR_VEC_WIDTH];
} charVec1;
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_ThresholdRectToPix_OneChan(
__global const uchar8 *imageData,
int height,
int width,
int wpl, // words per line
__global int *thresholds,
__global int *hi_values,
__global int *pix) {
// declare variables
int pThresholds[1];
int pHi_Values[1];
for ( int i = 0; i < 1; i++) {
pThresholds[i] = thresholds[i];
pHi_Values[i] = hi_values[i];
}
// for each word (32 pixels) in output image
for ( uint w = get_global_id(0); w < wpl*height; w += get_global_size(0) ) {
unsigned int word = 0; // all bits start at zero
// for each burst in word
for ( int b = 0; b < BURSTS_PER_WORD; b++) {
// load burst
charVec1 pixels;
for ( int i = 0; i < (PIXELS_PER_BURST)/CHAR_VEC_WIDTH; i++ ) {
pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + i];
}
// for each pixel in burst
for ( int p = 0; p < PIXELS_PER_BURST; p++) {
for ( int c = 0; c < 1; c++) {
unsigned char pixChan = pixels.s[p + c];
if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
word |= (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
}
}
}
}
pix[w] = word;
}
}
)
; // close char*
#endif // USE_EXTERNAL_KERNEL
#endif //_OCL_KERNEL_H_
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
// Alternative histogram kernel written to use uchar and different global memory scattered write
// was a little better for intel platforms but still not faster then native serial code
#if 0
/* data layed out as
bin0 bin1 bin2...
r,g,b,a,r,g,b,a,r,g,b,a nthreads/4 copies
*/
\n__attribute__((reqd_work_group_size(256, 1, 1)))
\n __kernel
\n void kernel_HistogramRectAllChannels_uchar(
\n volatile __global const uchar *data,
\n uint numPixels,
\n volatile __global uint *histBuffer) {
\n
\n // for each pixel/channel, accumulate in global memory
\n for ( uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS; pc += get_global_size(0) ) {
\n uchar value = data[pc];
\n int idx = value*get_global_size(0) + get_global_id(0);
\n histBuffer[ idx ]++; // coalesced if same value
\n }
\n } // kernel_HistogramRectAllChannels
\n
\n __attribute__((reqd_work_group_size(256, 1, 1)))
\n __kernel
\n void kernel_HistogramRectAllChannelsReduction_uchar(
\n int n, // pixel redundancy that needs to be accumulated = nthreads/4
\n __global uint4 *histBuffer,
\n __global uint* histResult) { // each wg accumulates 1 bin (all channels within it
\n
\n // declare variables
\n int binIdx = get_group_id(0);
\n size_t groupId = get_group_id(0);
\n size_t localId = get_local_id(0); // 0 -> 256-1
\n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
\n uint numThreads = get_global_size(0);
\n uint4 hist = {0, 0, 0, 0};
\n
\n // accumulate in register
\n for ( uint p = get_local_id(0); p < n; p+=GROUP_SIZE) {
\n hist += histBuffer[binIdx*n+p];
\n }
\n
\n // reduction in local memory
\n __local uint4 localHist[GROUP_SIZE];
\n localHist[localId] = hist;
\n barrier(CLK_LOCAL_MEM_FENCE);
\n
\n for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
\n if (localId < stride) {
\n hist = localHist[ localId+stride];
\n }
\n barrier(CLK_LOCAL_MEM_FENCE);
\n if (localId < stride) {
\n localHist[ localId] += hist;
\n }
\n barrier(CLK_LOCAL_MEM_FENCE);
\n }
\n
\n // write reduction to final result
\n if (localId == 0) {
\n histResult[0*HIST_SIZE+binIdx] = localHist[0].s0;
\n histResult[1*HIST_SIZE+binIdx] = localHist[0].s1;
\n histResult[2*HIST_SIZE+binIdx] = localHist[0].s2;
\n histResult[3*HIST_SIZE+binIdx] = localHist[0].s3;
\n }
\n
\n } // kernel_HistogramRectAllChannels
#endif