2013-11-12 01:43:13 +08:00
|
|
|
|
|
|
|
#ifndef _OCL_KERNEL_H_
|
|
|
|
#define _OCL_KERNEL_H_
|
|
|
|
#ifndef USE_EXTERNAL_KERNEL
|
|
|
|
#define KERNEL( ... )# __VA_ARGS__ "\n"
|
|
|
|
// Double precision is a default of spreadsheets
|
|
|
|
// cl_khr_fp64: Khronos extension
|
|
|
|
// cl_amd_fp64: AMD extension
|
|
|
|
// use build option outside to define fp_t
|
|
|
|
/////////////////////////////////////////////
|
|
|
|
const char *kernel_src = KERNEL(
|
|
|
|
// Double-precision preamble: enable cl_khr_fp64 when KHR_DP_EXTENSION is
// defined at program build time, otherwise cl_amd_fp64 when AMD_DP_EXTENSION
// is set; enable nothing when neither is.
\n#ifdef KHR_DP_EXTENSION\n
\n#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n
\n#elif AMD_DP_EXTENSION\n
\n#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n
\n#else\n
\n#endif\n
// composeRGBPixel: one work-item per pixel (dimension 0 = column, 1 = row).
// Reads the word at tiffdata[row * w + col], takes its three low bytes
// (bits 0-7, 8-15, 16-23) and stores them in the opposite byte order in the
// three high bytes of output[row * wpl + col]; the low byte is left zero.
//   tiffdata : source words, row stride w
//   w, h     : extent of the grid that is processed
//   wpl      : row stride (in words) of output
//   output   : destination words
__kernel void composeRGBPixel(__global uint *tiffdata, int w, int h,int wpl, __global uint *output)
{
    const int y = get_global_id(1);
    const int x = get_global_id(0);

    // Work-items that fall outside the w x h grid do nothing.
    if ((y < h) && (x < w))
    {
        const int src = tiffdata[y * w + x];
        const int r = src & 0xff;
        const int g = (src >> 8) & 0xff;
        const int b = (src >> 16) & 0xff;

        // uint is 32 bits wide in OpenCL C, so byte k of the source lands at
        // bit offset 8 * (sizeof(uint) - 1 - k): 24, 16 and 8.
        output[y * wpl + x] = (r << 24) | (g << 16) | (b << 8);
    }
}
|
|
|
|
)
|
|
|
|
|
|
|
|
KERNEL(
|
|
|
|
// pixSubtract_inplace: dword[p] = dword[p] AND NOT sword[p] for every word p
// of a wpl x h word grid (dimension 0 = word column, dimension 1 = row).
//   dword : words updated in place
//   sword : words whose set bits are cleared from dword
//   wpl   : words per row
//   h     : number of rows
\n__kernel void pixSubtract_inplace(__global int *dword, __global int *sword,
                                  const int wpl, const int h)
{
    const unsigned int y = get_global_id(1);
    const unsigned int x = get_global_id(0);

    // Skip work-items outside the grid.
    if (y < h && x < wpl)
    {
        const unsigned int idx = y * wpl + x;
        dword[idx] &= ~sword[idx];
    }
}\n
|
|
|
|
)
|
|
|
|
|
|
|
|
KERNEL(
|
|
|
|
// pixSubtract: outword[p] = dword[p] AND NOT sword[p] for every word p of a
// wpl x h word grid (dimension 0 = word column, dimension 1 = row).
//   dword   : first operand, read only by this kernel
//   sword   : words whose set bits are removed from the result
//   wpl     : words per row
//   h       : number of rows
//   outword : destination words
\n__kernel void pixSubtract(__global int *dword, __global int *sword,
                          const int wpl, const int h, __global int *outword)
{
    const unsigned int y = get_global_id(1);
    const unsigned int x = get_global_id(0);

    // Skip work-items outside the grid.
    if (y < h && x < wpl)
    {
        const unsigned int idx = y * wpl + x;
        outword[idx] = dword[idx] & ~sword[idx];
    }
}\n
|
|
|
|
)
|
|
|
|
|
|
|
|
KERNEL(
|
|
|
|
// pixAND: outword[p] = dword[p] AND sword[p] for every word p of a wpl x h
// word grid (dimension 0 = word column, dimension 1 = row).
//   dword, sword : the two operands
//   outword      : destination words
//   wpl          : words per row
//   h            : number of rows
\n__kernel void pixAND(__global int *dword, __global int *sword, __global int *outword,
                     const int wpl, const int h)
{
    const unsigned int y = get_global_id(1);
    const unsigned int x = get_global_id(0);

    // Skip work-items outside the grid.
    if (y < h && x < wpl)
    {
        const unsigned int idx = y * wpl + x;
        outword[idx] = dword[idx] & sword[idx];
    }
}\n
|
|
|
|
)
|
|
|
|
|
|
|
|
KERNEL(
|
|
|
|
// pixOR: outword[p] = dword[p] OR sword[p] for every word p of a wpl x h
// word grid (dimension 0 = word column, dimension 1 = row).
//   dword, sword : the two operands
//   outword      : destination words
//   wpl          : words per row
//   h            : number of rows
\n__kernel void pixOR(__global int *dword, __global int *sword, __global int *outword,
                    const int wpl, const int h)
{
    const unsigned int y = get_global_id(1);
    const unsigned int x = get_global_id(0);

    // Skip work-items outside the grid.
    if (y < h && x < wpl)
    {
        const unsigned int idx = y * wpl + x;
        outword[idx] = dword[idx] | sword[idx];
    }
}\n
|
|
|
|
)
|
|
|
|
|
|
|
|
KERNEL(
|
|
|
|
// morphoDilateHor_5x5: horizontal pass of a dilation with reach 2 bits.
// One work-item per 32-bit word, indexed linearly through dimension 0.
// Each output word is the OR of the source word with the row shifted by
// 1 and 2 bits in both directions; bits shifted in across a word boundary
// come from the neighbouring word, or are 0 at the first/last word of a row.
//   sword : source words
//   dword : destination words
//   wpl   : words per row
//   h     : number of rows
\n__kernel void morphoDilateHor_5x5(__global int *sword,__global int *dword,
                                  const int wpl, const int h)
{
    const unsigned int idx = get_global_id(0);
    const int x = idx % wpl;

    // Skip work-items past the last word.
    if (idx >= (wpl * h))
        return;

    const unsigned int cur = *(sword + idx);

    // Neighbouring words; zero is the neutral element for OR at row edges.
    const unsigned int left = (x == 0) ? 0 : *(sword + idx - 1);
    const unsigned int right = (x == (wpl - 1)) ? 0 : *(sword + idx + 1);

    unsigned int acc = cur;

    // Reach 1: shifted right (pulling bits from the left word) and shifted left.
    acc |= (left << 31) | (cur >> 1);
    acc |= (cur << 1) | (right >> 31);

    // Reach 2.
    acc |= (left << 30) | (cur >> 2);
    acc |= (cur << 2) | (right >> 30);

    *(dword + idx) = acc;
}\n
|
|
|
|
)
|
|
|
|
|
|
|
|
KERNEL(
|
|
|
|
// morphoDilateVer_5x5: vertical pass of a dilation with reach 2 rows.
// One work-item per word (dimension 0 = word column, dimension 1 = row).
// The output word is the OR of the source word with the words in the same
// column one and two rows above and below. A neighbour row that would fall
// outside [0, h) is replaced by the current row, which leaves the OR unchanged.
//   sword : source words
//   dword : destination words
//   wpl   : words per row
//   h     : number of rows
\n__kernel void morphoDilateVer_5x5(__global int *sword,__global int *dword,
                                  const int wpl, const int h)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const unsigned int idx = y * wpl + x;

    // Skip work-items outside the grid.
    if (y >= h || x >= wpl)
        return;

    // Row indices of the four vertical neighbours, clamped to the current row.
    const int up2 = (y - 2) < 0 ? y : (y - 2);
    const int up1 = (y - 1) < 0 ? y : (y - 1);
    const int down1 = (y >= (h - 1)) ? y : (y + 1);
    const int down2 = (y >= (h - 2)) ? y : (y + 2);

    unsigned int acc = *(sword + idx);
    acc |= *(sword + up2*wpl + x);
    acc |= *(sword + up1*wpl + x);
    acc |= *(sword + down1*wpl + x);
    acc |= *(sword + down2*wpl + x);

    *(dword + idx) = acc;
}\n
|
|
|
|
)
|
|
|
|
|
|
|
|
KERNEL(
|
|
|
|
// morphoDilateHor: horizontal dilation pass for reaches that may span more
// than one 32-bit word. One work-item per word (dimension 0 = word column,
// dimension 1 = row). The result is the OR of the source word with shifted
// copies of the row; words outside the row contribute 0.
//   sword  : source words
//   dword  : destination words
//   xp, xn : horizontal reaches in bits; nothing is written when both are < 1
//            (presumably the two sides of the structuring element - confirm
//            the left/right convention against the host code)
//   wpl    : words per row
//   h      : number of rows
//
// The bounds test checks row and column separately. The previous test
// (pos >= wpl*h) let a work-item with col >= wpl through whenever the
// NDRange was wider than wpl; such a work-item aliases a word of a later
// row and stores a value computed with the wrong column, racing with the
// work-item that owns that word.
\n__kernel void morphoDilateHor(__global int *sword,__global int *dword,const int xp, const int xn, const int wpl, const int h)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);
    const unsigned int pos = row * wpl + col;
    unsigned int parbitsxp, parbitsxn, nwords;
    unsigned int destword, tempword, lastword, currword;
    unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
    int i, j, siter, eiter;

    // Ignore work-items outside the wpl x h grid, and the degenerate case
    // where there is nothing to dilate by.
    if (row >= h || col >= wpl || (xn < 1 && xp < 1))
        return;

    currword = *(sword + pos);
    destword = currword;

    // Split the reach into whole words and leftover bits.
    parbitsxp = xp & 31;
    parbitsxn = xn & 31;
    nwords = xp >> 5;

    if (parbitsxp > 0)
        nwords += 1;
    else
        parbitsxp = 31;

    // First and last word columns touched by this work-item (may lie outside the row).
    siter = (col - nwords);
    eiter = (col + nwords);

    // Get prev word
    if (col==0)
        firstword = 0x0;
    else
        firstword = *(sword + pos - 1);

    // Get next word
    if (col == (wpl - 1))
        secondword = 0x0;
    else
        secondword = *(sword + pos + 1);

    // Last partial bits on either side
    for (i = 1; i <= parbitsxp; i++)
    {
        // Bits arriving from the left; the final step is suppressed when the
        // two partial reaches differ.
        tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0x0 : (firstword << (32-i)) | ((currword >> i));
        destword |= tempword;

        // Bits arriving from the right
        tempword = (currword << i) | (secondword >> (32 - i));
        destword |= tempword;
    }

    // Return if halfwidth <= 1 word
    if (nwords == 1)
    {
        if (xn == 32)
        {
            destword |= firstword;
        }
        if (xp == 32)
        {
            destword |= secondword;
        }

        *(dword + pos) = destword;
        return;
    }

    if (siter < 0)
        firstword = 0x0;
    else
        firstword = *(sword + row*wpl + siter);

    if (eiter >= wpl)
        lastword = 0x0;
    else
        lastword = *(sword + row*wpl + eiter);

    for ( i = 1; i < nwords; i++)
    {
        // Gets LHS words
        if ((siter + i) < 0)
            secondword = 0x0;
        else
            secondword = *(sword + row*wpl + siter + i);

        lprevword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;

        firstword = secondword;

        if ((siter + i + 1) < 0)
            secondword = 0x0;
        else
            secondword = *(sword + row*wpl + siter + i + 1);

        lnextword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;

        // Gets RHS words
        if ((eiter - i) >= wpl)
            firstword = 0x0;
        else
            firstword = *(sword + row*wpl + eiter - i);

        rnextword = firstword << parbitsxp | lastword >> (32 - parbitsxp);

        lastword = firstword;
        if ((eiter - i - 1) >= wpl)
            firstword = 0x0;
        else
            firstword = *(sword + row*wpl + eiter - i - 1);

        rprevword = firstword << parbitsxp | lastword >> (32 - parbitsxp);

        for (j = 1; j < 32; j++)
        {
            // OR LHS full words
            tempword = (lprevword << j) | (lnextword >> (32 - j));
            destword |= tempword;

            // OR RHS full words
            tempword = (rprevword << j) | (rnextword >> (32 - j));
            destword |= tempword;
        }

        destword |= lprevword;
        destword |= lnextword;
        destword |= rprevword;
        destword |= rnextword;

        lastword = firstword;
        firstword = secondword;
    }

    *(dword + pos) = destword;
}\n
|
|
|
|
)
|
|
|
|
|
|
|
|
KERNEL(
|
|
|
|
// morphoDilateHor_32word: horizontal dilation pass for reaches that only need
// the immediately neighbouring words. One work-item per word (dimension 0 =
// word column, dimension 1 = row). The result is the OR of the source word
// with the row shifted by 1..halfwidth bits in both directions; words
// outside the row contribute 0.
//   sword     : source words
//   dword     : destination words
//   halfwidth : largest shift applied; the shift expressions assume it is
//               below 32 - confirm against the host code
//   wpl       : words per row
//   h         : number of rows
//   isEven    : when non-zero, the final (i == halfwidth) contribution from
//               the left neighbour is skipped
//
// The bounds test checks row and column separately. The previous test
// (pos >= wpl*h) let a work-item with col >= wpl through whenever the
// NDRange was wider than wpl; such a work-item aliases a word of a later
// row and stores a value computed with the wrong column, racing with the
// work-item that owns that word.
\n__kernel void morphoDilateHor_32word(__global int *sword,__global int *dword,
                                     const int halfwidth,
                                     const int wpl, const int h,
                                     const char isEven)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);
    const unsigned int pos = row * wpl + col;
    unsigned int prevword, nextword, currword,tempword;
    unsigned int destword;
    int i;

    // Ignore work-items outside the wpl x h grid.
    if (row >= h || col >= wpl)
        return;

    currword = *(sword + pos);
    destword = currword;

    // Handle boundary conditions
    if(col==0)
        prevword=0;
    else
        prevword = *(sword + pos - 1);

    if(col==(wpl - 1))
        nextword=0;
    else
        nextword = *(sword + pos + 1);

    for (i = 1; i <= halfwidth; i++)
    {
        // Bits arriving from the left
        if (i == halfwidth && isEven)
        {
            tempword = 0x0;
        }
        else
        {
            tempword = (prevword << (32-i)) | ((currword >> i));
        }

        destword |= tempword;

        // Bits arriving from the right
        tempword = (currword << i) | (nextword >> (32 - i));

        destword |= tempword;
    }

    *(dword + pos) = destword;
}\n
|
|
|
|
)
|
|
|
|
|
|
|
|
KERNEL(
|
|
|
|
// morphoDilateVer: vertical dilation pass. One work-item per word
// (dimension 0 = word column, dimension 1 = row). The output word is the OR
// of the source words in the same column over rows [row - yn, row + yp],
// clipped to [0, h - 1].
//   sword : source words
//   dword : destination words
//   yp    : number of rows included below the current row
//   wpl   : words per row
//   h     : number of rows
//   yn    : number of rows included above the current row
\n__kernel void morphoDilateVer(__global int *sword,__global int *dword,
                              const int yp,
                              const int wpl, const int h,
                              const int yn)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const unsigned int idx = y * wpl + x;

    // Skip work-items outside the grid.
    if (y >= h || x >= wpl)
        return;

    // Clip the row window to the image.
    const int first = (y - yn) < 0 ? 0 : (y - yn);
    const int last = (y >= (h - yp)) ? (h - 1) : (y + yp);

    unsigned int acc = *(sword + idx);
    for (int r = first; r <= last; r++)
    {
        acc |= *(sword + r*wpl + x);
    }

    *(dword + idx) = acc;
}\n
|
|
|
|
)
|
|
|
|
|
|
|
|
KERNEL(
|
|
|
|
// morphoErodeHor_5x5: horizontal pass of an erosion with reach 2 bits.
// One work-item per 32-bit word, indexed linearly through dimension 0.
// Each output word is the AND of the source word with the row shifted by
// 1 and 2 bits in both directions; bits shifted in across a word boundary
// come from the neighbouring word, or are 1 at the first/last word of a row.
//   sword : source words
//   dword : destination words
//   wpl   : words per row
//   h     : number of rows
\n__kernel void morphoErodeHor_5x5(__global int *sword,__global int *dword,
                                 const int wpl, const int h)
{
    const unsigned int idx = get_global_id(0);
    const int x = idx % wpl;

    // Skip work-items past the last word.
    if (idx >= (wpl * h))
        return;

    const unsigned int cur = *(sword + idx);

    // Neighbouring words; all-ones is the neutral element for AND at row edges.
    const unsigned int left = (x == 0) ? 0xffffffff : *(sword + idx - 1);
    const unsigned int right = (x == (wpl - 1)) ? 0xffffffff : *(sword + idx + 1);

    unsigned int acc = cur;

    // Reach 1: shifted right (pulling bits from the left word) and shifted left.
    acc &= (left << 31) | (cur >> 1);
    acc &= (cur << 1) | (right >> 31);

    // Reach 2.
    acc &= (left << 30) | (cur >> 2);
    acc &= (cur << 2) | (right >> 30);

    *(dword + idx) = acc;
}\n
|
|
|
|
)
|
|
|
|
|
|
|
|
KERNEL(
|
|
|
|
// morphoErodeVer_5x5: vertical pass of an erosion with reach 2 rows.
// One work-item per word (dimension 0 = word column, dimension 1 = row).
// Words in the first two and last two rows are written as 0. Elsewhere the
// output is the AND of the five source words in rows row-2 .. row+2 of the
// same column; the first word of a row is additionally masked with fwmask
// and the last word with lwmask.
//   sword  : source words
//   dword  : destination words
//   wpl    : words per row
//   h      : number of rows
//   fwmask : mask applied to column 0
//   lwmask : mask applied to column wpl - 1
\n__kernel void morphoErodeVer_5x5(__global int *sword,__global int *dword,
                                 const int wpl, const int h,
                                 const int fwmask, const int lwmask)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const unsigned int idx = y * wpl + x;

    // Skip work-items outside the grid.
    if (y >= h || x >= wpl)
        return;

    unsigned int acc = *(sword + idx);

    if (y >= 2 && y < (h - 2))
    {
        // All four vertical neighbours exist, so no clamping is required.
        acc &= *(sword + (y - 2)*wpl + x);
        acc &= *(sword + (y - 1)*wpl + x);
        acc &= *(sword + (y + 1)*wpl + x);
        acc &= *(sword + (y + 2)*wpl + x);

        if (x == 0)
        {
            acc &= fwmask;
        }
        if (x == (wpl - 1))
        {
            acc &= lwmask;
        }
    }
    else
    {
        // Border rows are cleared outright.
        acc = 0x0;
    }

    *(dword + idx) = acc;
}\n
|
|
|
|
)
|
|
|
|
|
|
|
|
KERNEL(
|
2014-01-18 02:51:27 +08:00
|
|
|
// morphoErodeHor: horizontal erosion pass for reaches that may span more than
// one 32-bit word. One work-item per word (dimension 0 = word column,
// dimension 1 = row). The result is the AND of the source word with shifted
// copies of the row; words outside the row contribute all-ones.
//   sword        : source words
//   dword        : destination words
//   xp, xn       : horizontal reaches in bits; nothing is written when both
//                  are < 1 (presumably the two sides of the structuring
//                  element - confirm the convention against the host code)
//   wpl          : words per row
//   h            : number of rows
//   isAsymmetric : when non-zero, words near the row ends are masked/cleared
//   rwmask       : mask applied at the low-column end when isAsymmetric
//   lwmask       : mask applied at the high-column end when isAsymmetric
//
// The bounds test checks row and column separately. The previous test
// (pos >= wpl*h) let a work-item with col >= wpl through whenever the
// NDRange was wider than wpl; such a work-item aliases a word of a later
// row and stores a value computed with the wrong column, racing with the
// work-item that owns that word.
\n__kernel void morphoErodeHor(__global int *sword,__global int *dword, const int xp, const int xn, const int wpl,
                             const int h, const char isAsymmetric, const int rwmask, const int lwmask)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);
    const unsigned int pos = row * wpl + col;
    unsigned int parbitsxp, parbitsxn, nwords;
    unsigned int destword, tempword, lastword, currword;
    unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
    int i, j, siter, eiter;

    // Ignore work-items outside the wpl x h grid, and the degenerate case
    // where there is nothing to erode by.
    if (row >= h || col >= wpl || (xn < 1 && xp < 1))
        return;

    currword = *(sword + pos);
    destword = currword;

    // Split the reach into whole words and leftover bits.
    parbitsxp = xp & 31;
    parbitsxn = xn & 31;
    nwords = xp >> 5;

    if (parbitsxp > 0)
        nwords += 1;
    else
        parbitsxp = 31;

    // First and last word columns touched by this work-item (may lie outside the row).
    siter = (col - nwords);
    eiter = (col + nwords);

    // Get prev word
    if (col==0)
        firstword = 0xffffffff;
    else
        firstword = *(sword + pos - 1);

    // Get next word
    if (col == (wpl - 1))
        secondword = 0xffffffff;
    else
        secondword = *(sword + pos + 1);

    // Last partial bits on either side
    for (i = 1; i <= parbitsxp; i++)
    {
        // Bits arriving from the left
        tempword = (firstword << (32-i)) | ((currword >> i));
        destword &= tempword;

        // Bits arriving from the right; the final step is suppressed when the
        // two partial reaches differ.
        tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0xffffffff : (currword << i) | (secondword >> (32 - i));
        destword &= tempword;
    }

    // Return if halfwidth <= 1 word
    if (nwords == 1)
    {
        if (xp == 32)
        {
            destword &= firstword;
        }
        if (xn == 32)
        {
            destword &= secondword;
        }

        // Clear boundary pixels
        if (isAsymmetric)
        {
            if (col == 0)
                destword &= rwmask;
            if (col == (wpl - 1))
                destword &= lwmask;
        }

        *(dword + pos) = destword;
        return;
    }

    if (siter < 0)
        firstword = 0xffffffff;
    else
        firstword = *(sword + row*wpl + siter);

    if (eiter >= wpl)
        lastword = 0xffffffff;
    else
        lastword = *(sword + row*wpl + eiter);

    for ( i = 1; i < nwords; i++)
    {
        // Gets LHS words
        if ((siter + i) < 0)
            secondword = 0xffffffff;
        else
            secondword = *(sword + row*wpl + siter + i);

        lprevword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);

        firstword = secondword;

        if ((siter + i + 1) < 0)
            secondword = 0xffffffff;
        else
            secondword = *(sword + row*wpl + siter + i + 1);

        lnextword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);

        // Gets RHS words
        if ((eiter - i) >= wpl)
            firstword = 0xffffffff;
        else
            firstword = *(sword + row*wpl + eiter - i);

        rnextword = firstword << parbitsxn | lastword >> (32 - parbitsxn);

        lastword = firstword;
        if ((eiter - i - 1) >= wpl)
            firstword = 0xffffffff;
        else
            firstword = *(sword + row*wpl + eiter - i - 1);

        rprevword = firstword << parbitsxn | lastword >> (32 - parbitsxn);

        for (j = 0; j < 32; j++)
        {
            // AND LHS full words
            tempword = (lprevword << j) | (lnextword >> (32 - j));
            destword &= tempword;

            // AND RHS full words
            tempword = (rprevword << j) | (rnextword >> (32 - j));
            destword &= tempword;
        }

        destword &= lprevword;
        destword &= lnextword;
        destword &= rprevword;
        destword &= rnextword;

        lastword = firstword;
        firstword = secondword;
    }

    if (isAsymmetric)
    {
        // Clear boundary pixels
        if (col < (nwords - 1))
            destword = 0x0;
        else if (col == (nwords - 1))
            destword &= rwmask;
        else if (col > (wpl - nwords))
            destword = 0x0;
        else if (col == (wpl - nwords))
            destword &= lwmask;
    }

    *(dword + pos) = destword;
}\n
|
|
|
|
)
|
|
|
|
|
|
|
|
KERNEL(
|
|
|
|
// morphoErodeHor_32word: horizontal erosion pass for reaches that only need
// the immediately neighbouring words. One work-item per word (dimension 0 =
// word column, dimension 1 = row). The result is the AND of the source word
// with the row shifted by 1..halfwidth bits in both directions; words
// outside the row contribute all-ones.
//   sword          : source words
//   dword          : destination words
//   halfwidth      : largest shift applied; the shift expressions assume it
//                    is below 32 - confirm against the host code
//   wpl            : words per row
//   h              : number of rows
//   clearBoundPixH : when non-zero, the first word of a row is masked with
//                    rwmask and the last word with lwmask
//   rwmask, lwmask : masks used when clearBoundPixH is set
//   isEven         : when non-zero, the final (i == halfwidth) contribution
//                    from the right neighbour is skipped
//
// The bounds test checks row and column separately. The previous test
// (pos >= wpl*h) let a work-item with col >= wpl through whenever the
// NDRange was wider than wpl; such a work-item aliases a word of a later
// row and stores a value computed with the wrong column, racing with the
// work-item that owns that word.
\n__kernel void morphoErodeHor_32word(__global int *sword,__global int *dword,
                                    const int halfwidth, const int wpl,
                                    const int h, const char clearBoundPixH,
                                    const int rwmask, const int lwmask,
                                    const char isEven)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);
    const unsigned int pos = row * wpl + col;
    unsigned int prevword, nextword, currword,tempword, destword;
    int i;

    // Ignore work-items outside the wpl x h grid.
    if (row >= h || col >= wpl)
        return;

    currword = *(sword + pos);
    destword = currword;

    // Handle boundary conditions
    if(col==0)
        prevword=0xffffffff;
    else
        prevword = *(sword + pos - 1);

    if(col==(wpl - 1))
        nextword=0xffffffff;
    else
        nextword = *(sword + pos + 1);

    for (i = 1; i <= halfwidth; i++)
    {
        // Bits arriving from the left
        tempword = (prevword << (32-i)) | ((currword >> i));

        destword &= tempword;

        // Bits arriving from the right
        if (i == halfwidth && isEven)
        {
            tempword = 0xffffffff;
        }
        else
        {
            tempword = (currword << i) | (nextword >> (32 - i));
        }

        destword &= tempword;
    }

    if (clearBoundPixH)
    {
        if (col == 0)
        {
            destword &= rwmask;
        }
        else if (col == (wpl - 1))
        {
            destword &= lwmask;
        }
    }

    *(dword + pos) = destword;
}\n
|
|
|
|
)
|
|
|
|
|
|
|
|
KERNEL(
|
|
|
|
// morphoErodeVer: vertical erosion pass. One work-item per word
// (dimension 0 = word column, dimension 1 = row). The output word is the AND
// of the source words in the same column over rows [row - yp, row + yn],
// clipped to [0, h - 1]. When clearBoundPixV is non-zero, words whose row is
// within yp of the top or within yn of the bottom are written as 0 instead.
//   sword          : source words
//   dword          : destination words
//   yp             : number of rows included above the current row
//   wpl            : words per row
//   h              : number of rows
//   clearBoundPixV : enables clearing of the border rows
//   yn             : number of rows included below the current row
\n__kernel void morphoErodeVer(__global int *sword,__global int *dword,
                             const int yp,
                             const int wpl, const int h,
                             const char clearBoundPixV, const int yn)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const unsigned int idx = y * wpl + x;

    // Skip work-items outside the grid.
    if (y >= h || x >= wpl)
        return;

    // Clip the row window to the image.
    const int first = (y - yp) < 0 ? 0 : (y - yp);
    const int last = (y >= (h - yn)) ? (h - 1) : (y + yn);

    unsigned int acc = *(sword + idx);
    for (int r = first; r <= last; r++)
    {
        acc &= *(sword + r*wpl + x);
    }

    // Rows whose window was clipped are cleared on request.
    if (clearBoundPixV && ((y < yp) || ((h - y) <= yn)))
    {
        acc = 0x0;
    }

    *(dword + idx) = acc;
}\n
|
|
|
|
)
|
|
|
|
|
|
|
|
// HistogramRect Kernel: Accumulate
|
|
|
|
// assumes 4 channels, i.e., bytes_per_pixel = 4
|
|
|
|
// assumes number of pixels is multiple of 8
|
|
|
|
// data is laid out as
|
|
|
|
// ch0 ch1 ...
|
|
|
|
// bin0 bin1 bin2... bin0...
|
|
|
|
// rpt0,1,2...255 rpt0,1,2...
|
|
|
|
KERNEL(
|
|
|
|
// Constants shared by the histogram kernels in this program.
//   HIST_REDUNDANCY : counters kept per (channel, bin) to spread atomic traffic
//   GROUP_SIZE      : work-group size assumed by the reduction kernels
//   HIST_SIZE       : bins per channel
//   NUM_CHANNELS    : interleaved byte channels per pixel
//   HR_UNROLL_SIZE  : bytes consumed per vector load (HR_UNROLL_TYPE)
\n#define HIST_REDUNDANCY 256\n
\n#define GROUP_SIZE 256\n
\n#define HIST_SIZE 256\n
\n#define NUM_CHANNELS 4\n
\n#define HR_UNROLL_SIZE 8 \n
\n#define HR_UNROLL_TYPE uchar8 \n

// kernel_HistogramRectAllChannels: accumulation pass for 4-channel data.
// Each work-item walks the input in uchar8 steps (two 4-byte pixels per load),
// starting at its global id and advancing by the global size, and atomically
// increments one counter per byte. Counter index is
//   channel * HIST_SIZE * HIST_REDUNDANCY + value * HIST_REDUNDANCY + slot
// where slot is the global id modulo HIST_REDUNDANCY.
//   data       : input bytes viewed as uchar8 vectors
//   numPixels  : pixel count; numPixels * NUM_CHANNELS / HR_UNROLL_SIZE
//                vectors are read
//   histBuffer : counters, incremented in place (not cleared by this kernel)
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_HistogramRectAllChannels(
    __global const uchar8 *data,
    uint numPixels,
    __global uint *histBuffer) {

    // Redundant-copy slot used by this work-item.
    const int slot = get_global_id(0)%HIST_REDUNDANCY;

    // Number of uchar8 vectors to consume.
    const uint numVectors = numPixels*NUM_CHANNELS/HR_UNROLL_SIZE;

    for ( uint v = get_global_id(0); v < numVectors; v += get_global_size(0) ) {
        const uchar8 px = data[v];

        // Lanes k and k + 4 belong to channel k of two consecutive pixels.
        atomic_inc( &histBuffer[ 0*HIST_SIZE*HIST_REDUNDANCY + px.s0*HIST_REDUNDANCY + slot ]); // ch0
        atomic_inc( &histBuffer[ 0*HIST_SIZE*HIST_REDUNDANCY + px.s4*HIST_REDUNDANCY + slot ]); // ch0
        atomic_inc( &histBuffer[ 1*HIST_SIZE*HIST_REDUNDANCY + px.s1*HIST_REDUNDANCY + slot ]); // ch1
        atomic_inc( &histBuffer[ 1*HIST_SIZE*HIST_REDUNDANCY + px.s5*HIST_REDUNDANCY + slot ]); // ch1
        atomic_inc( &histBuffer[ 2*HIST_SIZE*HIST_REDUNDANCY + px.s2*HIST_REDUNDANCY + slot ]); // ch2
        atomic_inc( &histBuffer[ 2*HIST_SIZE*HIST_REDUNDANCY + px.s6*HIST_REDUNDANCY + slot ]); // ch2
        atomic_inc( &histBuffer[ 3*HIST_SIZE*HIST_REDUNDANCY + px.s3*HIST_REDUNDANCY + slot ]); // ch3
        atomic_inc( &histBuffer[ 3*HIST_SIZE*HIST_REDUNDANCY + px.s7*HIST_REDUNDANCY + slot ]); // ch3
    }
}
|
|
|
|
)
|
|
|
|
|
2013-12-10 18:52:54 +08:00
|
|
|
KERNEL(
|
|
|
|
// NUM_CHANNELS = 1
|
|
|
|
__attribute__((reqd_work_group_size(256, 1, 1)))
|
|
|
|
__kernel
|
|
|
|
void kernel_HistogramRectOneChannel(
|
|
|
|
__global const uchar8 *data,
|
|
|
|
uint numPixels,
|
|
|
|
__global uint *histBuffer) {
|
|
|
|
|
|
|
|
// declare variables
|
|
|
|
uchar8 pixels;
|
|
|
|
int threadOffset = get_global_id(0)%HIST_REDUNDANCY;
|
|
|
|
|
|
|
|
// for each pixel/channel, accumulate in global memory
|
|
|
|
for ( uint pc = get_global_id(0); pc < numPixels/HR_UNROLL_SIZE; pc += get_global_size(0) ) {
|
|
|
|
pixels = data[pc];
|
|
|
|
// bin thread
|
|
|
|
atomic_inc( &histBuffer[ pixels.s0*HIST_REDUNDANCY + threadOffset ]);
|
|
|
|
atomic_inc( &histBuffer[ pixels.s1*HIST_REDUNDANCY + threadOffset ]);
|
|
|
|
atomic_inc( &histBuffer[ pixels.s2*HIST_REDUNDANCY + threadOffset ]);
|
|
|
|
atomic_inc( &histBuffer[ pixels.s3*HIST_REDUNDANCY + threadOffset ]);
|
|
|
|
atomic_inc( &histBuffer[ pixels.s4*HIST_REDUNDANCY + threadOffset ]);
|
|
|
|
atomic_inc( &histBuffer[ pixels.s5*HIST_REDUNDANCY + threadOffset ]);
|
|
|
|
atomic_inc( &histBuffer[ pixels.s6*HIST_REDUNDANCY + threadOffset ]);
|
|
|
|
atomic_inc( &histBuffer[ pixels.s7*HIST_REDUNDANCY + threadOffset ]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
KERNEL(
|
|
|
|
// unused
|
|
|
|
\n __attribute__((reqd_work_group_size(256, 1, 1)))
|
|
|
|
\n __kernel
|
|
|
|
\n void kernel_HistogramRectAllChannels_Grey(
|
|
|
|
\n __global const uchar* data,
|
|
|
|
\n uint numPixels,
|
|
|
|
\n __global uint *histBuffer) { // each wg will write HIST_SIZE*NUM_CHANNELS into this result; cpu will accumulate across wg's
|
|
|
|
\n
|
|
|
|
\n /* declare variables */
|
|
|
|
\n
|
|
|
|
\n // work indices
|
|
|
|
\n size_t groupId = get_group_id(0);
|
|
|
|
\n size_t localId = get_local_id(0); // 0 -> 256-1
|
|
|
|
\n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
|
|
|
|
\n uint numThreads = get_global_size(0);
|
|
|
|
\n
|
|
|
|
\n /* accumulate in global memory */
|
|
|
|
\n for ( uint pc = get_global_id(0); pc < numPixels; pc += get_global_size(0) ) {
|
|
|
|
\n uchar value = data[ pc ];
|
|
|
|
\n int idx = value * get_global_size(0) + get_global_id(0);
|
|
|
|
\n histBuffer[ idx ]++;
|
|
|
|
\n
|
|
|
|
\n }
|
|
|
|
\n
|
|
|
|
\n } // kernel_HistogramRectAllChannels_Grey
|
|
|
|
|
|
|
|
)
|
|
|
|
|
2013-11-12 01:43:13 +08:00
|
|
|
// HistogramRect Kernel: Reduction
|
2013-12-10 18:52:54 +08:00
|
|
|
// only supports 4 channels
|
2013-11-12 01:43:13 +08:00
|
|
|
// each work group handles a single channel of a single histogram bin
|
|
|
|
KERNEL(
|
|
|
|
__attribute__((reqd_work_group_size(256, 1, 1)))
|
|
|
|
__kernel
|
|
|
|
void kernel_HistogramRectAllChannelsReduction(
|
|
|
|
int n, // unused pixel redundancy
|
2014-01-18 02:51:27 +08:00
|
|
|
__global uint *histBuffer,
|
2013-12-10 18:52:54 +08:00
|
|
|
__global int* histResult) {
|
2013-11-12 01:43:13 +08:00
|
|
|
|
|
|
|
// declare variables
|
|
|
|
int channel = get_group_id(0)/HIST_SIZE;
|
|
|
|
int bin = get_group_id(0)%HIST_SIZE;
|
2013-12-10 18:52:54 +08:00
|
|
|
int value = 0;
|
2013-11-12 01:43:13 +08:00
|
|
|
|
|
|
|
// accumulate in register
|
|
|
|
for ( uint i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) {
|
|
|
|
value += histBuffer[ channel*HIST_SIZE*HIST_REDUNDANCY+bin*HIST_REDUNDANCY+i];
|
|
|
|
}
|
|
|
|
|
|
|
|
// reduction in local memory
|
2013-12-10 18:52:54 +08:00
|
|
|
__local int localHist[GROUP_SIZE];
|
2013-11-12 01:43:13 +08:00
|
|
|
localHist[get_local_id(0)] = value;
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
|
|
|
|
if (get_local_id(0) < stride) {
|
|
|
|
value = localHist[ get_local_id(0)+stride];
|
|
|
|
}
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
if (get_local_id(0) < stride) {
|
|
|
|
localHist[ get_local_id(0)] += value;
|
|
|
|
}
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
}
|
|
|
|
|
|
|
|
// write reduction to final result
|
|
|
|
if (get_local_id(0) == 0) {
|
|
|
|
histResult[get_group_id(0)] = localHist[0];
|
|
|
|
}
|
|
|
|
} // kernel_HistogramRectAllChannels
|
|
|
|
)
|
|
|
|
|
2013-12-10 18:52:54 +08:00
|
|
|
|
|
|
|
KERNEL(
|
|
|
|
// NUM_CHANNELS = 1
|
|
|
|
__attribute__((reqd_work_group_size(256, 1, 1)))
|
|
|
|
__kernel
|
|
|
|
void kernel_HistogramRectOneChannelReduction(
|
|
|
|
int n, // unused pixel redundancy
|
|
|
|
__global uint *histBuffer,
|
|
|
|
__global int* histResult) {
|
|
|
|
|
|
|
|
// declare variables
|
|
|
|
// int channel = get_group_id(0)/HIST_SIZE;
|
|
|
|
int bin = get_group_id(0)%HIST_SIZE;
|
|
|
|
int value = 0;
|
|
|
|
|
|
|
|
// accumulate in register
|
|
|
|
for ( int i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) {
|
|
|
|
value += histBuffer[ bin*HIST_REDUNDANCY+i];
|
|
|
|
}
|
|
|
|
|
|
|
|
// reduction in local memory
|
|
|
|
__local int localHist[GROUP_SIZE];
|
|
|
|
localHist[get_local_id(0)] = value;
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
|
|
|
|
if (get_local_id(0) < stride) {
|
|
|
|
value = localHist[ get_local_id(0)+stride];
|
|
|
|
}
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
if (get_local_id(0) < stride) {
|
|
|
|
localHist[ get_local_id(0)] += value;
|
|
|
|
}
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
}
|
|
|
|
|
|
|
|
// write reduction to final result
|
|
|
|
if (get_local_id(0) == 0) {
|
|
|
|
histResult[get_group_id(0)] = localHist[0];
|
|
|
|
}
|
|
|
|
} // kernel_HistogramRectOneChannelReduction
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
KERNEL(
|
|
|
|
// unused
|
|
|
|
// each work group (x256) handles a histogram bin
|
|
|
|
\n __attribute__((reqd_work_group_size(256, 1, 1)))
|
|
|
|
\n __kernel
|
|
|
|
\n void kernel_HistogramRectAllChannelsReduction_Grey(
|
|
|
|
\n int n, // pixel redundancy that needs to be accumulated
|
|
|
|
\n __global uint *histBuffer,
|
|
|
|
\n __global uint* histResult) { // each wg accumulates 1 bin
|
|
|
|
\n
|
|
|
|
\n /* declare variables */
|
|
|
|
\n
|
|
|
|
\n // work indices
|
|
|
|
\n size_t groupId = get_group_id(0);
|
|
|
|
\n size_t localId = get_local_id(0); // 0 -> 256-1
|
|
|
|
\n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
|
|
|
|
\n uint numThreads = get_global_size(0);
|
|
|
|
\n unsigned int hist = 0;
|
|
|
|
\n
|
|
|
|
\n /* accumulate in global memory */
|
|
|
|
\n for ( uint p = 0; p < n; p+=GROUP_SIZE) {
|
|
|
|
\n hist += histBuffer[ (get_group_id(0)*n + p)];
|
|
|
|
\n }
|
|
|
|
\n
|
|
|
|
\n /* reduction in local memory */
|
|
|
|
\n // populate local memory
|
|
|
|
\n __local unsigned int localHist[GROUP_SIZE];
|
|
|
|
|
|
|
|
\n localHist[localId] = hist;
|
|
|
|
\n barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
\n
|
|
|
|
\n for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
|
|
|
|
\n if (localId < stride) {
|
|
|
|
\n hist = localHist[ (localId+stride)];
|
|
|
|
\n }
|
|
|
|
\n barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
\n if (localId < stride) {
|
|
|
|
\n localHist[ localId] += hist;
|
|
|
|
\n }
|
|
|
|
\n barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
\n }
|
|
|
|
\n
|
|
|
|
\n if (localId == 0)
|
|
|
|
\n histResult[get_group_id(0)] = localHist[0];
|
|
|
|
\n
|
|
|
|
\n } // kernel_HistogramRectAllChannelsReduction_Grey
|
|
|
|
|
|
|
|
)
|
|
|
|
|
2013-11-12 01:43:13 +08:00
|
|
|
// ThresholdRectToPix Kernel
|
2013-12-10 18:52:54 +08:00
|
|
|
// only supports 4 channels
|
2013-11-12 01:43:13 +08:00
|
|
|
// imageData is input image (interleaved; the kernel indexes NUM_CHANNELS bytes per pixel, although this note originally said 24-bits/pixel)
|
|
|
|
// pix is output image (1-bit/pixel)
|
|
|
|
KERNEL(
|
|
|
|
// Layout constants shared by the ThresholdRectToPix kernels: one output word
// covers PIXELS_PER_WORD pixels and is assembled from bursts of
// PIXELS_PER_BURST pixels, each burst loaded as uchar8 vectors.
\n#define CHAR_VEC_WIDTH 8 \n
\n#define PIXELS_PER_WORD 32 \n
\n#define PIXELS_PER_BURST 8 \n
\n#define BURSTS_PER_WORD (PIXELS_PER_WORD/PIXELS_PER_BURST) \n
// One burst of NUM_CHANNELS-byte pixels, accessible either as the uchar8
// vectors it is loaded with (v) or as individual channel bytes (s).
typedef union {
    uchar8 v[(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH];
    uchar s[PIXELS_PER_BURST*NUM_CHANNELS];
} charVec;
|
|
|
|
|
|
|
|
__attribute__((reqd_work_group_size(256, 1, 1)))
|
|
|
|
__kernel
|
|
|
|
void kernel_ThresholdRectToPix(
|
|
|
|
__global const uchar8 *imageData,
|
|
|
|
int height,
|
|
|
|
int width,
|
|
|
|
int wpl, // words per line
|
|
|
|
__global int *thresholds,
|
|
|
|
__global int *hi_values,
|
|
|
|
__global int *pix) {
|
|
|
|
|
|
|
|
// declare variables
|
|
|
|
int pThresholds[NUM_CHANNELS];
|
|
|
|
int pHi_Values[NUM_CHANNELS];
|
|
|
|
for ( int i = 0; i < NUM_CHANNELS; i++) {
|
|
|
|
pThresholds[i] = thresholds[i];
|
|
|
|
pHi_Values[i] = hi_values[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
// for each word (32 pixels) in output image
|
|
|
|
for ( uint w = get_global_id(0); w < wpl*height; w += get_global_size(0) ) {
|
|
|
|
unsigned int word = 0; // all bits start at zero
|
|
|
|
|
|
|
|
// for each burst in word
|
|
|
|
for ( int b = 0; b < BURSTS_PER_WORD; b++) {
|
|
|
|
|
|
|
|
// load burst
|
|
|
|
charVec pixels;
|
|
|
|
for ( int i = 0; i < (PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH; i++ ) {
|
|
|
|
pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH) + i];
|
|
|
|
}
|
|
|
|
|
|
|
|
// for each pixel in burst
|
|
|
|
for ( int p = 0; p < PIXELS_PER_BURST; p++) {
|
|
|
|
for ( int c = 0; c < NUM_CHANNELS; c++) {
|
|
|
|
unsigned char pixChan = pixels.s[p*NUM_CHANNELS + c];
|
|
|
|
if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
|
2013-12-10 18:52:54 +08:00
|
|
|
word |= (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
pix[w] = word;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// only supports 1 channel
|
|
|
|
typedef union {
|
|
|
|
uchar s[PIXELS_PER_BURST];
|
|
|
|
uchar8 v[(PIXELS_PER_BURST)/CHAR_VEC_WIDTH];
|
|
|
|
} charVec1;
|
|
|
|
|
|
|
|
__attribute__((reqd_work_group_size(256, 1, 1)))
|
|
|
|
__kernel
|
|
|
|
void kernel_ThresholdRectToPix_OneChan(
|
|
|
|
__global const uchar8 *imageData,
|
|
|
|
int height,
|
|
|
|
int width,
|
|
|
|
int wpl, // words per line
|
|
|
|
__global int *thresholds,
|
|
|
|
__global int *hi_values,
|
|
|
|
__global int *pix) {
|
|
|
|
|
|
|
|
// declare variables
|
|
|
|
int pThresholds[1];
|
|
|
|
int pHi_Values[1];
|
|
|
|
for ( int i = 0; i < 1; i++) {
|
|
|
|
pThresholds[i] = thresholds[i];
|
|
|
|
pHi_Values[i] = hi_values[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
// for each word (32 pixels) in output image
|
|
|
|
for ( uint w = get_global_id(0); w < wpl*height; w += get_global_size(0) ) {
|
|
|
|
unsigned int word = 0; // all bits start at zero
|
|
|
|
|
|
|
|
// for each burst in word
|
|
|
|
for ( int b = 0; b < BURSTS_PER_WORD; b++) {
|
|
|
|
|
|
|
|
// load burst
|
|
|
|
charVec1 pixels;
|
|
|
|
for ( int i = 0; i < (PIXELS_PER_BURST)/CHAR_VEC_WIDTH; i++ ) {
|
|
|
|
pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + i];
|
|
|
|
}
|
|
|
|
|
|
|
|
// for each pixel in burst
|
|
|
|
for ( int p = 0; p < PIXELS_PER_BURST; p++) {
|
|
|
|
for ( int c = 0; c < 1; c++) {
|
|
|
|
unsigned char pixChan = pixels.s[p + c];
|
|
|
|
if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
|
2013-11-12 01:43:13 +08:00
|
|
|
word |= (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
pix[w] = word;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
)
|
|
|
|
|
|
|
|
; // close char*
|
|
|
|
|
|
|
|
#endif // USE_EXTERNAL_KERNEL
|
|
|
|
#endif //_OCL_KERNEL_H_
|
|
|
|
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|
|
|
|
|
|
|
|
// Alternative histogram kernel written to use uchar and different global memory scattered write
|
|
|
|
// was a little better for Intel platforms but still not faster than native serial code
|
|
|
|
#if 0
|
|
|
|
/* data laid out as
|
|
|
|
bin0 bin1 bin2...
|
|
|
|
r,g,b,a,r,g,b,a,r,g,b,a nthreads/4 copies
|
|
|
|
*/
|
|
|
|
\n__attribute__((reqd_work_group_size(256, 1, 1)))
|
|
|
|
\n __kernel
|
|
|
|
\n void kernel_HistogramRectAllChannels_uchar(
|
|
|
|
\n volatile __global const uchar *data,
|
|
|
|
\n uint numPixels,
|
2014-01-18 02:51:27 +08:00
|
|
|
\n volatile __global uint *histBuffer) {
|
2013-11-12 01:43:13 +08:00
|
|
|
\n
|
|
|
|
\n // for each pixel/channel, accumulate in global memory
|
|
|
|
\n for ( uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS; pc += get_global_size(0) ) {
|
|
|
|
\n uchar value = data[pc];
|
|
|
|
\n int idx = value*get_global_size(0) + get_global_id(0);
|
|
|
|
\n histBuffer[ idx ]++; // coalesced if same value
|
|
|
|
\n }
|
|
|
|
\n } // kernel_HistogramRectAllChannels
|
|
|
|
\n
|
|
|
|
\n __attribute__((reqd_work_group_size(256, 1, 1)))
|
|
|
|
\n __kernel
|
|
|
|
\n void kernel_HistogramRectAllChannelsReduction_uchar(
|
|
|
|
\n int n, // pixel redundancy that needs to be accumulated = nthreads/4
|
2014-01-18 02:51:27 +08:00
|
|
|
\n __global uint4 *histBuffer,
|
2013-11-12 01:43:13 +08:00
|
|
|
\n __global uint* histResult) { // each wg accumulates 1 bin (all channels within it
|
|
|
|
\n
|
|
|
|
\n // declare variables
|
|
|
|
\n int binIdx = get_group_id(0);
|
|
|
|
\n size_t groupId = get_group_id(0);
|
|
|
|
\n size_t localId = get_local_id(0); // 0 -> 256-1
|
|
|
|
\n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
|
|
|
|
\n uint numThreads = get_global_size(0);
|
|
|
|
\n uint4 hist = {0, 0, 0, 0};
|
|
|
|
\n
|
|
|
|
\n // accumulate in register
|
|
|
|
\n for ( uint p = get_local_id(0); p < n; p+=GROUP_SIZE) {
|
|
|
|
\n hist += histBuffer[binIdx*n+p];
|
|
|
|
\n }
|
|
|
|
\n
|
|
|
|
\n // reduction in local memory
|
|
|
|
\n __local uint4 localHist[GROUP_SIZE];
|
|
|
|
\n localHist[localId] = hist;
|
|
|
|
\n barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
\n
|
|
|
|
\n for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
|
|
|
|
\n if (localId < stride) {
|
|
|
|
\n hist = localHist[ localId+stride];
|
|
|
|
\n }
|
|
|
|
\n barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
\n if (localId < stride) {
|
|
|
|
\n localHist[ localId] += hist;
|
|
|
|
\n }
|
|
|
|
\n barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
\n }
|
|
|
|
\n
|
|
|
|
\n // write reduction to final result
|
|
|
|
\n if (localId == 0) {
|
|
|
|
\n histResult[0*HIST_SIZE+binIdx] = localHist[0].s0;
|
|
|
|
\n histResult[1*HIST_SIZE+binIdx] = localHist[0].s1;
|
|
|
|
\n histResult[2*HIST_SIZE+binIdx] = localHist[0].s2;
|
|
|
|
\n histResult[3*HIST_SIZE+binIdx] = localHist[0].s3;
|
|
|
|
\n }
|
|
|
|
\n
|
|
|
|
\n } // kernel_HistogramRectAllChannels
|
|
|
|
#endif
|