#ifndef _OCL_KERNEL_H_
#define _OCL_KERNEL_H_
#ifndef USE_EXTERNAL_KERNEL
// KERNEL(...) converts the OpenCL C text passed as its arguments into a
// string literal (via the # operator) and appends a "\n" literal, so that
// consecutive KERNEL(...) uses concatenate into one source string.
// A \n written inside the arguments becomes a newline in that string; the
// kernels below use it to put OpenCL preprocessor directives on their own line.
#define KERNEL( ... )# __VA_ARGS__ "\n"
// Double precision is a default of spreadsheets
// cl_khr_fp64: Khronos extension
// cl_amd_fp64: AMD extension
// use build option outside to define fp_t
/////////////////////////////////////////////
const char *kernel_src = KERNEL(
\n#ifdef KHR_DP_EXTENSION\n
\n#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n
\n#elif AMD_DP_EXTENSION\n
\n#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n
\n#else\n
\n#endif\n
// composeRGBPixel: one work-item per pixel (dimension 0 = column, dimension 1 = row).
// Reads the word at tiffdata[row * w + column], takes its three low bytes and
// stores them in the three high bytes of output[row * wpl + column], in reversed
// byte order; the low byte of the output word is left as zero.
__kernel void composeRGBPixel(__global uint *tiffdata, int w, int h, int wpl, __global uint *output)
{
    const int y = get_global_id(1);
    const int x = get_global_id(0);

    // Work-items outside the w x h rectangle have nothing to do.
    if ((y < h) && (x < w)) {
        const int packed = tiffdata[y * w + x];
        const int byte0 = packed & 0xff;
        const int byte1 = (packed >> 8) & 0xff;
        const int byte2 = (packed >> 16) & 0xff;

        // Byte k of the input lands in byte (sizeof(uint) - 1 - k) of the output.
        output[y * wpl + x] = (byte0 << (8 * (sizeof(uint) - 1)))
                            | (byte1 << (8 * (sizeof(uint) - 2)))
                            | (byte2 << (8 * (sizeof(uint) - 3)));
    }
}
)

KERNEL(
// pixSubtract_inplace: one work-item per word (dimension 0 = word column,
// dimension 1 = row). Clears in dword every bit that is set in sword.
\n__kernel void pixSubtract_inplace(__global int *dword, __global int *sword,
                            const int wpl, const int h)
{
    const unsigned int y = get_global_id(1);
    const unsigned int x = get_global_id(0);

    // Only work-items inside the wpl x h grid of words touch memory.
    if (y < h && x < wpl) {
        const unsigned int idx = y * wpl + x;

        dword[idx] &= ~sword[idx];
    }
}\n
)

KERNEL(
// pixSubtract: one work-item per word (dimension 0 = word column,
// dimension 1 = row). Writes dword AND NOT sword into outword.
\n__kernel void pixSubtract(__global int *dword, __global int *sword,
                            const int wpl, const int h, __global int *outword)
{
    const unsigned int y = get_global_id(1);
    const unsigned int x = get_global_id(0);

    // Only work-items inside the wpl x h grid of words touch memory.
    if (y < h && x < wpl) {
        const unsigned int idx = y * wpl + x;

        outword[idx] = dword[idx] & ~sword[idx];
    }
}\n
)

KERNEL(
// pixAND: one work-item per word (dimension 0 = word column,
// dimension 1 = row). Writes the bitwise AND of dword and sword into outword.
\n__kernel void pixAND(__global int *dword, __global int *sword, __global int *outword,
                            const int wpl, const int h)
{
    const unsigned int y = get_global_id(1);
    const unsigned int x = get_global_id(0);

    // Only work-items inside the wpl x h grid of words touch memory.
    if (y < h && x < wpl) {
        const unsigned int idx = y * wpl + x;

        outword[idx] = dword[idx] & sword[idx];
    }
}\n
)

KERNEL(
// pixOR: one work-item per word (dimension 0 = word column,
// dimension 1 = row). Writes the bitwise OR of dword and sword into outword.
\n__kernel void pixOR(__global int *dword, __global int *sword, __global int *outword,
                            const int wpl, const int h)
{
    const unsigned int y = get_global_id(1);
    const unsigned int x = get_global_id(0);

    // Only work-items inside the wpl x h grid of words touch memory.
    if (y < h && x < wpl) {
        const unsigned int idx = y * wpl + x;

        outword[idx] = dword[idx] | sword[idx];
    }
}\n
)

KERNEL(
// morphoDilateHor_5x5: one work-item per word, indexed linearly by dimension 0.
// Each output word is the OR of the source word with copies of itself shifted
// by 1 and 2 bits in both directions; bits shifted in come from the previous
// and next word of the same row, or are zero at the row ends.
\n__kernel void morphoDilateHor_5x5(__global int *sword,__global int *dword,
                            const int wpl, const int h)
{
    const unsigned int idx = get_global_id(0);
    const int wordInRow = idx % wpl;
    unsigned int left, centre, right, acc;

    // Ignore the excess work-items
    if (idx >= (wpl * h))
        return;

    centre = sword[idx];

    // Neighbours outside the row contribute no set bits.
    left  = (wordInRow == 0) ? 0 : sword[idx - 1];
    right = (wordInRow == (wpl - 1)) ? 0 : sword[idx + 1];

    acc = centre;

    // Shift by one bit in each direction
    acc |= (left << 31) | (centre >> 1);
    acc |= (centre << 1) | (right >> 31);

    // Shift by two bits in each direction
    acc |= (left << 30) | (centre >> 2);
    acc |= (centre << 2) | (right >> 30);

    dword[idx] = acc;
}\n
)

KERNEL(
// morphoDilateVer_5x5: one work-item per word (dimension 0 = word column,
// dimension 1 = row). Each output word is the OR of the source word with the
// words one and two rows above and below it in the same column. A neighbour
// row that would fall outside the image is replaced by the current row.
\n__kernel void morphoDilateVer_5x5(__global int *sword,__global int *dword,
                            const int wpl, const int h)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const unsigned int idx = y * wpl + x;
    unsigned int acc;
    int k;

    // Ignore the excess work-items
    if (y >= h || x >= wpl)
        return;

    acc = sword[idx];

    for (k = 1; k <= 2; k++)
    {
        // Fall back to the current row when the neighbour is out of range.
        const int above = ((y - k) < 0) ? y : (y - k);
        const int below = (y >= (h - k)) ? y : (y + k);

        acc |= sword[above * wpl + x];
        acc |= sword[below * wpl + x];
    }

    dword[idx] = acc;
}\n
)

KERNEL(
// morphoDilateHor: one work-item per word (dimension 0 = word column,
// dimension 1 = row). The output word starts as the source word and has
// bit-shifted combinations of the source word and its row neighbours ORed in.
// Words that would lie outside the row are treated as 0.
//   sword, dword : source and destination word buffers, indexed row * wpl + col
//   xp, xn       : shift extents; presumably the structuring-element reach on
//                  each side - TODO confirm against the host code
//   wpl, h       : words per line and number of rows
// Nothing is written when the word is out of range or when both xn and xp are < 1.
\n__kernel void morphoDilateHor(__global int *sword,__global int *dword,const int xp, const int xn, const int wpl, const int h)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);
    const unsigned int pos = row * wpl + col;
    unsigned int parbitsxp, parbitsxn, nwords;
    unsigned int destword, tempword, lastword, currword;
    unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
    int i, j, siter, eiter;
    
    //Ignore the excess
    if (pos >= (wpl*h) || (xn < 1 && xp < 1))
        return;

    currword = *(sword + pos);
    destword = currword;

    // Split the extents into a bit count within a word (low 5 bits) and a
    // whole-word count derived from xp.
    parbitsxp = xp & 31;
    parbitsxn = xn & 31;
    nwords = xp >> 5;

    // A partial word counts as one more word; when xp is a multiple of 32
    // the bit loop below runs over 31 shifts instead.
    if (parbitsxp > 0)
        nwords += 1;
    else
        parbitsxp = 31;

    // Columns of the farthest words reached on the left and on the right
    siter = (col - nwords);
    eiter = (col + nwords);

    //Get prev word
    if (col==0)
        firstword = 0x0;
    else
        firstword = *(sword + pos - 1);
    
    //Get next word
    if (col == (wpl - 1))
        secondword = 0x0;
    else
        secondword = *(sword + pos + 1);

    //Last partial bits on either side
    for (i = 1; i <= parbitsxp; i++)
    {
        //Get the max value on LHS of every pixel
        // The last LHS shift is skipped when the two partial bit counts differ.
        tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0x0 : (firstword << (32-i)) | ((currword >> i));
        
        destword |= tempword;

        //Get max value on RHS of every pixel
        tempword = (currword << i) | (secondword >> (32 - i));
        destword |= tempword;
    }

    //Return if halfwidth <= 1 word
    if (nwords == 1)
    {
        // An extent of exactly 32 also pulls in the whole neighbouring word.
        if (xn == 32)
        {
            destword |= firstword;
        }
        if (xp == 32)
        {
            destword |= secondword;
        }

        *(dword + pos) = destword;
        return;
    }

    // Farthest word on each side, or 0 when it lies outside the row
    if (siter < 0)
        firstword = 0x0;
    else
        firstword = *(sword + row*wpl + siter);

    if (eiter >= wpl)   
        lastword = 0x0;
    else
        lastword = *(sword + row*wpl + eiter);
    
    // Walk inwards from both ends, one word per iteration.
    for ( i = 1; i < nwords; i++)
    {
        //Gets LHS words
        if ((siter + i) < 0)
            secondword = 0x0;
        else
            secondword = *(sword + row*wpl + siter + i);

        // Two adjacent LHS words re-aligned by parbitsxn bits
        lprevword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
        
        firstword = secondword;

        if ((siter + i + 1) < 0)
            secondword = 0x0;
        else
            secondword = *(sword + row*wpl + siter + i + 1);
        
        lnextword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;

        //Gets RHS words
        if ((eiter - i) >= wpl)
            firstword = 0x0;
        else
            firstword = *(sword + row*wpl + eiter - i);
            
        // Two adjacent RHS words re-aligned by parbitsxp bits
        rnextword = firstword << parbitsxp | lastword >> (32 - parbitsxp);

        lastword = firstword;
        if ((eiter - i - 1) >= wpl)
            firstword = 0x0;
        else
            firstword = *(sword + row*wpl + eiter - i - 1);

        rprevword = firstword << parbitsxp | lastword >> (32 - parbitsxp);

        // OR in every bit offset 1..31 across each re-aligned word pair
        for (j = 1; j < 32; j++)
        {
            //OR LHS full words
            tempword = (lprevword << j) | (lnextword >> (32 - j));
            destword |= tempword;

            //OR RHS full words
            tempword = (rprevword << j) | (rnextword >> (32 - j));
            destword |= tempword;
        }

        destword |= lprevword;
        destword |= lnextword;
        destword |= rprevword;
        destword |= rnextword;

        // Carry the inner words over as the outer words of the next iteration
        lastword = firstword;
        firstword = secondword;
    }
    
    *(dword + pos) = destword;
}\n
)

KERNEL(
// morphoDilateHor_32word: one work-item per word (dimension 0 = word column,
// dimension 1 = row). ORs the source word with copies shifted by 1..halfwidth
// bits in both directions, taking the shifted-in bits from the neighbouring
// words of the same row (zero at the row ends). When isEven is non-zero the
// last (halfwidth) shift is applied on one side only.
\n__kernel void morphoDilateHor_32word(__global int *sword,__global int *dword,
                            const int halfwidth,
                            const int wpl, const int h,
                            const char isEven)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const unsigned int idx = y * wpl + x;
    unsigned int left, centre, right, acc;
    int shift;

    // Ignore the excess work-items
    if (idx >= (wpl * h))
        return;

    centre = sword[idx];

    // Neighbours outside the row contribute no set bits.
    left  = (x == 0) ? 0 : sword[idx - 1];
    right = (x == (wpl - 1)) ? 0 : sword[idx + 1];

    acc = centre;
    for (shift = 1; shift <= halfwidth; shift++)
    {
        // The final right-shifted term is left out for an even size.
        if (!(shift == halfwidth && isEven))
        {
            acc |= (left << (32 - shift)) | (centre >> shift);
        }

        acc |= (centre << shift) | (right >> (32 - shift));
    }

    dword[idx] = acc;
}\n
)

KERNEL(
// morphoDilateVer: one work-item per word (dimension 0 = word column,
// dimension 1 = row). The output word is the OR of the source words in the
// same column from row - yn to row + yp, with the range clipped to the image.
\n__kernel void morphoDilateVer(__global int *sword,__global int *dword,
                            const int yp,
                            const int wpl, const int h,
                            const int yn)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const unsigned int idx = y * wpl + x;
    unsigned int acc;
    int firstRow, lastRow, r;

    // Ignore the excess work-items
    if (y >= h || x >= wpl)
        return;

    acc = sword[idx];

    // Clip the row range to [0, h - 1]
    firstRow = (y - yn) < 0 ? 0 : (y - yn);
    lastRow = (y >= (h - yp)) ? (h - 1) : (y + yp);

    for (r = firstRow; r <= lastRow; r++)
    {
        acc |= sword[r * wpl + x];
    }

    dword[idx] = acc;
}\n
)

KERNEL(
// morphoErodeHor_5x5: one work-item per word, indexed linearly by dimension 0.
// Each output word is the AND of the source word with copies of itself shifted
// by 1 and 2 bits in both directions; bits shifted in come from the previous
// and next word of the same row, or are all ones at the row ends.
\n__kernel void morphoErodeHor_5x5(__global int *sword,__global int *dword,
                            const int wpl, const int h)
{
    const unsigned int idx = get_global_id(0);
    const int wordInRow = idx % wpl;
    unsigned int left, centre, right, acc;

    // Ignore the excess work-items
    if (idx >= (wpl * h))
        return;

    centre = sword[idx];

    // Neighbours outside the row are all ones so they do not clear any bit here.
    left  = (wordInRow == 0) ? 0xffffffff : sword[idx - 1];
    right = (wordInRow == (wpl - 1)) ? 0xffffffff : sword[idx + 1];

    acc = centre;

    // Shift by one bit in each direction
    acc &= (left << 31) | (centre >> 1);
    acc &= (centre << 1) | (right >> 31);

    // Shift by two bits in each direction
    acc &= (left << 30) | (centre >> 2);
    acc &= (centre << 2) | (right >> 30);

    dword[idx] = acc;
}\n
)

KERNEL(
// morphoErodeVer_5x5: one work-item per word (dimension 0 = word column,
// dimension 1 = row). For rows at least two away from the top and bottom the
// output is the AND of the source word with the words one and two rows above
// and below it; the first and last word of such a row are additionally masked
// with fwmask and lwmask. The two top and two bottom rows are written as zero.
\n__kernel void morphoErodeVer_5x5(__global int *sword,__global int *dword,
                            const int wpl, const int h,
                            const int fwmask, const int lwmask)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const unsigned int idx = y * wpl + x;
    unsigned int acc;

    // Ignore the excess work-items
    if (y >= h || x >= wpl)
        return;

    acc = sword[idx];

    if (y >= 2 && y < (h - 2))
    {
        // All four neighbour rows exist, so no clamping is needed.
        acc &= sword[(y - 2) * wpl + x];
        acc &= sword[(y - 1) * wpl + x];
        acc &= sword[(y + 1) * wpl + x];
        acc &= sword[(y + 2) * wpl + x];

        if (x == 0)
        {
            acc &= fwmask;
        }
        if (x == (wpl - 1))
        {
            acc &= lwmask;
        }
    }
    else
    {
        acc = 0x0;
    }

    dword[idx] = acc;
}\n
)

KERNEL(
// morphoErodeHor: one work-item per word (dimension 0 = word column,
// dimension 1 = row). The output word starts as the source word and has
// bit-shifted combinations of the source word and its row neighbours ANDed in.
// Words that would lie outside the row are treated as 0xffffffff.
//   sword, dword   : source and destination word buffers, indexed row * wpl + col
//   xp, xn         : shift extents; presumably the structuring-element reach on
//                    each side - TODO confirm against the host code
//   wpl, h         : words per line and number of rows
//   isAsymmetric   : when non-zero, words near the row ends are masked or cleared
//   rwmask, lwmask : masks ANDed into the boundary words when isAsymmetric is set
// Nothing is written when the word is out of range or when both xn and xp are < 1.
\n__kernel void morphoErodeHor(__global int *sword,__global int *dword, const int xp, const int xn, const int wpl, 
                                const int h, const char isAsymmetric, const int rwmask, const int lwmask)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);
    const unsigned int pos = row * wpl + col;
    unsigned int parbitsxp, parbitsxn, nwords;
    unsigned int destword, tempword, lastword, currword;
    unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
    int i, j, siter, eiter;

    //Ignore the excess
    if (pos >= (wpl*h) || (xn < 1 && xp < 1))
        return;

    currword = *(sword + pos);
    destword = currword;

    // Split the extents into a bit count within a word (low 5 bits) and a
    // whole-word count derived from xp.
    parbitsxp = xp & 31;
    parbitsxn = xn & 31;
    nwords = xp >> 5;

    // A partial word counts as one more word; when xp is a multiple of 32
    // the bit loop below runs over 31 shifts instead.
    if (parbitsxp > 0)
        nwords += 1;
    else
        parbitsxp = 31;

    // Columns of the farthest words reached on the left and on the right
    siter = (col - nwords);
    eiter = (col + nwords);

    //Get prev word
    if (col==0)
        firstword = 0xffffffff;
    else
        firstword = *(sword + pos - 1);
    
    //Get next word
    if (col == (wpl - 1))
        secondword = 0xffffffff;
    else
        secondword = *(sword + pos + 1);

    //Last partial bits on either side
    for (i = 1; i <= parbitsxp; i++)
    {
        //Get the min value on LHS of every pixel
        tempword = (firstword << (32-i)) | ((currword >> i));
        destword &= tempword;

        //Get min value on RHS of every pixel
        // The last RHS shift is skipped when the two partial bit counts differ.
        tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0xffffffff : (currword << i) | (secondword >> (32 - i));
        
        //tempword = (currword << i) | (secondword >> (32 - i));
        destword &= tempword;
    }

    //Return if halfwidth <= 1 word
    if (nwords == 1)
    {
        // An extent of exactly 32 also pulls in the whole neighbouring word.
        if (xp == 32)
        {
            destword &= firstword;
        }
        if (xn == 32)
        {
            destword &= secondword;
        }

        //Clear boundary pixels
        if (isAsymmetric)
        {
            if (col == 0)
                destword &= rwmask;
            if (col == (wpl - 1))
                destword &= lwmask;
        }

        *(dword + pos) = destword;
        return;
    }
    
    // Farthest word on each side, or all ones when it lies outside the row
    if (siter < 0)
        firstword = 0xffffffff;
    else
        firstword = *(sword + row*wpl + siter);

    if (eiter >= wpl)   
        lastword = 0xffffffff;
    else
        lastword = *(sword + row*wpl + eiter);
    
    
    // Walk inwards from both ends, one word per iteration.
    for ( i = 1; i < nwords; i++)
    {
        //Gets LHS words
        if ((siter + i) < 0)
            secondword = 0xffffffff;
        else
            secondword = *(sword + row*wpl + siter + i);

        // Two adjacent LHS words re-aligned by parbitsxp bits
        lprevword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);
        
        firstword = secondword;

        if ((siter + i + 1) < 0)
            secondword = 0xffffffff;
        else
            secondword = *(sword + row*wpl + siter + i + 1);
        
        lnextword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);

        //Gets RHS words
        if ((eiter - i) >= wpl)
            firstword = 0xffffffff;
        else
            firstword = *(sword + row*wpl + eiter - i);
            
        // Two adjacent RHS words re-aligned by parbitsxn bits
        rnextword = firstword << parbitsxn | lastword >> (32 - parbitsxn);

        lastword = firstword;
        if ((eiter - i - 1) >= wpl)
            firstword = 0xffffffff;
        else
            firstword = *(sword + row*wpl + eiter - i - 1);

        rprevword = firstword << parbitsxn | lastword >> (32 - parbitsxn);

        // NOTE(review): j starts at 0 here, whereas the matching loop in
        // morphoDilateHor starts at 1 - confirm this difference is intended.
        for (j = 0; j < 32; j++)
        {
            //AND LHS full words
            tempword = (lprevword << j) | (lnextword >> (32 - j));
            destword &= tempword;

            //AND RHS full words
            tempword = (rprevword << j) | (rnextword >> (32 - j));
            destword &= tempword;
        }

        destword &= lprevword;
        destword &= lnextword;
        destword &= rprevword;
        destword &= rnextword;

        // Carry the inner words over as the outer words of the next iteration
        lastword = firstword;
        firstword = secondword;
    }
    
    if (isAsymmetric)
    {
        //Clear boundary pixels
        // Words nearer a row end than nwords - 1 are zeroed; the words exactly
        // at that distance are masked with rwmask (left) or lwmask (right).
        if (col < (nwords - 1))
            destword = 0x0;
        else if (col == (nwords - 1))
            destword &= rwmask;
        else if (col > (wpl - nwords))
            destword = 0x0;
        else if (col == (wpl - nwords))
            destword &= lwmask;
    }

    *(dword + pos) = destword;
}\n
)

KERNEL(
// morphoErodeHor_32word: one work-item per word (dimension 0 = word column,
// dimension 1 = row). ANDs the source word with copies shifted by 1..halfwidth
// bits in both directions, taking the shifted-in bits from the neighbouring
// words of the same row (all ones at the row ends). When isEven is non-zero
// the last (halfwidth) shift is applied on one side only. When clearBoundPixH
// is non-zero the first word of a row is masked with rwmask and the last word
// with lwmask.
\n__kernel void morphoErodeHor_32word(__global int *sword,__global int *dword,
                            const int halfwidth, const int wpl,
                            const int h, const char clearBoundPixH,
                            const int rwmask, const int lwmask,
                            const char isEven)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const unsigned int idx = y * wpl + x;
    unsigned int left, centre, right, acc;
    int shift;

    // Ignore the excess work-items
    if (idx >= (wpl * h))
        return;

    centre = sword[idx];

    // Neighbours outside the row are all ones so they do not clear any bit here.
    left  = (x == 0) ? 0xffffffff : sword[idx - 1];
    right = (x == (wpl - 1)) ? 0xffffffff : sword[idx + 1];

    acc = centre;
    for (shift = 1; shift <= halfwidth; shift++)
    {
        acc &= (left << (32 - shift)) | (centre >> shift);

        // The final left-shifted term is left out for an even size.
        if (!(shift == halfwidth && isEven))
        {
            acc &= (centre << shift) | (right >> (32 - shift));
        }
    }

    if (clearBoundPixH)
    {
        if (x == 0)
        {
            acc &= rwmask;
        }
        else if (x == (wpl - 1))
        {
            acc &= lwmask;
        }
    }

    dword[idx] = acc;
}\n
)

KERNEL(
// morphoErodeVer: one work-item per word (dimension 0 = word column,
// dimension 1 = row). The output word is the AND of the source words in the
// same column from row - yp to row + yn, with the range clipped to the image.
// When clearBoundPixV is non-zero, rows whose range had to be clipped
// (row < yp or h - row <= yn) are written as zero instead.
\n__kernel void morphoErodeVer(__global int *sword,__global int *dword,
                            const int yp,
                            const int wpl, const int h,
                            const char clearBoundPixV, const int yn)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const unsigned int idx = y * wpl + x;
    unsigned int acc;
    int firstRow, lastRow, r;

    // Ignore the excess work-items
    if (y >= h || x >= wpl)
        return;

    acc = sword[idx];

    // Clip the row range to [0, h - 1]
    firstRow = (y - yp) < 0 ? 0 : (y - yp);
    lastRow = (y >= (h - yn)) ? (h - 1) : (y + yn);

    for (r = firstRow; r <= lastRow; r++)
    {
        acc &= sword[r * wpl + x];
    }

    // Clear boundary pixels
    if (clearBoundPixV && ((y < yp) || ((h - y) <= yn)))
    {
        acc = 0x0;
    }

    dword[idx] = acc;
}\n
)

// HistogramRect Kernel: Accumulate
// assumes 4 channels, i.e., bytes_per_pixel = 4
// assumes the number of pixels is a multiple of 8
// data is laid out as
// ch0                                           ch1 ...
// bin0          bin1            bin2...         bin0...
// rpt0,1,2...256  rpt0,1,2...
KERNEL(
\n#define HIST_REDUNDANCY 256\n
\n#define GROUP_SIZE 256\n
\n#define HIST_SIZE 256\n
\n#define NUM_CHANNELS 4\n
\n#define HR_UNROLL_SIZE 8 \n
\n#define HR_UNROLL_TYPE uchar8 \n

// kernel_HistogramRectAllChannels: every work-item walks the input in steps of
// the global size, loading 8 bytes (two 4-channel pixels) at a time, and
// atomically increments one counter per byte. Counters are indexed by
// channel, then bin (the byte value), then copy; the copy used is the global
// id modulo HIST_REDUNDANCY.
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_HistogramRectAllChannels(
    __global const uchar8 *data,
    uint numPixels,
    __global uint *histBuffer) {

    const int copy = get_global_id(0)%HIST_REDUNDANCY;
    const uint numVecs = numPixels*NUM_CHANNELS/HR_UNROLL_SIZE;

    // Start of each channel region, already offset to this work-item copy
    __global uint *ch0 = histBuffer + 0*HIST_SIZE*HIST_REDUNDANCY + copy;
    __global uint *ch1 = histBuffer + 1*HIST_SIZE*HIST_REDUNDANCY + copy;
    __global uint *ch2 = histBuffer + 2*HIST_SIZE*HIST_REDUNDANCY + copy;
    __global uint *ch3 = histBuffer + 3*HIST_SIZE*HIST_REDUNDANCY + copy;

    for ( uint v = get_global_id(0); v < numVecs; v += get_global_size(0) ) {
        const uchar8 px = data[v];

        // Lanes 0-3 hold the first pixel and lanes 4-7 the second one.
        atomic_inc( &ch0[ px.s0*HIST_REDUNDANCY ]);
        atomic_inc( &ch0[ px.s4*HIST_REDUNDANCY ]);
        atomic_inc( &ch1[ px.s1*HIST_REDUNDANCY ]);
        atomic_inc( &ch1[ px.s5*HIST_REDUNDANCY ]);
        atomic_inc( &ch2[ px.s2*HIST_REDUNDANCY ]);
        atomic_inc( &ch2[ px.s6*HIST_REDUNDANCY ]);
        atomic_inc( &ch3[ px.s3*HIST_REDUNDANCY ]);
        atomic_inc( &ch3[ px.s7*HIST_REDUNDANCY ]);
    }
}
)

KERNEL(
// NUM_CHANNELS = 1
// kernel_HistogramRectOneChannel: every work-item walks the input in steps of
// the global size, loading 8 single-channel pixels at a time, and atomically
// increments one counter per pixel. Counters are indexed by bin (the pixel
// value), then copy; the copy used is the global id modulo HIST_REDUNDANCY.
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_HistogramRectOneChannel(
    __global const uchar8 *data,
    uint numPixels,
    __global uint *histBuffer) {

    const int copy = get_global_id(0)%HIST_REDUNDANCY;
    const uint numVecs = numPixels/HR_UNROLL_SIZE;

    // Base of the counters, already offset to this work-item copy
    __global uint *bins = histBuffer + copy;

    for ( uint v = get_global_id(0); v < numVecs; v += get_global_size(0) ) {
        const uchar8 px = data[v];

        atomic_inc( &bins[ px.s0*HIST_REDUNDANCY ]);
        atomic_inc( &bins[ px.s1*HIST_REDUNDANCY ]);
        atomic_inc( &bins[ px.s2*HIST_REDUNDANCY ]);
        atomic_inc( &bins[ px.s3*HIST_REDUNDANCY ]);
        atomic_inc( &bins[ px.s4*HIST_REDUNDANCY ]);
        atomic_inc( &bins[ px.s5*HIST_REDUNDANCY ]);
        atomic_inc( &bins[ px.s6*HIST_REDUNDANCY ]);
        atomic_inc( &bins[ px.s7*HIST_REDUNDANCY ]);
    }
}
)


KERNEL(
// unused
// kernel_HistogramRectAllChannels_Grey: every work-item walks the byte input
// in steps of the global size and increments a counter of its own for each
// byte value it sees. Counters are indexed by byte value, then global id, so
// no two work-items share a counter and the increment is not atomic.
\n  __attribute__((reqd_work_group_size(256, 1, 1)))
\n  __kernel
\n  void kernel_HistogramRectAllChannels_Grey(
\n      __global const uchar* data,
\n      uint numPixels,
\n        __global uint *histBuffer) { // each wg will write HIST_SIZE*NUM_CHANNELS into this result; cpu will accumulate across wg's
\n  
\n      /* declare variables */
\n  
\n      // work indices
      // NOTE(review): none of the four locals below is referenced again in this kernel.
\n      size_t groupId = get_group_id(0);
\n      size_t localId = get_local_id(0); // 0 -> 256-1
\n      size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
\n      uint numThreads = get_global_size(0);
\n  
\n      /* accumulate in global memory */
\n      for ( uint pc = get_global_id(0); pc < numPixels; pc += get_global_size(0) ) {
\n          uchar value = data[ pc ];
          // one counter per (byte value, work-item) pair
\n          int idx = value * get_global_size(0) + get_global_id(0);
\n           histBuffer[ idx ]++;
\n          
\n      }
\n      
\n  } // kernel_HistogramRectAllChannels_Grey

)

// HistogramRect Kernel: Reduction
// only supports 4 channels
// each work group handles a single channel of a single histogram bin
KERNEL(
// kernel_HistogramRectAllChannelsReduction: each work-group sums the
// HIST_REDUNDANCY copies of one (channel, bin) counter, selected by the group
// id, and writes the total to histResult[group id]. The parameter n is not
// used.
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_HistogramRectAllChannelsReduction(
    int n, // unused pixel redundancy
    __global uint *histBuffer,
    __global int* histResult) {

    __local int localHist[GROUP_SIZE];

    const size_t lid = get_local_id(0);
    const int channel = get_group_id(0)/HIST_SIZE;
    const int bin     = get_group_id(0)%HIST_SIZE;
    // First copy of the counter this work-group is responsible for
    __global uint *copies = histBuffer + channel*HIST_SIZE*HIST_REDUNDANCY + bin*HIST_REDUNDANCY;
    int value = 0;

    // Each work-item accumulates its share of the copies in a register.
    for ( uint i = lid; i < HIST_REDUNDANCY; i += GROUP_SIZE) {
        value += copies[i];
    }

    // Tree reduction in local memory; the read and the update of each step are
    // separated by a barrier.
    localHist[lid] = value;
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
        const int active = (lid < stride);

        if (active) {
            value = localHist[lid + stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if (active) {
            localHist[lid] += value;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // The first work-item publishes the total.
    if (lid == 0) {
        histResult[get_group_id(0)] = localHist[0];
    }
} // kernel_HistogramRectAllChannelsReduction
)


KERNEL(
// NUM_CHANNELS = 1
// kernel_HistogramRectOneChannelReduction: each work-group sums the
// HIST_REDUNDANCY copies of one bin counter, selected by the group id modulo
// HIST_SIZE, and writes the total to histResult[group id]. The parameter n is
// not used.
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_HistogramRectOneChannelReduction(
    int n, // unused pixel redundancy
    __global uint *histBuffer,
    __global int* histResult) {

    __local int localHist[GROUP_SIZE];

    const size_t lid = get_local_id(0);
    const int bin    = get_group_id(0)%HIST_SIZE;
    // First copy of the counter this work-group is responsible for
    __global uint *copies = histBuffer + bin*HIST_REDUNDANCY;
    int value = 0;

    // Each work-item accumulates its share of the copies in a register.
    for ( int i = lid; i < HIST_REDUNDANCY; i += GROUP_SIZE) {
        value += copies[i];
    }

    // Tree reduction in local memory; the read and the update of each step are
    // separated by a barrier.
    localHist[lid] = value;
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
        const int active = (lid < stride);

        if (active) {
            value = localHist[lid + stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if (active) {
            localHist[lid] += value;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // The first work-item publishes the total.
    if (lid == 0) {
        histResult[get_group_id(0)] = localHist[0];
    }
} // kernel_HistogramRectOneChannelReduction
)


KERNEL(
// unused
// kernel_HistogramRectAllChannelsReduction_Grey: each work-group (x256) handles
// one histogram bin, selected by the group id. The n partial counts of that
// bin, stored contiguously at histBuffer[group id * n], are summed and the
// total is written to histResult[group id].
//
// Work-item k accumulates entries k, k + GROUP_SIZE, k + 2*GROUP_SIZE, ... so
// that every entry is counted exactly once. The loop used to start at 0 for
// every work-item, which made all work-items add up the same entries and
// skipped the rest.
\n  __attribute__((reqd_work_group_size(256, 1, 1)))
\n  __kernel
\n  void kernel_HistogramRectAllChannelsReduction_Grey(
\n      int n, // pixel redundancy that needs to be accumulated
\n      __global uint *histBuffer,
\n      __global uint* histResult) { // each wg accumulates 1 bin
\n
\n      /* declare variables */
\n      size_t localId = get_local_id(0); // 0 -> 256-1
\n      unsigned int hist = 0;
\n
\n      /* accumulate this work-item share of the bin in a register */
\n      for ( uint p = localId; p < n; p+=GROUP_SIZE) {
\n          hist += histBuffer[ (get_group_id(0)*n + p)];
\n      }
\n
\n      /* reduction in local memory */
\n      // populate local memory
\n      __local unsigned int localHist[GROUP_SIZE];

\n      localHist[localId] = hist;
\n      barrier(CLK_LOCAL_MEM_FENCE);
\n
\n      // the read and the update of each step are separated by a barrier
\n      for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
\n          if (localId < stride) {
\n              hist = localHist[ (localId+stride)];
\n          }
\n          barrier(CLK_LOCAL_MEM_FENCE);
\n          if (localId < stride) {
\n              localHist[ localId] += hist;
\n          }
\n          barrier(CLK_LOCAL_MEM_FENCE);
\n      }
\n
\n      // the first work-item publishes the total
\n      if (localId == 0)
\n          histResult[get_group_id(0)] = localHist[0];
\n
\n  } // kernel_HistogramRectAllChannelsReduction_Grey

)

// ThresholdRectToPix Kernel
// only supports 4 channels
// imageData is input image (24-bits/pixel)
// pix is output image (1-bit/pixel)
KERNEL(
\n#define CHAR_VEC_WIDTH 8 \n
\n#define PIXELS_PER_WORD 32 \n
\n#define PIXELS_PER_BURST 8 \n
\n#define BURSTS_PER_WORD (PIXELS_PER_WORD/PIXELS_PER_BURST) \n
// charVec: one burst of PIXELS_PER_BURST pixels with NUM_CHANNELS bytes each,
// viewable either as individual bytes (s) or as uchar8 vectors (v).
 typedef union {
  uchar s[PIXELS_PER_BURST*NUM_CHANNELS];
  uchar8 v[(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH];
 } charVec;

// kernel_ThresholdRectToPix: every work-item walks the output words in steps
// of the global size, up to wpl*height words. For each output word it reads
// PIXELS_PER_WORD pixels of NUM_CHANNELS bytes each from imageData and sets
// the bit of a pixel when, for at least one channel c, hi_values[c] >= 0 and
// (byte > thresholds[c]) equals (hi_values[c] == 0). The first pixel of a word
// maps to the most significant bit. The parameter width is not referenced.
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_ThresholdRectToPix(
    __global const uchar8 *imageData,
    int height,
    int width,
    int wpl, // words per line
    __global int *thresholds,
    __global int *hi_values,
    __global int *pix) {

    // declare variables
    // private copies of the per-channel thresholds and hi values
    int pThresholds[NUM_CHANNELS];
    int pHi_Values[NUM_CHANNELS];
    for ( int i = 0; i < NUM_CHANNELS; i++) {
        pThresholds[i] = thresholds[i];
        pHi_Values[i] = hi_values[i];
    }

    // for each word (32 pixels) in output image
    for ( uint w = get_global_id(0); w < wpl*height; w += get_global_size(0) ) {
        unsigned int word = 0; // all bits start at zero

        // for each burst in word
        for ( int b = 0; b < BURSTS_PER_WORD; b++) {

            // load burst
            // index = word offset + burst offset + vector within the burst, in uchar8 units
            charVec pixels;
            for ( int i = 0; i < (PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH; i++ ) {
                pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH)  + i];
            }

            // for each pixel in burst
            for ( int p = 0; p < PIXELS_PER_BURST; p++) {
                for ( int c = 0; c < NUM_CHANNELS; c++) {
                    unsigned char pixChan = pixels.s[p*NUM_CHANNELS + c];
                    // channels with a negative hi value never set the bit
                    if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
                        // pixel (b*PIXELS_PER_BURST+p) of the word, counted from the top bit
                        word |=  (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
                    }
                }
            }
        }
        pix[w] = word;
    }
}

// only supports 1 channel
// charVec1: one burst of PIXELS_PER_BURST single-byte pixels, viewable either
// as individual bytes (s) or as uchar8 vectors (v).
 typedef union {
  uchar s[PIXELS_PER_BURST];
  uchar8 v[(PIXELS_PER_BURST)/CHAR_VEC_WIDTH];
 } charVec1;

// kernel_ThresholdRectToPix_OneChan: same scheme as kernel_ThresholdRectToPix
// but with one byte per pixel; only thresholds[0] and hi_values[0] are read.
// The parameter width is not referenced.
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_ThresholdRectToPix_OneChan(
    __global const uchar8 *imageData,
    int height,
    int width,
    int wpl, // words per line
    __global int *thresholds,
    __global int *hi_values,
    __global int *pix) {

    // declare variables
    // private copies of the single threshold and hi value
    int pThresholds[1];
    int pHi_Values[1];
    for ( int i = 0; i < 1; i++) {
        pThresholds[i] = thresholds[i];
        pHi_Values[i] = hi_values[i];
    }

    // for each word (32 pixels) in output image
    for ( uint w = get_global_id(0); w < wpl*height; w += get_global_size(0) ) {
        unsigned int word = 0; // all bits start at zero

        // for each burst in word
        for ( int b = 0; b < BURSTS_PER_WORD; b++) {

            // load burst
            // index = word offset + burst offset + vector within the burst, in uchar8 units
            charVec1 pixels;
            for ( int i = 0; i < (PIXELS_PER_BURST)/CHAR_VEC_WIDTH; i++ ) {
                pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST)/CHAR_VEC_WIDTH)  + i];
            }

            // for each pixel in burst
            for ( int p = 0; p < PIXELS_PER_BURST; p++) {
                for ( int c = 0; c < 1; c++) {
                    unsigned char pixChan = pixels.s[p + c];
                    // a negative hi value never sets the bit
                    if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
                        // pixel (b*PIXELS_PER_BURST+p) of the word, counted from the top bit
                        word |=  (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
                    }
                }
            }
        }
        pix[w] = word;
    }
}
)

 ; // close char*

#endif // USE_EXTERNAL_KERNEL
#endif //_OCL_KERNEL_H_
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

// Alternative histogram kernel written to use uchar and a different scattered global-memory write pattern.
// It was a little better on Intel platforms but still not faster than native serial code.
#if 0
/*  data layed out as
    bin0                                        bin1                            bin2...
    r,g,b,a,r,g,b,a,r,g,b,a nthreads/4 copies
*/
\n__attribute__((reqd_work_group_size(256, 1, 1)))
\n  __kernel
\n  void kernel_HistogramRectAllChannels_uchar(
\n      volatile __global const uchar  *data,
\n                              uint   numPixels,
\n      volatile __global       uint   *histBuffer) {
\n      
\n      // for each pixel/channel, accumulate in global memory
\n      for ( uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS; pc += get_global_size(0) ) {
\n          uchar value = data[pc];
\n          int idx = value*get_global_size(0) + get_global_id(0);
\n          histBuffer[ idx ]++; // coalesced if same value
\n      }
\n  } // kernel_HistogramRectAllChannels
\n
\n  __attribute__((reqd_work_group_size(256, 1, 1)))
\n  __kernel
\n  void kernel_HistogramRectAllChannelsReduction_uchar(
\n      int n, // pixel redundancy that needs to be accumulated = nthreads/4
\n      __global uint4 *histBuffer,
\n      __global uint* histResult) { // each wg accumulates 1 bin (all channels within it
\n  
\n      // declare variables
\n      int binIdx     = get_group_id(0);
\n      size_t groupId = get_group_id(0);
\n      size_t localId = get_local_id(0); // 0 -> 256-1
\n      size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
\n      uint numThreads = get_global_size(0);
\n      uint4 hist = {0, 0, 0, 0};
\n
\n      // accumulate in register
\n      for ( uint p = get_local_id(0); p < n; p+=GROUP_SIZE) {
\n          hist += histBuffer[binIdx*n+p];
\n      }
\n  
\n      // reduction in local memory
\n      __local uint4 localHist[GROUP_SIZE];
\n      localHist[localId] = hist;
\n      barrier(CLK_LOCAL_MEM_FENCE);
\n  
\n      for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
\n          if (localId < stride) {
\n              hist = localHist[ localId+stride];
\n          }
\n          barrier(CLK_LOCAL_MEM_FENCE);
\n          if (localId < stride) {
\n              localHist[ localId] += hist;
\n          }
\n          barrier(CLK_LOCAL_MEM_FENCE);
\n      }
\n
\n      // write reduction to final result
\n      if (localId == 0) {
\n          histResult[0*HIST_SIZE+binIdx] = localHist[0].s0;
\n          histResult[1*HIST_SIZE+binIdx] = localHist[0].s1;
\n          histResult[2*HIST_SIZE+binIdx] = localHist[0].s2;
\n          histResult[3*HIST_SIZE+binIdx] = localHist[0].s3;
\n      }
\n  
\n  } // kernel_HistogramRectAllChannels
#endif