// Mirror of https://github.com/opencv/opencv.git (synced 2025-01-06 02:08:12 +08:00).
// 954 lines, 37 KiB. The hosting page labels this file "Common Lisp"; the content below is OpenCL C.
// Floating-point type setup shared by the kernels in this file.
//
// With DOUBLE_SUPPORT defined: the fp64 extension is enabled (cl_khr_fp64 when
// that macro is defined, otherwise cl_amd_fp64 when that one is) and T is double.
//
// Without DOUBLE_SUPPORT: the names double, double4 and convert_double4 are
// aliased to their float counterparts, and T is long.
//
// NOTE(review): `double` is a reserved type name in OpenCL C, so
// `typedef float double;` may be rejected by a conforming compiler -- confirm
// against the targeted drivers before relying on the fallback branch.
// NOTE(review): with T == long the fallback branch makes icvContourMoments
// convert its float inputs to integers -- confirm this is intended.
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
typedef double T;

#else
typedef float double;
typedef float4 double4;
typedef long T;
#define convert_double4 convert_float4
#endif
//#pragma OPENCL EXTENSION cl_amd_printf:enable
//#if defined (DOUBLE_SUPPORT)
// Row numbers used by icvContourMoments when storing into dst_a: the value
// for term Apq of edge idx is written at dst_a[DST_ROW_Apq * row_stride + idx].
#define DST_ROW_A00     0
#define DST_ROW_A10     1
#define DST_ROW_A01     2
#define DST_ROW_A20     3
#define DST_ROW_A11     4
#define DST_ROW_A02     5
#define DST_ROW_A30     6
#define DST_ROW_A21     7
#define DST_ROW_A12     8
#define DST_ROW_A03     9

// icvContourMoments: per-edge contribution terms of the contour moments.
//
// Work-item idx (global id along dimension 0) handles the edge that goes from
// point idx to point idx+1; the last point (idx == contour_total - 1) is paired
// with point 0, so the contour is treated as closed.
//
//   contour_total       number of points; work-items with idx outside
//                       [0, contour_total) return without writing.
//   reader_oclmat_data  interleaved point coordinates: x at [2*idx], y at [2*idx+1].
//   dst_a               output; ten rows (DST_ROW_A00 .. DST_ROW_A03), one
//                       column per edge.
//   dst_step            row stride of dst_a in bytes (divided by sizeof(T) below).
//
// Only the unscaled per-edge terms are produced here; any summation over edges
// or normalisation is not done in this kernel.
__kernel void icvContourMoments(int contour_total,
                                __global float* reader_oclmat_data,
                                __global T* dst_a,
                                int dst_step)
{
    T xi_1, yi_1, xi_12, yi_12, xi, yi, xi2, yi2, dxy, xii_1, yii_1;
    int idx = get_global_id(0);

    if (idx < 0 || idx >= contour_total)
        return;

    // Start point of the edge and its squares.
    xi_1 = (T)(*(reader_oclmat_data + (get_global_id(0) << 1)));
    yi_1 = (T)(*(reader_oclmat_data + (get_global_id(0) << 1) + 1));
    xi_12 = xi_1 * xi_1;
    yi_12 = yi_1 * yi_1;

    // End point of the edge: the next point, wrapping to point 0 for the last one.
    if(idx == contour_total - 1)
    {
        xi = (T)(*(reader_oclmat_data));
        yi = (T)(*(reader_oclmat_data + 1));
    }
    else
    {
        xi = (T)(*(reader_oclmat_data + (idx + 1) * 2));
        yi = (T)(*(reader_oclmat_data + (idx + 1) * 2 + 1));
    }

    xi2 = xi * xi;
    yi2 = yi * yi;
    dxy = xi_1 * yi - xi * yi_1;   // cross product of the two end points
    xii_1 = xi_1 + xi;
    yii_1 = yi_1 + yi;

    // Convert the byte stride into a stride counted in T elements.
    dst_step /= sizeof(T);
    *( dst_a + DST_ROW_A00 * dst_step + idx) = dxy;
    *( dst_a + DST_ROW_A10 * dst_step + idx) = dxy * xii_1;
    *( dst_a + DST_ROW_A01 * dst_step + idx) = dxy * yii_1;
    *( dst_a + DST_ROW_A20 * dst_step + idx) = dxy * (xi_1 * xii_1 + xi2);
    *( dst_a + DST_ROW_A11 * dst_step + idx) = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi));
    *( dst_a + DST_ROW_A02 * dst_step + idx) = dxy * (yi_1 * yii_1 + yi2);
    *( dst_a + DST_ROW_A30 * dst_step + idx) = dxy * xii_1 * (xi_12 + xi2);
    *( dst_a + DST_ROW_A03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2);
    *( dst_a + DST_ROW_A21 * dst_step + idx) =
        dxy * (xi_12 * (3 * yi_1 + yi) + 2 * xi * xi_1 * yii_1 +
               xi2 * (yi_1 + 3 * yi));
    *( dst_a + DST_ROW_A12 * dst_step + idx) =
        dxy * (yi_12 * (3 * xi_1 + xi) + 2 * yi * yi_1 * xii_1 +
               yi2 * (xi_1 + 3 * xi));
}
//#endif

//#if defined (DOUBLE_SUPPORT)
// CvMoments_D0: per-tile raw moments (m00 .. m03) of an 8-bit image.
//
// Work-group (wgidy, wgidx) handles the tile whose first pixel is at
// (x, y) = (wgidx*TILE_SIZE, wgidy*TILE_SIZE).  The work-item with local id
// lidy (dimension 0) sums row y+lidy of the tile; the rows are then combined
// through local memory and the work-item with lidy == 0 && lidx == 0 writes one
// value per moment at index wgidy*dst_cols+wgidx of each dst_m** buffer.
//
//   src_data        source image; NOTE: it is written to -- for a partial-width
//                   tile the row is zero-filled from the valid width up to rstep.
//   src_rows/cols   image size in pixels.
//   src_step        row stride of src_data in bytes.
//   tileSize_width, tileSize_height
//                   the passed values are ignored; both are recomputed below.
//   dst_m00..m03    per-tile outputs, already shifted to image coordinates.
//   dst_cols        number of tiles per row of the dst_m** buffers.
//   cn, coi         when coi > 0, channel coi-1 is gathered with a pixel stride
//                   of kcn (2 when cn == 2, otherwise 4).
//   binary          when non-zero, every non-zero pixel counts as 1
//                   (replaced by 255, sums scaled by 1/255).
//   dst_step, type, depth
//                   not used by this kernel.
//
// NOTE(review): tmp holds 16 uchar16 vectors (256 pixels) and m has 128 slots,
// so this kernel presumably expects TILE_SIZE <= 256 and a power of two --
// confirm against the host code.
// NOTE(review): all sums are kept in 32-bit int; for wide tiles the third-order
// terms (x^3 * pixel, y^3 * row sum) look able to exceed INT_MAX.  CvMoments_D2
// and CvMoments_D3 use long for the same quantities -- confirm whether this
// kernel should too.
__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
                           __global double* dst_m00,
                           __global double* dst_m10,
                           __global double* dst_m01,
                           __global double* dst_m20,
                           __global double* dst_m11,
                           __global double* dst_m02,
                           __global double* dst_m30,
                           __global double* dst_m21,
                           __global double* dst_m12,
                           __global double* dst_m03,
                           int dst_cols, int dst_step, int type, int depth, int cn, int coi, int binary, int TILE_SIZE)
{
    uchar tmp_coi[16];  // one vector's worth of channel-of-interest samples
    uchar16 tmp[16];    // this work-item's row of the tile, 16 pixels per vector
    int VLEN_C = 16;    // vector length of uchar

    int wgidy = get_group_id(0);
    int wgidx = get_group_id(1);
    int lidy = get_local_id(0);
    int lidx = get_local_id(1);
    int y = wgidy*TILE_SIZE;  // image row of the tile's first row
    int x = wgidx*TILE_SIZE;  // image column of the tile's first column
    int kcn = (cn==2)?2:4;
    int rstep = min(src_step, TILE_SIZE);
    tileSize_height = min(TILE_SIZE, src_rows - y);
    tileSize_width = min(TILE_SIZE, src_cols - x);

    // Partial-width tile: zero the source row beyond the valid width so the
    // whole-vector loads below pick up zeros there.
    if( tileSize_width < TILE_SIZE )
    {
        for(int i = tileSize_width; i < rstep; i++ )
            *((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0;
    }

    if( coi > 0 ) //channel of interest
    {
        for(int i = 0; i < tileSize_width; i += VLEN_C)
        {
            for(int j=0; j<VLEN_C; j++)
                tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1);
            tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
                                      tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]);
        }
    }
    else
    {
        for(int i=0; i < tileSize_width; i+=VLEN_C)
            tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C);
    }

    uchar16 zero = (uchar16)(0);
    uchar16 full = (uchar16)(255);
    if( binary )
    {
        for(int i=0; i < tileSize_width; i+=VLEN_C)
            tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero;
    }

    double mom[10];
    __local int m[10][128];
    if(lidy == 0)
    {
        for(int i=0; i<10; i++)
            for(int j=0; j<128; j++)
                m[i][j]=0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Row sums of p, x*p, x^2*p and x^3*p, with x counted from the tile's left edge.
    int lm[10] = {0};
    int16 x0 = (int16)(0);
    int16 x1 = (int16)(0);
    int16 x2 = (int16)(0);
    int16 x3 = (int16)(0);
    for( int xt = 0 ; xt < tileSize_width; xt+=(VLEN_C) )
    {
        int16 v_xt = (int16)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7, xt+8, xt+9, xt+10, xt+11, xt+12, xt+13, xt+14, xt+15);
        int16 p = convert_int16(tmp[xt/VLEN_C]);
        int16 xp = v_xt * p, xxp = xp *v_xt;
        x0 += p;
        x1 += xp;
        x2 += xxp;
        x3 += xxp * v_xt;
    }
    // Fold the 16 lanes into lane 0.
    x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7 + x0.s8 + x0.s9 + x0.sa + x0.sb + x0.sc + x0.sd + x0.se + x0.sf;
    x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7 + x1.s8 + x1.s9 + x1.sa + x1.sb + x1.sc + x1.sd + x1.se + x1.sf;
    x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7 + x2.s8 + x2.s9 + x2.sa + x2.sb + x2.sc + x2.sd + x2.se + x2.sf;
    x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7 + x3.s8 + x3.s9 + x3.sa + x3.sb + x3.sc + x3.sd + x3.se + x3.sf;

    int py = lidy * ((int)x0.s0);
    int sy = lidy*lidy;
    int bheight = min(tileSize_height, TILE_SIZE/2);

    // Rows in the upper half of a full-height tile park their values in local
    // memory; rows below bheight keep them in private lm[].  Rows outside the
    // tile contribute nothing (lm[] stays zero).
    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
    {
        m[9][lidy-bheight] = ((int)py) * sy;  // m03
        m[8][lidy-bheight] = ((int)x1.s0) * sy;  // m12
        m[7][lidy-bheight] = ((int)x2.s0) * lidy;  // m21
        m[6][lidy-bheight] = x3.s0;             // m30
        m[5][lidy-bheight] = x0.s0 * sy;        // m02
        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
        m[3][lidy-bheight] = x2.s0;             // m20
        m[2][lidy-bheight] = py;             // m01
        m[1][lidy-bheight] = x1.s0;             // m10
        m[0][lidy-bheight] = x0.s0;             // m00
    }
    else if(lidy < bheight)
    {
        lm[9] = ((int)py) * sy;  // m03
        lm[8] = ((int)x1.s0) * sy;  // m12
        lm[7] = ((int)x2.s0) * lidy;  // m21
        lm[6] = x3.s0;             // m30
        lm[5] = x0.s0 * sy;        // m02
        lm[4] = x1.s0 * lidy;         // m11
        lm[3] = x2.s0;             // m20
        lm[2] = py;             // m01
        lm[1] = x1.s0;             // m10
        lm[0] = x0.s0;             // m00
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Halving reduction over the rows.  It starts at TILE_SIZE/2 rather than at
    // bheight: for a partial-height tile bheight need not be a power of two, and
    // halving an odd count would leave the last row's values unread.
    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
    {
        if(lidy < j)
            for( int i = 0; i < 10; i++ )
                lm[i] = lm[i] + m[i][lidy];
        barrier(CLK_LOCAL_MEM_FENCE);
        if(lidy >= j/2&&lidy < j)
            for( int i = 0; i < 10; i++ )
                m[i][lidy-j/2] = lm[i];
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if(lidy == 0&&lidx == 0)
    {
        for( int mt = 0; mt < 10; mt++ )
            mom[mt] = (double)lm[mt];
        if(binary)
        {
            double s = 1./255;
            for( int mt = 0; mt < 10; mt++ )
                mom[mt] *= s;
        }
        double xm = x * mom[0], ym = y * mom[0];

        // accumulate moments computed in each tile

        // + m00 ( = m00' )
        dst_m00[wgidy*dst_cols+wgidx] = mom[0];

        // + m10 ( = m10' + x*m00' )
        dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm;

        // + m01 ( = m01' + y*m00' )
        dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym;

        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
        dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm);

        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
        dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1];

        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
        dst_m02[wgidy*dst_cols+wgidx] = mom[5] + y * (mom[2] * 2 + ym);

        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
        dst_m30[wgidy*dst_cols+wgidx] = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));

        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
        dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];

        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
        dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];

        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
        dst_m03[wgidy*dst_cols+wgidx] = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
    }
}
//#endif
//#if defined (DOUBLE_SUPPORT)
// dst_sum: adds up the per-tile moments into ten totals.
//
// The number of tiles is derived from the image and tile sizes; work-item gidy
// (global id along dimension 0) copies tile gidy's ten values into local
// memory, unused slots up to 128 are zeroed, and a halving reduction leaves the
// totals in column 0, which work-item 0 writes to sum[0..9] in the order
// m00, m10, m01, m20, m11, m02, m30, m21, m12, m03.
//
//   src_rows/cols   image size in pixels.
//   tile_height/tile_width, TILE_SIZE
//                   used only to compute the tile count.
//   sum             output, ten values.
//   dst_m00..m03    per-tile inputs, read at index gidy.
//
// NOTE(review): the global id is used directly as the local-memory column and
// the reduction covers 128 columns, so this presumably expects a single
// work-group of 128 work-items and at most 128 tiles -- confirm against the
// host launch code.
__kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE, __global double* sum, __global double* dst_m00,
                      __global double* dst_m10,
                      __global double* dst_m01,
                      __global double* dst_m20,
                      __global double* dst_m11,
                      __global double* dst_m02,
                      __global double* dst_m30,
                      __global double* dst_m21,
                      __global double* dst_m12,
                      __global double* dst_m03)
{
    int gidy = get_global_id(0);
    int block_y = src_rows/tile_height;
    int block_x = src_cols/tile_width;
    int block_num;

    // Count the trailing partial tile in each direction.
    if(src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0)
        block_y ++;
    if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0)
        block_x ++;
    block_num = block_y * block_x;

    __local double dst_sum[10][128];

    // Zero the columns that no tile will fill.
    if(gidy<128-block_num)
    {
        for(int i=0; i<10; i++)
            dst_sum[i][gidy+block_num]=0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if(gidy<block_num)
    {
        dst_sum[0][gidy] = dst_m00[gidy];
        dst_sum[1][gidy] = dst_m10[gidy];
        dst_sum[2][gidy] = dst_m01[gidy];
        dst_sum[3][gidy] = dst_m20[gidy];
        dst_sum[4][gidy] = dst_m11[gidy];
        dst_sum[5][gidy] = dst_m02[gidy];
        dst_sum[6][gidy] = dst_m30[gidy];
        dst_sum[7][gidy] = dst_m21[gidy];
        dst_sum[8][gidy] = dst_m12[gidy];
        dst_sum[9][gidy] = dst_m03[gidy];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Halving reduction: column gidy absorbs column gidy+lsize.
    for(int lsize=64; lsize>0; lsize>>=1)
    {
        if(gidy<lsize)
        {
            int lsize2 = gidy + lsize;
            for(int i=0; i<10; i++)
                dst_sum[i][gidy] += dst_sum[i][lsize2];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if(gidy==0)
    {
        for(int i=0; i<10; i++)
            sum[i] = dst_sum[i][0];
    }
}
//#endif
//#if defined (DOUBLE_SUPPORT)
// CvMoments_D2: per-tile raw moments (m00 .. m03) of a 16-bit unsigned image.
//
// Work-group (wgidy, wgidx) handles the tile whose first pixel is at
// (x, y) = (wgidx*TILE_SIZE, wgidy*TILE_SIZE).  The work-item with local id
// lidy (dimension 0) sums row y+lidy of the tile; the rows are then combined
// through local memory and the work-item with lidy == 0 && lidx == 0 writes one
// value per moment at index wgidy*dst_cols+wgidx of each dst_m** buffer.
//
//   src_data        source image; NOTE: it is written to -- for a partial-width
//                   tile (and only when src_cols > TILE_SIZE) the row is
//                   zero-filled from the valid width up to rstep.
//   src_rows/cols   image size in pixels.
//   src_step        row stride of src_data in bytes (halved to index ushorts).
//   tileSize_width, tileSize_height
//                   the passed values are ignored; both are recomputed below.
//   dst_m00..m03    per-tile outputs, already shifted to image coordinates.
//   dst_cols        number of tiles per row of the dst_m** buffers.
//   cn, coi         when coi > 0, channel coi-1 is gathered with a pixel stride
//                   of kcn (2 when cn == 2, otherwise 4).
//   binary          when non-zero, every non-zero pixel counts as 1
//                   (replaced by 255, sums scaled by 1/255).
//   dst_step, type, depth
//                   not used by this kernel.
//
// NOTE(review): tmp holds 32 ushort8 vectors (256 pixels) and m has 128 slots,
// so this kernel presumably expects TILE_SIZE <= 256 and a power of two --
// confirm against the host code.
// NOTE(review): x0, x1, x2 and py are 32-bit int; with 16-bit pixel values the
// x^2*pixel products and the row totals look able to exceed INT_MAX before they
// are widened to long -- confirm.
// NOTE(review): the extra `src_cols > TILE_SIZE` test on the zero-fill is not
// present in the other CvMoments_* kernels of this file -- confirm which form
// is intended.
__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
                           __global double* dst_m00,
                           __global double* dst_m10,
                           __global double* dst_m01,
                           __global double* dst_m20,
                           __global double* dst_m11,
                           __global double* dst_m02,
                           __global double* dst_m30,
                           __global double* dst_m21,
                           __global double* dst_m12,
                           __global double* dst_m03,
                           int dst_cols, int dst_step,
                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
    ushort tmp_coi[8];  // one vector's worth of channel-of-interest samples
    ushort8 tmp[32];    // this work-item's row of the tile, 8 pixels per vector
    int VLEN_US = 8;    // vector length of ushort
    int wgidy = get_group_id(0);
    int wgidx = get_group_id(1);
    int lidy = get_local_id(0);
    int lidx = get_local_id(1);
    int y = wgidy*TILE_SIZE;  // real Y index of pixel
    int x = wgidx*TILE_SIZE;  // real X index of pixel
    int kcn = (cn==2)?2:4;
    int rstep = min(src_step/2, TILE_SIZE);
    tileSize_height = min(TILE_SIZE, src_rows - y);
    tileSize_width = min(TILE_SIZE, src_cols -x);

    // Partial-width tile: zero the source row beyond the valid width.
    if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE)
    {
        for(int i=tileSize_width; i < rstep; i++ )
            *((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0;
    }

    if( coi > 0 )
    {
        for(int i=0; i < tileSize_width; i+=VLEN_US)
        {
            for(int j=0; j<VLEN_US; j++)
                tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1);
            tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
        }
    }
    else
    {
        for(int i=0; i < tileSize_width; i+=VLEN_US)
            tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US);
    }

    ushort8 zero = (ushort8)(0);
    ushort8 full = (ushort8)(255);
    if( binary )
    {
        for(int i=0; i < tileSize_width; i+=VLEN_US)
            tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero;
    }

    double mom[10];
    __local long m[10][128];
    if(lidy == 0)
    {
        for(int i=0; i<10; i++)
            for(int j=0; j<128; j++)
                m[i][j]=0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Row sums of p, x*p, x^2*p and x^3*p, with x counted from the tile's left edge.
    long lm[10] = {0};
    int8 x0 = (int8)(0);
    int8 x1 = (int8)(0);
    int8 x2 = (int8)(0);
    long8 x3 = (long8)(0);
    for( int xt = 0 ; xt < tileSize_width; xt+=(VLEN_US) )
    {
        int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7);
        int8 p = convert_int8(tmp[xt/VLEN_US]);
        int8 xp = v_xt * p, xxp = xp * v_xt;
        x0 += p;
        x1 += xp;
        x2 += xxp;
        x3 += convert_long8(xxp) *convert_long8(v_xt);
    }
    // Fold the 8 lanes into lane 0.
    x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7;
    x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7;
    x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7;
    x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7;

    int py = lidy * x0.s0, sy = lidy*lidy;
    int bheight = min(tileSize_height, TILE_SIZE/2);

    // Rows in the upper half of a full-height tile park their values in local
    // memory; rows below bheight keep them in private lm[].
    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
    {
        m[9][lidy-bheight] = ((long)py) * sy;  // m03
        m[8][lidy-bheight] = ((long)x1.s0) * sy;  // m12
        m[7][lidy-bheight] = ((long)x2.s0) * lidy;  // m21
        m[6][lidy-bheight] = x3.s0;             // m30
        m[5][lidy-bheight] = x0.s0 * sy;        // m02
        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
        m[3][lidy-bheight] = x2.s0;             // m20
        m[2][lidy-bheight] = py;             // m01
        m[1][lidy-bheight] = x1.s0;             // m10
        m[0][lidy-bheight] = x0.s0;             // m00
    }
    else if(lidy < bheight)
    {
        lm[9] = ((long)py) * sy;  // m03
        lm[8] = ((long)x1.s0) * sy;  // m12
        lm[7] = ((long)x2.s0) * lidy;  // m21
        lm[6] = x3.s0;             // m30
        lm[5] = x0.s0 * sy;        // m02
        lm[4] = x1.s0 * lidy;         // m11
        lm[3] = x2.s0;             // m20
        lm[2] = py;             // m01
        lm[1] = x1.s0;             // m10
        lm[0] = x0.s0;             // m00
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Halving reduction over the rows: each pass adds the parked values, then
    // the upper half of the remaining rows parks its running totals.
    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
    {
        if(lidy < j)
            for( int i = 0; i < 10; i++ )
                lm[i] = lm[i] + m[i][lidy];
        barrier(CLK_LOCAL_MEM_FENCE);
        if(lidy >= j/2&&lidy < j)
            for( int i = 0; i < 10; i++ )
                m[i][lidy-j/2] = lm[i];
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if(lidy == 0&&lidx == 0)
    {
        for(int mt = 0; mt < 10; mt++ )
            mom[mt] = (double)lm[mt];

        if(binary)
        {
            double s = 1./255;
            for( int mt = 0; mt < 10; mt++ )
                mom[mt] *= s;
        }

        double xm = x *mom[0], ym = y * mom[0];

        // accumulate moments computed in each tile

        // + m00 ( = m00' )
        dst_m00[wgidy*dst_cols+wgidx] = mom[0];

        // + m10 ( = m10' + x*m00' )
        dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm;

        // + m01 ( = m01' + y*m00' )
        dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym;

        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
        dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm);

        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
        dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1];

        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
        dst_m02[wgidy*dst_cols+wgidx] = mom[5] + y * (mom[2] * 2 + ym);

        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
        dst_m30[wgidy*dst_cols+wgidx] = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));

        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
        dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];

        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
        dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];

        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
        dst_m03[wgidy*dst_cols+wgidx] = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
    }
}
//#endif
//#if defined (DOUBLE_SUPPORT)
// CvMoments_D3: per-tile raw moments (m00 .. m03) of a 16-bit signed image.
//
// Work-group (wgidy, wgidx) handles the tile whose first pixel is at
// (x, y) = (wgidx*TILE_SIZE, wgidy*TILE_SIZE).  The work-item with local id
// lidy (dimension 0) sums row y+lidy of the tile; the rows are then combined
// through local memory and the work-item with lidy == 0 && lidx == 0 writes one
// value per moment at index wgidy*dst_cols+wgidx of each dst_m** buffer.
//
//   src_data        source image; NOTE: it is written to -- for a partial-width
//                   tile the row is zero-filled from the valid width up to rstep.
//   src_rows/cols   image size in pixels.
//   src_step        row stride of src_data in bytes (halved to index shorts).
//   tileSize_width, tileSize_height
//                   the passed values are ignored; both are recomputed below.
//   dst_m00..m03    per-tile outputs, already shifted to image coordinates.
//   dst_cols        number of tiles per row of the dst_m** buffers.
//   cn, coi         when coi > 0, channel coi-1 is gathered with a pixel stride
//                   of kcn (2 when cn == 2, otherwise 4).
//   binary          when non-zero, every non-zero pixel counts as 1
//                   (replaced by 255, sums scaled by 1/255).
//   dst_step, type, depth
//                   not used by this kernel.
//
// NOTE(review): tmp holds 32 short8 vectors (256 pixels) and m has 128 slots,
// so this kernel presumably expects TILE_SIZE <= 256 and a power of two --
// confirm against the host code.
// NOTE(review): x0, x1, x2 and py are 32-bit int; with 16-bit pixel values the
// x^2*pixel products look able to exceed the int range before they are widened
// to long -- confirm.
__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
                           __global double* dst_m00,
                           __global double* dst_m10,
                           __global double* dst_m01,
                           __global double* dst_m20,
                           __global double* dst_m11,
                           __global double* dst_m02,
                           __global double* dst_m30,
                           __global double* dst_m21,
                           __global double* dst_m12,
                           __global double* dst_m03,
                           int dst_cols, int dst_step,
                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
    short tmp_coi[8];  // one vector's worth of channel-of-interest samples
    short8 tmp[32];    // this work-item's row of the tile, 8 pixels per vector
    int VLEN_S =8;     // vector length of short
    int wgidy = get_group_id(0);
    int wgidx = get_group_id(1);
    int lidy = get_local_id(0);
    int lidx = get_local_id(1);
    int y = wgidy*TILE_SIZE;  // real Y index of pixel
    int x = wgidx*TILE_SIZE;  // real X index of pixel
    int kcn = (cn==2)?2:4;
    int rstep = min(src_step/2, TILE_SIZE);
    tileSize_height = min(TILE_SIZE, src_rows - y);
    tileSize_width = min(TILE_SIZE, src_cols -x);

    // Partial-width tile: zero the source row beyond the valid width.
    if(tileSize_width < TILE_SIZE)
    {
        for(int i = tileSize_width; i < rstep; i++ )
            *((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0;
    }

    if( coi > 0 )
    {
        for(int i=0; i < tileSize_width; i+=VLEN_S)
        {
            for(int j=0; j<VLEN_S; j++)
                tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1);
            tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
        }
    }
    else
    {
        for(int i=0; i < tileSize_width; i+=VLEN_S)
            tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S);
    }

    short8 zero = (short8)(0);
    short8 full = (short8)(255);
    if( binary )
    {
        for(int i=0; i < tileSize_width; i+=(VLEN_S))
            tmp[i/VLEN_S] = (tmp[i/VLEN_S]!=zero)?full:zero;
    }

    double mom[10];
    __local long m[10][128];
    if(lidy == 0)
    {
        for(int i=0; i<10; i++)
            for(int j=0; j<128; j++)
                m[i][j]=0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Row sums of p, x*p, x^2*p and x^3*p, with x counted from the tile's left edge.
    long lm[10] = {0};
    int8 x0 = (int8)(0);
    int8 x1 = (int8)(0);
    int8 x2 = (int8)(0);
    long8 x3 = (long8)(0);
    for( int xt = 0 ; xt < tileSize_width; xt+= (VLEN_S))
    {
        int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7);
        int8 p = convert_int8(tmp[xt/VLEN_S]);
        int8 xp = v_xt * p, xxp = xp * v_xt;
        x0 += p;
        x1 += xp;
        x2 += xxp;
        x3 += convert_long8(xxp) * convert_long8(v_xt);
    }
    // Fold the 8 lanes into lane 0.
    x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7;
    x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7;
    x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7;
    x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7;

    int py = lidy * x0.s0, sy = lidy*lidy;
    int bheight = min(tileSize_height, TILE_SIZE/2);

    // Rows in the upper half of a full-height tile park their values in local
    // memory; rows below bheight keep them in private lm[].
    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
    {
        m[9][lidy-bheight] = ((long)py) * sy;  // m03
        m[8][lidy-bheight] = ((long)x1.s0) * sy;  // m12
        m[7][lidy-bheight] = ((long)x2.s0) * lidy;  // m21
        m[6][lidy-bheight] = x3.s0;             // m30
        m[5][lidy-bheight] = x0.s0 * sy;        // m02
        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
        m[3][lidy-bheight] = x2.s0;             // m20
        m[2][lidy-bheight] = py;             // m01
        m[1][lidy-bheight] = x1.s0;             // m10
        m[0][lidy-bheight] = x0.s0;             // m00
    }
    else if(lidy < bheight)
    {
        lm[9] = ((long)py) * sy;  // m03
        lm[8] = ((long)(x1.s0)) * sy;  // m12
        lm[7] = ((long)(x2.s0)) * lidy;  // m21
        lm[6] = x3.s0;             // m30
        lm[5] = x0.s0 * sy;        // m02
        lm[4] = x1.s0 * lidy;         // m11
        lm[3] = x2.s0;             // m20
        lm[2] = py;             // m01
        lm[1] = x1.s0;             // m10
        lm[0] = x0.s0;             // m00
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Halving reduction over the rows: each pass adds the parked values, then
    // the upper half of the remaining rows parks its running totals.
    for( int j = TILE_SIZE/2; j >=1; j = j/2 )
    {
        if(lidy < j)
            for( int i = 0; i < 10; i++ )
                lm[i] = lm[i] + m[i][lidy];
        barrier(CLK_LOCAL_MEM_FENCE);
        if(lidy >= j/2&&lidy < j)
            for( int i = 0; i < 10; i++ )
                m[i][lidy-j/2] = lm[i];
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if(lidy ==0 &&lidx ==0)
    {
        for(int mt = 0; mt < 10; mt++ )
            mom[mt] = (double)lm[mt];

        if(binary)
        {
            double s = 1./255;
            for( int mt = 0; mt < 10; mt++ )
                mom[mt] *= s;
        }

        double xm = x * mom[0], ym = y*mom[0];

        // accumulate moments computed in each tile

        // + m00 ( = m00' )
        dst_m00[wgidy*dst_cols+wgidx] = mom[0];

        // + m10 ( = m10' + x*m00' )
        dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm;

        // + m01 ( = m01' + y*m00' )
        dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym;

        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
        dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm);

        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
        dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1];

        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
        dst_m02[wgidy*dst_cols+wgidx] = mom[5] + y * (mom[2] * 2 + ym);

        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
        dst_m30[wgidy*dst_cols+wgidx] = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));

        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
        dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];

        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
        dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];

        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
        dst_m03[wgidy*dst_cols+wgidx] = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
    }
}
//#endif
//#if defined (DOUBLE_SUPPORT)
// CvMoments_D5: per-tile raw moments (m00 .. m03) of a 32-bit float image.
//
// Work-group (wgidy, wgidx) handles the tile whose first pixel is at
// (x, y) = (wgidx*TILE_SIZE, wgidy*TILE_SIZE).  The work-item with local id
// lidy (dimension 0) sums row y+lidy of the tile; the rows are then combined
// through local memory and the work-item with lidy == 0 && lidx == 0 writes one
// value per moment at index wgidy*dst_cols+wgidx of each dst_m** buffer.
//
// The row-combining and output stage below is active code: while it is
// commented out the kernel computes the row sums but never stores anything to
// dst_m00 .. dst_m03.
// NOTE(review): the reason that stage had been disabled is not visible in this
// file -- confirm it builds and runs on the targeted devices.
//
//   src_data        source image; NOTE: it is written to -- for a partial-width
//                   tile the row is zero-filled from the valid width up to rstep.
//   src_rows/cols   image size in pixels.
//   src_step        row stride of src_data in bytes (divided by 4 to index floats).
//   tileSize_width, tileSize_height
//                   the passed values are ignored; both are recomputed below.
//   dst_m00..m03    per-tile outputs, already shifted to image coordinates.
//   dst_cols        number of tiles per row of the dst_m** buffers.
//   cn, coi         when coi > 0, channel coi-1 is gathered with a pixel stride
//                   of kcn (2 when cn == 2, otherwise 4).
//   binary          when non-zero, every non-zero pixel counts as 1
//                   (replaced by 255, sums scaled by 1/255).
//   dst_step, type, depth
//                   not used by this kernel.
//
// NOTE(review): tmp holds 64 float4 vectors (256 pixels) and m has 128 slots,
// so this kernel presumably expects TILE_SIZE <= 256 and a power of two --
// confirm against the host code.
__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
                            __global double* dst_m00,
                            __global double* dst_m10,
                            __global double* dst_m01,
                            __global double* dst_m20,
                            __global double* dst_m11,
                            __global double* dst_m02,
                            __global double* dst_m30,
                            __global double* dst_m21,
                            __global double* dst_m12,
                            __global double* dst_m03,
                            int dst_cols, int dst_step,
                            int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
    float tmp_coi[4];  // one vector's worth of channel-of-interest samples
    float4 tmp[64] ;   // this work-item's row of the tile, 4 pixels per vector
    int VLEN_F = 4;    // vector length of float
    int wgidy = get_group_id(0);
    int wgidx = get_group_id(1);
    int lidy = get_local_id(0);
    int lidx = get_local_id(1);
    int y = wgidy*TILE_SIZE;  // real Y index of pixel
    int x = wgidx*TILE_SIZE;  // real X index of pixel
    int kcn = (cn==2)?2:4;
    int rstep = min(src_step/4, TILE_SIZE);
    tileSize_height = min(TILE_SIZE, src_rows - y);
    tileSize_width = min(TILE_SIZE, src_cols -x);

    // Partial-width tile: zero the source row beyond the valid width.
    if(tileSize_width < TILE_SIZE)
    {
        for(int i = tileSize_width; i < rstep; i++ )
            *((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0;
    }

    if( coi > 0 )
    {
        for(int i=0; i < tileSize_width; i+=VLEN_F)
        {
            for(int j=0; j<4; j++)
                tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1);
            tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
        }
    }
    else
    {
        for(int i=0; i < tileSize_width; i+=VLEN_F)
            tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3));
    }

    float4 zero = (float4)(0);
    float4 full = (float4)(255);
    if( binary )
    {
        for(int i=0; i < tileSize_width; i+=4)
            tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero;
    }

    double mom[10];
    __local double m[10][128];
    if(lidy == 0)
    {
        for(int i = 0; i < 10; i ++)
            for(int j = 0; j < 128; j ++)
                m[i][j] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Row sums of p, x*p, x^2*p and x^3*p, with x counted from the tile's left edge.
    double lm[10] = {0};
    double4 x0 = (double4)(0);
    double4 x1 = (double4)(0);
    double4 x2 = (double4)(0);
    double4 x3 = (double4)(0);
    for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_F )
    {
        double4 v_xt = (double4)(xt, xt+1, xt+2, xt+3);
        double4 p = convert_double4(tmp[xt/VLEN_F]);
        double4 xp = v_xt * p, xxp = xp * v_xt;
        x0 += p;
        x1 += xp;
        x2 += xxp;
        x3 += xxp * v_xt;
    }
    // Fold the 4 lanes into lane 0.
    x0.s0 += x0.s1 + x0.s2 + x0.s3;
    x1.s0 += x1.s1 + x1.s2 + x1.s3;
    x2.s0 += x2.s1 + x2.s2 + x2.s3;
    x3.s0 += x3.s1 + x3.s2 + x3.s3;

    double py = lidy * x0.s0, sy = lidy*lidy;
    int bheight = min(tileSize_height, TILE_SIZE/2);

    // Rows in the upper half of a full-height tile park their values in local
    // memory; rows below bheight keep them in private lm[].
    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
    {
        m[9][lidy-bheight] = ((double)py) * sy;  // m03
        m[8][lidy-bheight] = ((double)x1.s0) * sy;  // m12
        m[7][lidy-bheight] = ((double)x2.s0) * lidy;  // m21
        m[6][lidy-bheight] = x3.s0;             // m30
        m[5][lidy-bheight] = x0.s0 * sy;        // m02
        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
        m[3][lidy-bheight] = x2.s0;             // m20
        m[2][lidy-bheight] = py;             // m01
        m[1][lidy-bheight] = x1.s0;             // m10
        m[0][lidy-bheight] = x0.s0;             // m00
    }
    else if(lidy < bheight)
    {
        lm[9] = ((double)py) * sy;  // m03
        lm[8] = ((double)x1.s0) * sy;  // m12
        lm[7] = ((double)x2.s0) * lidy;  // m21
        lm[6] = x3.s0;             // m30
        lm[5] = x0.s0 * sy;        // m02
        lm[4] = x1.s0 * lidy;         // m11
        lm[3] = x2.s0;             // m20
        lm[2] = py;             // m01
        lm[1] = x1.s0;             // m10
        lm[0] = x0.s0;             // m00
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Halving reduction over the rows: each pass adds the parked values, then
    // the upper half of the remaining rows parks its running totals.
    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
    {
        if(lidy < j)
            for( int i = 0; i < 10; i++ )
                lm[i] = lm[i] + m[i][lidy];
        barrier(CLK_LOCAL_MEM_FENCE);
        if(lidy >= j/2&&lidy < j)
            for( int i = 0; i < 10; i++ )
                m[i][lidy-j/2] = lm[i];
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if(lidy == 0&&lidx == 0)
    {
        for(int mt = 0; mt < 10; mt++ )
            mom[mt] = (double)lm[mt];

        if(binary)
        {
            double s = 1./255;
            for( int mt = 0; mt < 10; mt++ )
                mom[mt] *= s;
        }

        double xm = x * mom[0], ym = y * mom[0];

        // accumulate moments computed in each tile

        // + m00 ( = m00' )
        dst_m00[wgidy*dst_cols+wgidx]= mom[0];

        // + m10 ( = m10' + x*m00' )
        dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm;

        // + m01 ( = m01' + y*m00' )
        dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym;

        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
        dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm);

        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
        dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1];

        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
        dst_m02[wgidy*dst_cols+wgidx]= mom[5] + y * (mom[2] * 2 + ym);

        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
        dst_m30[wgidy*dst_cols+wgidx]= mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));

        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
        dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];

        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
        dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];

        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
        dst_m03[wgidy*dst_cols+wgidx]= mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
    }
}
//#endif
//#if defined (DOUBLE_SUPPORT)
// CvMoments_D6: per-tile raw moments (m00 .. m03) of a 64-bit float image.
//
// Work-group (wgidy, wgidx) handles the tile whose first pixel is at
// (x, y) = (wgidx*TILE_SIZE, wgidy*TILE_SIZE).  The work-item with local id
// lidy (dimension 0) sums row y+lidy of the tile; the rows are then combined
// through local memory and the work-item with lidy == 0 && lidx == 0 writes one
// value per moment at index wgidy*dst_cols+wgidx of each dst_m** buffer.
//
//   src_data        source image; NOTE: it is written to -- for a partial-width
//                   tile the row is zero-filled from the valid width up to rstep.
//   src_rows/cols   image size in pixels.
//   src_step        row stride of src_data in bytes (divided by 8 to index doubles).
//   tileSize_width, tileSize_height
//                   the passed values are ignored; both are recomputed below.
//   dst_m00..m03    per-tile outputs, already shifted to image coordinates.
//   dst_cols        number of tiles per row of the dst_m** buffers.
//   cn, coi         when coi > 0, channel coi-1 is gathered with a pixel stride
//                   of kcn (2 when cn == 2, otherwise 4).
//   binary          when non-zero, every non-zero pixel counts as 1
//                   (replaced by 255, sums scaled by 1/255).
//   dst_step, type, depth
//                   not used by this kernel.
//
// NOTE(review): tmp holds 64 double4 vectors (256 pixels) and m has 128 slots,
// so this kernel presumably expects TILE_SIZE <= 256 and a power of two --
// confirm against the host code.
__kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
                           __global double* dst_m00,
                           __global double* dst_m10,
                           __global double* dst_m01,
                           __global double* dst_m20,
                           __global double* dst_m11,
                           __global double* dst_m02,
                           __global double* dst_m30,
                           __global double* dst_m21,
                           __global double* dst_m12,
                           __global double* dst_m03,
                           int dst_cols, int dst_step,
                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
    double tmp_coi[4];  // one vector's worth of channel-of-interest samples
    double4 tmp[64];    // this work-item's row of the tile, 4 pixels per vector
    int VLEN_D = 4;     // length of vector
    int wgidy = get_group_id(0);
    int wgidx = get_group_id(1);
    int lidy = get_local_id(0);
    int lidx = get_local_id(1);
    int y = wgidy*TILE_SIZE;  // real Y index of pixel
    int x = wgidx*TILE_SIZE;  // real X index of pixel
    int kcn = (cn==2)?2:4;
    int rstep = min(src_step/8, TILE_SIZE);
    tileSize_height = min(TILE_SIZE,  src_rows - y);
    tileSize_width = min(TILE_SIZE, src_cols - x);

    // Partial-width tile: zero the source row beyond the valid width.
    if(tileSize_width < TILE_SIZE)
    {
        for(int i = tileSize_width; i < rstep; i++ )
            *((__global double*)src_data+(y+lidy)*src_step/8+x+i) = 0;
    }

    if( coi > 0 )
    {
        for(int i=0; i < tileSize_width; i+=VLEN_D)
        {
            for(int j=0; j<4; j++)
                tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
            tmp[i/VLEN_D] = (double4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
        }
    }
    else
    {
        for(int i=0; i < tileSize_width; i+=VLEN_D)
            tmp[i/VLEN_D] = (double4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
    }

    double4 zero = (double4)(0);
    double4 full = (double4)(255);
    if( binary )
    {
        for(int i=0; i < tileSize_width; i+=VLEN_D)
            tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
    }

    double mom[10];
    __local double m[10][128];
    if(lidy == 0)
    {
        for(int i=0; i<10; i++)
            for(int j=0; j<128; j++)
                m[i][j]=0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Row sums of p, x*p, x^2*p and x^3*p, with x counted from the tile's left edge.
    double lm[10] = {0};
    double4 x0 = (double4)(0);
    double4 x1 = (double4)(0);
    double4 x2 = (double4)(0);
    double4 x3 = (double4)(0);
    for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D )
    {
        double4 v_xt = (double4)(xt, xt+1, xt+2, xt+3);
        double4 p = tmp[xt/VLEN_D];
        double4 xp = v_xt * p, xxp = xp * v_xt;
        x0 += p;
        x1 += xp;
        x2 += xxp;
        x3 += xxp *v_xt;
    }
    // Fold the 4 lanes into lane 0.
    x0.s0 += x0.s1 + x0.s2 + x0.s3;
    x1.s0 += x1.s1 + x1.s2 + x1.s3;
    x2.s0 += x2.s1 + x2.s2 + x2.s3;
    x3.s0 += x3.s1 + x3.s2 + x3.s3;

    double py = lidy * x0.s0, sy = lidy*lidy;
    int bheight = min(tileSize_height, TILE_SIZE/2);

    // Rows in the upper half of a full-height tile park their values in local
    // memory; rows below bheight keep them in private lm[].
    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
    {
        m[9][lidy-bheight] = ((double)py) * sy;  // m03
        m[8][lidy-bheight] = ((double)x1.s0) * sy;  // m12
        m[7][lidy-bheight] = ((double)x2.s0) * lidy;  // m21
        m[6][lidy-bheight] = x3.s0;             // m30
        m[5][lidy-bheight] = x0.s0 * sy;        // m02
        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
        m[3][lidy-bheight] = x2.s0;             // m20
        m[2][lidy-bheight] = py;             // m01
        m[1][lidy-bheight] = x1.s0;             // m10
        m[0][lidy-bheight] = x0.s0;             // m00
    }
    else if(lidy < bheight)
    {
        lm[9] = ((double)py) * sy;  // m03
        lm[8] = ((double)x1.s0) * sy;  // m12
        lm[7] = ((double)x2.s0) * lidy;  // m21
        lm[6] = x3.s0;             // m30
        lm[5] = x0.s0 * sy;        // m02
        lm[4] = x1.s0 * lidy;         // m11
        lm[3] = x2.s0;             // m20
        lm[2] = py;             // m01
        lm[1] = x1.s0;             // m10
        lm[0] = x0.s0;             // m00
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Halving reduction over the rows: each pass adds the parked values, then
    // the upper half of the remaining rows parks its running totals.
    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
    {
        if(lidy < j)
            for( int i = 0; i < 10; i++ )
                lm[i] = lm[i] + m[i][lidy];
        barrier(CLK_LOCAL_MEM_FENCE);
        if(lidy >= j/2&&lidy < j)
            for( int i = 0; i < 10; i++ )
                m[i][lidy-j/2] = lm[i];
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if(lidy == 0&&lidx == 0)
    {
        for( int mt = 0; mt < 10; mt++ )
            mom[mt] = (double)lm[mt];
        if(binary)
        {
            double s = 1./255;
            for( int mt = 0; mt < 10; mt++ )
                mom[mt] *= s;
        }

        double xm = x * mom[0], ym = y * mom[0];

        // accumulate moments computed in each tile

        // + m00 ( = m00' )
        dst_m00[wgidy*dst_cols+wgidx] = mom[0];

        // + m10 ( = m10' + x*m00' )
        dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm;

        // + m01 ( = m01' + y*m00' )
        dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym;

        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
        dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm);

        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
        dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1];

        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
        dst_m02[wgidy*dst_cols+wgidx] = mom[5] + y * (mom[2] * 2 + ym);

        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
        dst_m30[wgidy*dst_cols+wgidx] = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));

        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
        dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];

        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
        dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];

        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
        dst_m03[wgidy*dst_cols+wgidx] = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
    }
}
//#endif