fix moments

This commit is contained in:
yao 2013-06-25 16:26:33 +08:00
parent dbdeff2069
commit 1227e00f3d
3 changed files with 290 additions and 297 deletions

View File

@ -16,7 +16,7 @@
// Third party copyrights are property of their respective owners.
//
// @Authors
// Sen Liu, sen@multicorewareinc.com
// Sen Liu, swjtuls1987@126.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
@ -277,8 +277,8 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
blocky = size.height/TILE_SIZE;
else
blocky = size.height/TILE_SIZE + 1;
cv::ocl::oclMat dst_m(blocky * 10, blockx, CV_64FC1);
cl_mem sum = openCLCreateBuffer(src.clCxt,CL_MEM_READ_WRITE,10*sizeof(double));
oclMat dst_m(blocky * 10, blockx, CV_64FC1);
oclMat sum(1, 10, CV_64FC1);
int tile_width = std::min(size.width,TILE_SIZE);
int tile_height = std::min(size.height,TILE_SIZE);
size_t localThreads[3] = { tile_height, 1, 1};
@ -288,19 +288,16 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.width ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.height ));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.cols ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&blocky ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&type ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&depth ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&cn ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&coi ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&binary ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
openCLExecuteKernel(dst_m.clCxt, &moments, "CvMoments", globalThreads, localThreads, args, -1, depth);
openCLExecuteKernel(Context::getContext(), &moments, "CvMoments", globalThreads, localThreads, args, -1, depth);
size_t localThreadss[3] = { 128, 1, 1};
size_t globalThreadss[3] = { 128, 1, 1};
@ -309,25 +306,23 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_height ));
args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_width ));
args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&sum ));
args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&sum.data ));
args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
openCLExecuteKernel(dst_m.clCxt, &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1);
double* dstsum = new double[10];
memset(dstsum,0,10*sizeof(double));
openCLReadBuffer(dst_m.clCxt,sum,(void *)dstsum,10*sizeof(double));
mom->m00 = dstsum[0];
mom->m10 = dstsum[1];
mom->m01 = dstsum[2];
mom->m20 = dstsum[3];
mom->m11 = dstsum[4];
mom->m02 = dstsum[5];
mom->m30 = dstsum[6];
mom->m21 = dstsum[7];
mom->m12 = dstsum[8];
mom->m03 = dstsum[9];
delete [] dstsum;
openCLSafeCall(clReleaseMemObject(sum));
openCLExecuteKernel(Context::getContext(), &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1);
Mat dstsum(sum);
mom->m00 = dstsum.at<double>(0, 0);
mom->m10 = dstsum.at<double>(0, 1);
mom->m01 = dstsum.at<double>(0, 2);
mom->m20 = dstsum.at<double>(0, 3);
mom->m11 = dstsum.at<double>(0, 4);
mom->m02 = dstsum.at<double>(0, 5);
mom->m30 = dstsum.at<double>(0, 6);
mom->m21 = dstsum.at<double>(0, 7);
mom->m12 = dstsum.at<double>(0, 8);
mom->m03 = dstsum.at<double>(0, 9);
icvCompleteMomentState( mom );
}

View File

@ -173,10 +173,10 @@ __kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_widt
sum[i] = dst_sum[i][0];
}
__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step,
__global F* dst_m,
int dst_cols, int dst_step, int blocky,
int type, int depth, int cn, int coi, int binary, int TILE_SIZE)
int depth, int cn, int coi, int binary, int TILE_SIZE)
{
uchar tmp_coi[16]; // get the coi data
uchar16 tmp[16];
@ -192,35 +192,43 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
int x = wgidx*TILE_SIZE; // vector length of uchar
int kcn = (cn==2)?2:4;
int rstep = min(src_step, TILE_SIZE);
tileSize_height = min(TILE_SIZE, src_rows - y);
tileSize_width = min(TILE_SIZE, src_cols - x);
int tileSize_height = min(TILE_SIZE, src_rows - y);
int tileSize_width = min(TILE_SIZE, src_cols - x);
if ( y+lidy < src_rows )
{
if( tileSize_width < TILE_SIZE )
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
*((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0;
if( coi > 0 ) //channel of interest
for(int i = 0; i < tileSize_width; i += VLEN_C)
{
for(int j=0; j<VLEN_C; j++)
tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1);
tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]);
}
else
for(int i=0; i < tileSize_width; i+=VLEN_C)
tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C);
}
if( tileSize_width < TILE_SIZE )
for(int i = tileSize_width; i < rstep; i++ )
*((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0;
if( coi > 0 ) //channel of interest
for(int i = 0; i < tileSize_width; i += VLEN_C)
{
for(int j=0; j<VLEN_C; j++)
tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1);
tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]);
}
else
for(int i=0; i < tileSize_width; i+=VLEN_C)
tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C);
uchar16 zero = (uchar16)(0);
uchar16 full = (uchar16)(255);
if( binary )
for(int i=0; i < tileSize_width; i+=VLEN_C)
tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero;
F mom[10];
__local int m[10][128];
if(lidy == 0)
if(lidy < 128)
{
for(int i=0; i<10; i++)
for(int j=0; j<128; j++)
m[i][j]=0;
m[i][lidy]=0;
}
barrier(CLK_LOCAL_MEM_FENCE);
int lm[10] = {0};
int16 x0 = (int16)(0);
int16 x1 = (int16)(0);
@ -281,6 +289,7 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
m[i][lidy-j/2] = lm[i];
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lidy == 0&&lidx == 0)
{
for( int mt = 0; mt < 10; mt++ )
@ -328,10 +337,10 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
}
}
__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step,
__global F* dst_m,
int dst_cols, int dst_step, int blocky,
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
ushort tmp_coi[8]; // get the coi data
ushort8 tmp[32];
@ -346,21 +355,26 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
int x = wgidx*TILE_SIZE; // real X index of pixel
int kcn = (cn==2)?2:4;
int rstep = min(src_step/2, TILE_SIZE);
tileSize_height = min(TILE_SIZE, src_rows - y);
tileSize_width = min(TILE_SIZE, src_cols -x);
if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE)
for(int i=tileSize_width; i < rstep; i++ )
*((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0;
if( coi > 0 )
for(int i=0; i < tileSize_width; i+=VLEN_US)
{
for(int j=0; j<VLEN_US; j++)
tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1);
tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
}
else
for(int i=0; i < tileSize_width; i+=VLEN_US)
tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US);
int tileSize_height = min(TILE_SIZE, src_rows - y);
int tileSize_width = min(TILE_SIZE, src_cols -x);
if ( y+lidy < src_rows )
{
if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE)
for(int i=tileSize_width; i < rstep && (x+i) < src_cols; i++ )
*((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0;
if( coi > 0 )
for(int i=0; i < tileSize_width; i+=VLEN_US)
{
for(int j=0; j<VLEN_US; j++)
tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1);
tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
}
else
for(int i=0; i < tileSize_width; i+=VLEN_US)
tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US);
}
ushort8 zero = (ushort8)(0);
ushort8 full = (ushort8)(255);
if( binary )
@ -368,11 +382,11 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero;
F mom[10];
__local long m[10][128];
if(lidy == 0)
if(lidy < 128)
for(int i=0; i<10; i++)
for(int j=0; j<128; j++)
m[i][j]=0;
m[i][lidy]=0;
barrier(CLK_LOCAL_MEM_FENCE);
long lm[10] = {0};
int8 x0 = (int8)(0);
int8 x1 = (int8)(0);
@ -422,17 +436,22 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
lm[0] = x0.s0; // m00
}
barrier(CLK_LOCAL_MEM_FENCE);
for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
{
if(lidy < j)
for( int i = 0; i < 10; i++ )
lm[i] = lm[i] + m[i][lidy];
barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
{
if(lidy >= j/2&&lidy < j)
for( int i = 0; i < 10; i++ )
m[i][lidy-j/2] = lm[i];
barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lidy == 0&&lidx == 0)
{
for(int mt = 0; mt < 10; mt++ )
@ -482,10 +501,10 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
}
}
__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step,
__global F* dst_m,
int dst_cols, int dst_step, int blocky,
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
short tmp_coi[8]; // get the coi data
short8 tmp[32];
@ -500,21 +519,26 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
int x = wgidx*TILE_SIZE; // real X index of pixel
int kcn = (cn==2)?2:4;
int rstep = min(src_step/2, TILE_SIZE);
tileSize_height = min(TILE_SIZE, src_rows - y);
tileSize_width = min(TILE_SIZE, src_cols -x);
if(tileSize_width < TILE_SIZE)
for(int i = tileSize_width; i < rstep; i++ )
*((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0;
if( coi > 0 )
for(int i=0; i < tileSize_width; i+=VLEN_S)
{
for(int j=0; j<VLEN_S; j++)
tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1);
tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
}
else
for(int i=0; i < tileSize_width; i+=VLEN_S)
tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S);
int tileSize_height = min(TILE_SIZE, src_rows - y);
int tileSize_width = min(TILE_SIZE, src_cols -x);
if ( y+lidy < src_rows )
{
if(tileSize_width < TILE_SIZE)
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
*((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0;
if( coi > 0 )
for(int i=0; i < tileSize_width; i+=VLEN_S)
{
for(int j=0; j<VLEN_S; j++)
tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1);
tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
}
else
for(int i=0; i < tileSize_width; i+=VLEN_S)
tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S);
}
short8 zero = (short8)(0);
short8 full = (short8)(255);
if( binary )
@ -523,10 +547,9 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
F mom[10];
__local long m[10][128];
if(lidy == 0)
if(lidy < 128)
for(int i=0; i<10; i++)
for(int j=0; j<128; j++)
m[i][j]=0;
m[i][lidy]=0;
barrier(CLK_LOCAL_MEM_FENCE);
long lm[10] = {0};
int8 x0 = (int8)(0);
@ -637,10 +660,10 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
}
}
__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step,
__global F* dst_m,
int dst_cols, int dst_step, int blocky,
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
float tmp_coi[4]; // get the coi data
float4 tmp[64] ;
@ -654,33 +677,30 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
int y = wgidy*TILE_SIZE; // real Y index of pixel
int x = wgidx*TILE_SIZE; // real X index of pixel
int kcn = (cn==2)?2:4;
src_step /= sizeof(*src_data);
int rstep = min(src_step, TILE_SIZE);
tileSize_height = min(TILE_SIZE, src_rows - y);
tileSize_width = min(TILE_SIZE, src_cols -x);
int rstep = min(src_step/4, TILE_SIZE);
int tileSize_height = min(TILE_SIZE, src_rows - y);
int tileSize_width = min(TILE_SIZE, src_cols -x);
int maxIdx = mul24(src_rows, src_cols);
int yOff = (y+lidy)*src_step;
int index;
if(tileSize_width < TILE_SIZE && yOff < src_rows)
for(int i = tileSize_width; i < rstep && (yOff+x+i) < maxIdx; i++ )
*(src_data+yOff+x+i) = 0;
if( coi > 0 )
for(int i=0; i < tileSize_width; i+=VLEN_F)
{
#pragma unroll
for(int j=0; j<4; j++)
if ( y+lidy < src_rows )
{
if(tileSize_width < TILE_SIZE)
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
*((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0;
if( coi > 0 )
for(int i=0; i < tileSize_width; i+=VLEN_F)
{
index = yOff+(x+i+j)*kcn+coi-1;
if (index < maxIdx)
tmp_coi[j] = *(src_data+index);
else
tmp_coi[j] = 0;
for(int j=0; j<4; j++)
tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1);
tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
}
tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
}
else
for(int i=0; i < tileSize_width && (yOff+x+i) < maxIdx; i+=VLEN_F)
tmp[i/VLEN_F] = (*(__global float4 *)(src_data+yOff+x+i));
else
for(int i=0; i < tileSize_width; i+=VLEN_F)
tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3));
}
float4 zero = (float4)(0);
float4 full = (float4)(255);
if( binary )
@ -688,10 +708,9 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero;
F mom[10];
__local F m[10][128];
if(lidy == 0)
if(lidy < 128)
for(int i = 0; i < 10; i ++)
for(int j = 0; j < 128; j ++)
m[i][j] = 0;
m[i][lidy] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
F lm[10] = {0};
F4 x0 = (F4)(0);
@ -729,185 +748,6 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
m[0][lidy-bheight] = x0.s0; // m00
}
else if(lidy < bheight)
{
lm[9] = ((F)py) * sy; // m03
lm[8] = ((F)x1.s0) * sy; // m12
lm[7] = ((F)x2.s0) * lidy; // m21
lm[6] = x3.s0; // m30
lm[5] = x0.s0 * sy; // m02
lm[4] = x1.s0 * lidy; // m11
lm[3] = x2.s0; // m20
lm[2] = py; // m01
lm[1] = x1.s0; // m10
lm[0] = x0.s0; // m00
}
barrier(CLK_LOCAL_MEM_FENCE);
for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
{
if(lidy < j)
for( int i = 0; i < 10; i++ )
lm[i] = lm[i] + m[i][lidy];
barrier(CLK_LOCAL_MEM_FENCE);
if(lidy >= j/2&&lidy < j)
for( int i = 0; i < 10; i++ )
m[i][lidy-j/2] = lm[i];
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lidy == 0&&lidx == 0)
{
for( int mt = 0; mt < 10; mt++ )
mom[mt] = (F)lm[mt];
if(binary)
{
F s = 1./255;
for( int mt = 0; mt < 10; mt++ )
mom[mt] *= s;
}
F xm = x * mom[0], ym = y * mom[0];
// accumulate moments computed in each tile
dst_step /= sizeof(F);
int dst_x_off = mad24(wgidy, dst_cols, wgidx);
int dst_off = 0;
int max_dst_index = 10 * blocky * get_global_size(1);
// + m00 ( = m00' )
dst_off = mad24(DST_ROW_00 * blocky, dst_step, dst_x_off);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[0];
// + m10 ( = m10' + x*m00' )
dst_off = mad24(DST_ROW_10 * blocky, dst_step, dst_x_off);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[1] + xm;
// + m01 ( = m01' + y*m00' )
dst_off = mad24(DST_ROW_01 * blocky, dst_step, dst_x_off);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[2] + ym;
// + m20 ( = m20' + 2*x*m10' + x*x*m00' )
dst_off = mad24(DST_ROW_20 * blocky, dst_step, dst_x_off);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[3] + x * (mom[1] * 2 + xm);
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
dst_off = mad24(DST_ROW_11 * blocky, dst_step, dst_x_off);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[4] + x * (mom[2] + ym) + y * mom[1];
// + m02 ( = m02' + 2*y*m01' + y*y*m00' )
dst_off = mad24(DST_ROW_02 * blocky, dst_step, dst_x_off);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[5] + y * (mom[2] * 2 + ym);
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
dst_off = mad24(DST_ROW_30 * blocky, dst_step, dst_x_off);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
dst_off = mad24(DST_ROW_21 * blocky, dst_step, dst_x_off);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
dst_off = mad24(DST_ROW_12 * blocky, dst_step, dst_x_off);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
dst_off = mad24(DST_ROW_03 * blocky, dst_step, dst_x_off);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
}
}
__kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
__global F* dst_m,
int dst_cols, int dst_step, int blocky,
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
F tmp_coi[4]; // get the coi data
F4 tmp[64];
int VLEN_D = 4; // length of vetor
int gidy = get_global_id(0);
int gidx = get_global_id(1);
int wgidy = get_group_id(0);
int wgidx = get_group_id(1);
int lidy = get_local_id(0);
int lidx = get_local_id(1);
int y = wgidy*TILE_SIZE; // real Y index of pixel
int x = wgidx*TILE_SIZE; // real X index of pixel
int kcn = (cn==2)?2:4;
int rstep = min(src_step/8, TILE_SIZE);
tileSize_height = min(TILE_SIZE, src_rows - y);
tileSize_width = min(TILE_SIZE, src_cols - x);
if(tileSize_width < TILE_SIZE)
for(int i = tileSize_width; i < rstep; i++ )
*((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0;
if( coi > 0 )
for(int i=0; i < tileSize_width; i+=VLEN_D)
{
for(int j=0; j<4; j++)
tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
}
else
for(int i=0; i < tileSize_width; i+=VLEN_D)
tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
F4 zero = (F4)(0);
F4 full = (F4)(255);
if( binary )
for(int i=0; i < tileSize_width; i+=VLEN_D)
tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
F mom[10];
__local F m[10][128];
if(lidy == 0)
for(int i=0; i<10; i++)
for(int j=0; j<128; j++)
m[i][j]=0;
barrier(CLK_LOCAL_MEM_FENCE);
F lm[10] = {0};
F4 x0 = (F4)(0);
F4 x1 = (F4)(0);
F4 x2 = (F4)(0);
F4 x3 = (F4)(0);
for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D )
{
F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
F4 p = tmp[xt/VLEN_D];
F4 xp = v_xt * p, xxp = xp * v_xt;
x0 += p;
x1 += xp;
x2 += xxp;
x3 += xxp *v_xt;
}
x0.s0 += x0.s1 + x0.s2 + x0.s3;
x1.s0 += x1.s1 + x1.s2 + x1.s3;
x2.s0 += x2.s1 + x2.s2 + x2.s3;
x3.s0 += x3.s1 + x3.s2 + x3.s3;
F py = lidy * x0.s0, sy = lidy*lidy;
int bheight = min(tileSize_height, TILE_SIZE/2);
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
{
m[9][lidy-bheight] = ((F)py) * sy; // m03
m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12
m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21
m[6][lidy-bheight] = x3.s0; // m30
m[5][lidy-bheight] = x0.s0 * sy; // m02
m[4][lidy-bheight] = x1.s0 * lidy; // m11
m[3][lidy-bheight] = x2.s0; // m20
m[2][lidy-bheight] = py; // m01
m[1][lidy-bheight] = x1.s0; // m10
m[0][lidy-bheight] = x0.s0; // m00
}
else if(lidy < bheight)
{
lm[9] = ((F)py) * sy; // m03
@ -922,6 +762,164 @@ __kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, in
lm[0] = x0.s0; // m00
}
barrier(CLK_LOCAL_MEM_FENCE);
for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
{
if(lidy < j)
for( int i = 0; i < 10; i++ )
lm[i] = lm[i] + m[i][lidy];
barrier(CLK_LOCAL_MEM_FENCE);
if(lidy >= j/2&&lidy < j)
for( int i = 0; i < 10; i++ )
m[i][lidy-j/2] = lm[i];
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lidy == 0&&lidx == 0)
{
for( int mt = 0; mt < 10; mt++ )
mom[mt] = (F)lm[mt];
if(binary)
{
F s = 1./255;
for( int mt = 0; mt < 10; mt++ )
mom[mt] *= s;
}
F xm = x * mom[0], ym = y * mom[0];
// accumulate moments computed in each tile
dst_step /= sizeof(F);
// + m00 ( = m00' )
*(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
// + m10 ( = m10' + x*m00' )
*(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
// + m01 ( = m01' + y*m00' )
*(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
// + m20 ( = m20' + 2*x*m10' + x*x*m00' )
*(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
// + m02 ( = m02' + 2*y*m01' + y*y*m00' )
*(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
*(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
*(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
*(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
*(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
}
}
__kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step,
__global F* dst_m,
int dst_cols, int dst_step, int blocky,
int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
F tmp_coi[4]; // get the coi data
F4 tmp[64];
int VLEN_D = 4; // length of vetor
int gidy = get_global_id(0);
int gidx = get_global_id(1);
int wgidy = get_group_id(0);
int wgidx = get_group_id(1);
int lidy = get_local_id(0);
int lidx = get_local_id(1);
int y = wgidy*TILE_SIZE; // real Y index of pixel
int x = wgidx*TILE_SIZE; // real X index of pixel
int kcn = (cn==2)?2:4;
int rstep = min(src_step/8, TILE_SIZE);
int tileSize_height = min(TILE_SIZE, src_rows - y);
int tileSize_width = min(TILE_SIZE, src_cols - x);
if ( y+lidy < src_rows )
{
if(tileSize_width < TILE_SIZE)
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
*((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0;
if( coi > 0 )
for(int i=0; i < tileSize_width; i+=VLEN_D)
{
for(int j=0; j<4 && ((x+i+j)*kcn+coi-1)<src_cols; j++)
tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
}
else
for(int i=0; i < tileSize_width && (x+i+3) < src_cols; i+=VLEN_D)
tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
}
F4 zero = (F4)(0);
F4 full = (F4)(255);
if( binary )
for(int i=0; i < tileSize_width; i+=VLEN_D)
tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
F mom[10];
__local F m[10][128];
if(lidy < 128)
for(int i=0; i<10; i++)
m[i][lidy]=0;
barrier(CLK_LOCAL_MEM_FENCE);
F lm[10] = {0};
F4 x0 = (F4)(0);
F4 x1 = (F4)(0);
F4 x2 = (F4)(0);
F4 x3 = (F4)(0);
for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D )
{
F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
F4 p = tmp[xt/VLEN_D];
F4 xp = v_xt * p, xxp = xp * v_xt;
x0 += p;
x1 += xp;
x2 += xxp;
x3 += xxp *v_xt;
}
x0.s0 += x0.s1 + x0.s2 + x0.s3;
x1.s0 += x1.s1 + x1.s2 + x1.s3;
x2.s0 += x2.s1 + x2.s2 + x2.s3;
x3.s0 += x3.s1 + x3.s2 + x3.s3;
F py = lidy * x0.s0, sy = lidy*lidy;
int bheight = min(tileSize_height, TILE_SIZE/2);
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
{
m[9][lidy-bheight] = ((F)py) * sy; // m03
m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12
m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21
m[6][lidy-bheight] = x3.s0; // m30
m[5][lidy-bheight] = x0.s0 * sy; // m02
m[4][lidy-bheight] = x1.s0 * lidy; // m11
m[3][lidy-bheight] = x2.s0; // m20
m[2][lidy-bheight] = py; // m01
m[1][lidy-bheight] = x1.s0; // m10
m[0][lidy-bheight] = x0.s0; // m00
}
else if(lidy < bheight)
{
lm[9] = ((F)py) * sy; // m03
lm[8] = ((F)x1.s0) * sy; // m12
lm[7] = ((F)x2.s0) * lidy; // m21
lm[6] = x3.s0; // m30
lm[5] = x0.s0 * sy; // m02
lm[4] = x1.s0 * lidy; // m11
lm[3] = x2.s0; // m20
lm[2] = py; // m01
lm[1] = x1.s0; // m10
lm[0] = x0.s0; // m00
}
barrier(CLK_LOCAL_MEM_FENCE);
for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
{
if(lidy < j)

View File

@ -45,12 +45,12 @@ TEST_P(MomentsTest, Mat)
{
if(test_contours)
{
Mat src = imread( workdir + "../cpp/pic3.png", 1 );
Mat src_gray, canny_output;
cvtColor( src, src_gray, CV_BGR2GRAY );
Mat src = imread( workdir + "../cpp/pic3.png", IMREAD_GRAYSCALE );
ASSERT_FALSE(src.empty());
Mat canny_output;
vector<vector<Point> > contours;
vector<Vec4i> hierarchy;
Canny( src_gray, canny_output, 100, 200, 3 );
Canny( src, canny_output, 100, 200, 3 );
findContours( canny_output, contours, hierarchy, CV_RETR_TREE, CV_CHAIN_APPROX_SIMPLE, Point(0, 0) );
for( size_t i = 0; i < contours.size(); i++ )
{