mirror of
https://github.com/opencv/opencv.git
synced 2024-11-28 13:10:12 +08:00
Merge pull request #1045 from bitwangyaoyao:2.4_fix
This commit is contained in:
commit
bf6b119a3f
@ -44,43 +44,21 @@
|
||||
|
||||
int main(int argc, const char *argv[])
|
||||
{
|
||||
vector<ocl::Info> oclinfo;
|
||||
int num_devices = getDevice(oclinfo);
|
||||
|
||||
if (num_devices < 1)
|
||||
{
|
||||
cerr << "no device found\n";
|
||||
return -1;
|
||||
}
|
||||
// set this to overwrite binary cache every time the test starts
|
||||
ocl::setBinaryDiskCache(ocl::CACHE_UPDATE);
|
||||
|
||||
int devidx = 0;
|
||||
|
||||
for (size_t i = 0; i < oclinfo.size(); i++)
|
||||
{
|
||||
for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++)
|
||||
{
|
||||
printf("device %d: %s\n", devidx++, oclinfo[i].DeviceName[j].c_str());
|
||||
}
|
||||
}
|
||||
|
||||
redirectError(cvErrorCallback);
|
||||
|
||||
const char *keys =
|
||||
"{ h | help | false | print help message }"
|
||||
"{ f | filter | | filter for test }"
|
||||
"{ w | workdir | | set working directory }"
|
||||
"{ l | list | false | show all tests }"
|
||||
"{ d | device | 0 | device id }"
|
||||
"{ c | cpu_ocl | false | use cpu as ocl device}"
|
||||
"{ i | iters | 10 | iteration count }"
|
||||
"{ m | warmup | 1 | gpu warm up iteration count}"
|
||||
"{ t | xtop | 1.1 | xfactor top boundary}"
|
||||
"{ b | xbottom | 0.9 | xfactor bottom boundary}"
|
||||
"{ t | xtop | 1.1 | xfactor top boundary}"
|
||||
"{ b | xbottom | 0.9 | xfactor bottom boundary}"
|
||||
"{ v | verify | false | only run gpu once to verify if problems occur}";
|
||||
|
||||
redirectError(cvErrorCallback);
|
||||
CommandLineParser cmd(argc, argv, keys);
|
||||
|
||||
if (cmd.get<bool>("help"))
|
||||
{
|
||||
cout << "Avaible options:" << endl;
|
||||
@ -88,14 +66,40 @@ int main(int argc, const char *argv[])
|
||||
return 0;
|
||||
}
|
||||
|
||||
int device = cmd.get<int>("device");
|
||||
// get ocl devices
|
||||
bool use_cpu = cmd.get<bool>("c");
|
||||
vector<ocl::Info> oclinfo;
|
||||
int num_devices = 0;
|
||||
if(use_cpu)
|
||||
num_devices = getDevice(oclinfo, ocl::CVCL_DEVICE_TYPE_CPU);
|
||||
else
|
||||
num_devices = getDevice(oclinfo);
|
||||
if (num_devices < 1)
|
||||
{
|
||||
cerr << "no device found\n";
|
||||
return -1;
|
||||
}
|
||||
|
||||
// show device info
|
||||
int devidx = 0;
|
||||
for (size_t i = 0; i < oclinfo.size(); i++)
|
||||
{
|
||||
for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++)
|
||||
{
|
||||
cout << "device " << devidx++ << ": " << oclinfo[i].DeviceName[j] << endl;
|
||||
}
|
||||
}
|
||||
|
||||
int device = cmd.get<int>("device");
|
||||
if (device < 0 || device >= num_devices)
|
||||
{
|
||||
cerr << "Invalid device ID" << endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// set this to overwrite binary cache every time the test starts
|
||||
ocl::setBinaryDiskCache(ocl::CACHE_UPDATE);
|
||||
|
||||
if (cmd.get<bool>("verify"))
|
||||
{
|
||||
TestSystem::instance().setNumIters(1);
|
||||
@ -104,7 +108,6 @@ int main(int argc, const char *argv[])
|
||||
}
|
||||
|
||||
devidx = 0;
|
||||
|
||||
for (size_t i = 0; i < oclinfo.size(); i++)
|
||||
{
|
||||
for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++, devidx++)
|
||||
@ -113,7 +116,7 @@ int main(int argc, const char *argv[])
|
||||
{
|
||||
ocl::setDevice(oclinfo[i], (int)j);
|
||||
TestSystem::instance().setRecordName(oclinfo[i].DeviceName[j]);
|
||||
printf("\nuse %d: %s\n", devidx, oclinfo[i].DeviceName[j].c_str());
|
||||
cout << "use " << devidx << ": " <<oclinfo[i].DeviceName[j] << endl;
|
||||
goto END_DEV;
|
||||
}
|
||||
}
|
||||
|
@ -149,7 +149,7 @@ namespace cv
|
||||
cl_image_format format;
|
||||
int err;
|
||||
int depth = mat.depth();
|
||||
int channels = mat.channels();
|
||||
int channels = mat.oclchannels();
|
||||
|
||||
switch(depth)
|
||||
{
|
||||
|
@ -16,7 +16,7 @@
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Sen Liu, sen@multicorewareinc.com
|
||||
// Sen Liu, swjtuls1987@126.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@ -277,8 +277,8 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
|
||||
blocky = size.height/TILE_SIZE;
|
||||
else
|
||||
blocky = size.height/TILE_SIZE + 1;
|
||||
cv::ocl::oclMat dst_m(blocky * 10, blockx, CV_64FC1);
|
||||
cl_mem sum = openCLCreateBuffer(src.clCxt,CL_MEM_READ_WRITE,10*sizeof(double));
|
||||
oclMat dst_m(blocky * 10, blockx, CV_64FC1);
|
||||
oclMat sum(1, 10, CV_64FC1);
|
||||
int tile_width = std::min(size.width,TILE_SIZE);
|
||||
int tile_height = std::min(size.height,TILE_SIZE);
|
||||
size_t localThreads[3] = { tile_height, 1, 1};
|
||||
@ -288,19 +288,16 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
|
||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
|
||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
|
||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
|
||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.width ));
|
||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.height ));
|
||||
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
|
||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.cols ));
|
||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
|
||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&blocky ));
|
||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&type ));
|
||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&depth ));
|
||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&cn ));
|
||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&coi ));
|
||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&binary ));
|
||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
|
||||
openCLExecuteKernel(dst_m.clCxt, &moments, "CvMoments", globalThreads, localThreads, args, -1, depth);
|
||||
openCLExecuteKernel(Context::getContext(), &moments, "CvMoments", globalThreads, localThreads, args, -1, depth);
|
||||
|
||||
size_t localThreadss[3] = { 128, 1, 1};
|
||||
size_t globalThreadss[3] = { 128, 1, 1};
|
||||
@ -309,25 +306,23 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
|
||||
args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_height ));
|
||||
args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_width ));
|
||||
args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
|
||||
args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&sum ));
|
||||
args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&sum.data ));
|
||||
args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
|
||||
args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
|
||||
openCLExecuteKernel(dst_m.clCxt, &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1);
|
||||
double* dstsum = new double[10];
|
||||
memset(dstsum,0,10*sizeof(double));
|
||||
openCLReadBuffer(dst_m.clCxt,sum,(void *)dstsum,10*sizeof(double));
|
||||
mom->m00 = dstsum[0];
|
||||
mom->m10 = dstsum[1];
|
||||
mom->m01 = dstsum[2];
|
||||
mom->m20 = dstsum[3];
|
||||
mom->m11 = dstsum[4];
|
||||
mom->m02 = dstsum[5];
|
||||
mom->m30 = dstsum[6];
|
||||
mom->m21 = dstsum[7];
|
||||
mom->m12 = dstsum[8];
|
||||
mom->m03 = dstsum[9];
|
||||
delete [] dstsum;
|
||||
openCLSafeCall(clReleaseMemObject(sum));
|
||||
openCLExecuteKernel(Context::getContext(), &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1);
|
||||
|
||||
Mat dstsum(sum);
|
||||
mom->m00 = dstsum.at<double>(0, 0);
|
||||
mom->m10 = dstsum.at<double>(0, 1);
|
||||
mom->m01 = dstsum.at<double>(0, 2);
|
||||
mom->m20 = dstsum.at<double>(0, 3);
|
||||
mom->m11 = dstsum.at<double>(0, 4);
|
||||
mom->m02 = dstsum.at<double>(0, 5);
|
||||
mom->m30 = dstsum.at<double>(0, 6);
|
||||
mom->m21 = dstsum.at<double>(0, 7);
|
||||
mom->m12 = dstsum.at<double>(0, 8);
|
||||
mom->m03 = dstsum.at<double>(0, 9);
|
||||
|
||||
icvCompleteMomentState( mom );
|
||||
}
|
||||
|
||||
|
@ -173,10 +173,10 @@ __kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_widt
|
||||
sum[i] = dst_sum[i][0];
|
||||
}
|
||||
|
||||
__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
|
||||
__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step,
|
||||
__global F* dst_m,
|
||||
int dst_cols, int dst_step, int blocky,
|
||||
int type, int depth, int cn, int coi, int binary, int TILE_SIZE)
|
||||
int depth, int cn, int coi, int binary, int TILE_SIZE)
|
||||
{
|
||||
uchar tmp_coi[16]; // get the coi data
|
||||
uchar16 tmp[16];
|
||||
@ -192,35 +192,43 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
|
||||
int x = wgidx*TILE_SIZE; // vector length of uchar
|
||||
int kcn = (cn==2)?2:4;
|
||||
int rstep = min(src_step, TILE_SIZE);
|
||||
tileSize_height = min(TILE_SIZE, src_rows - y);
|
||||
tileSize_width = min(TILE_SIZE, src_cols - x);
|
||||
int tileSize_height = min(TILE_SIZE, src_rows - y);
|
||||
int tileSize_width = min(TILE_SIZE, src_cols - x);
|
||||
|
||||
if ( y+lidy < src_rows )
|
||||
{
|
||||
if( tileSize_width < TILE_SIZE )
|
||||
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
|
||||
*((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0;
|
||||
|
||||
if( coi > 0 ) //channel of interest
|
||||
for(int i = 0; i < tileSize_width; i += VLEN_C)
|
||||
{
|
||||
for(int j=0; j<VLEN_C; j++)
|
||||
tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1);
|
||||
tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
|
||||
tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]);
|
||||
}
|
||||
else
|
||||
for(int i=0; i < tileSize_width; i+=VLEN_C)
|
||||
tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C);
|
||||
}
|
||||
|
||||
if( tileSize_width < TILE_SIZE )
|
||||
for(int i = tileSize_width; i < rstep; i++ )
|
||||
*((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0;
|
||||
if( coi > 0 ) //channel of interest
|
||||
for(int i = 0; i < tileSize_width; i += VLEN_C)
|
||||
{
|
||||
for(int j=0; j<VLEN_C; j++)
|
||||
tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1);
|
||||
tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
|
||||
tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]);
|
||||
}
|
||||
else
|
||||
for(int i=0; i < tileSize_width; i+=VLEN_C)
|
||||
tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C);
|
||||
uchar16 zero = (uchar16)(0);
|
||||
uchar16 full = (uchar16)(255);
|
||||
if( binary )
|
||||
for(int i=0; i < tileSize_width; i+=VLEN_C)
|
||||
tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero;
|
||||
|
||||
F mom[10];
|
||||
__local int m[10][128];
|
||||
if(lidy == 0)
|
||||
if(lidy < 128)
|
||||
{
|
||||
for(int i=0; i<10; i++)
|
||||
for(int j=0; j<128; j++)
|
||||
m[i][j]=0;
|
||||
m[i][lidy]=0;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int lm[10] = {0};
|
||||
int16 x0 = (int16)(0);
|
||||
int16 x1 = (int16)(0);
|
||||
@ -281,6 +289,7 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
|
||||
m[i][lidy-j/2] = lm[i];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if(lidy == 0&&lidx == 0)
|
||||
{
|
||||
for( int mt = 0; mt < 10; mt++ )
|
||||
@ -328,10 +337,10 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
|
||||
__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step,
|
||||
__global F* dst_m,
|
||||
int dst_cols, int dst_step, int blocky,
|
||||
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
|
||||
int depth, int cn, int coi, int binary, const int TILE_SIZE)
|
||||
{
|
||||
ushort tmp_coi[8]; // get the coi data
|
||||
ushort8 tmp[32];
|
||||
@ -346,21 +355,26 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
|
||||
int x = wgidx*TILE_SIZE; // real X index of pixel
|
||||
int kcn = (cn==2)?2:4;
|
||||
int rstep = min(src_step/2, TILE_SIZE);
|
||||
tileSize_height = min(TILE_SIZE, src_rows - y);
|
||||
tileSize_width = min(TILE_SIZE, src_cols -x);
|
||||
if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE)
|
||||
for(int i=tileSize_width; i < rstep; i++ )
|
||||
*((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0;
|
||||
if( coi > 0 )
|
||||
for(int i=0; i < tileSize_width; i+=VLEN_US)
|
||||
{
|
||||
for(int j=0; j<VLEN_US; j++)
|
||||
tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1);
|
||||
tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
|
||||
}
|
||||
else
|
||||
for(int i=0; i < tileSize_width; i+=VLEN_US)
|
||||
tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US);
|
||||
int tileSize_height = min(TILE_SIZE, src_rows - y);
|
||||
int tileSize_width = min(TILE_SIZE, src_cols -x);
|
||||
|
||||
if ( y+lidy < src_rows )
|
||||
{
|
||||
if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE)
|
||||
for(int i=tileSize_width; i < rstep && (x+i) < src_cols; i++ )
|
||||
*((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0;
|
||||
if( coi > 0 )
|
||||
for(int i=0; i < tileSize_width; i+=VLEN_US)
|
||||
{
|
||||
for(int j=0; j<VLEN_US; j++)
|
||||
tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1);
|
||||
tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
|
||||
}
|
||||
else
|
||||
for(int i=0; i < tileSize_width; i+=VLEN_US)
|
||||
tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US);
|
||||
}
|
||||
|
||||
ushort8 zero = (ushort8)(0);
|
||||
ushort8 full = (ushort8)(255);
|
||||
if( binary )
|
||||
@ -368,11 +382,11 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
|
||||
tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero;
|
||||
F mom[10];
|
||||
__local long m[10][128];
|
||||
if(lidy == 0)
|
||||
if(lidy < 128)
|
||||
for(int i=0; i<10; i++)
|
||||
for(int j=0; j<128; j++)
|
||||
m[i][j]=0;
|
||||
m[i][lidy]=0;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
long lm[10] = {0};
|
||||
int8 x0 = (int8)(0);
|
||||
int8 x1 = (int8)(0);
|
||||
@ -422,17 +436,22 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
|
||||
lm[0] = x0.s0; // m00
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
|
||||
{
|
||||
if(lidy < j)
|
||||
for( int i = 0; i < 10; i++ )
|
||||
lm[i] = lm[i] + m[i][lidy];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
|
||||
{
|
||||
if(lidy >= j/2&&lidy < j)
|
||||
for( int i = 0; i < 10; i++ )
|
||||
m[i][lidy-j/2] = lm[i];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(lidy == 0&&lidx == 0)
|
||||
{
|
||||
for(int mt = 0; mt < 10; mt++ )
|
||||
@ -482,10 +501,10 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
|
||||
__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step,
|
||||
__global F* dst_m,
|
||||
int dst_cols, int dst_step, int blocky,
|
||||
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
|
||||
int depth, int cn, int coi, int binary, const int TILE_SIZE)
|
||||
{
|
||||
short tmp_coi[8]; // get the coi data
|
||||
short8 tmp[32];
|
||||
@ -500,21 +519,26 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
|
||||
int x = wgidx*TILE_SIZE; // real X index of pixel
|
||||
int kcn = (cn==2)?2:4;
|
||||
int rstep = min(src_step/2, TILE_SIZE);
|
||||
tileSize_height = min(TILE_SIZE, src_rows - y);
|
||||
tileSize_width = min(TILE_SIZE, src_cols -x);
|
||||
if(tileSize_width < TILE_SIZE)
|
||||
for(int i = tileSize_width; i < rstep; i++ )
|
||||
*((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0;
|
||||
if( coi > 0 )
|
||||
for(int i=0; i < tileSize_width; i+=VLEN_S)
|
||||
{
|
||||
for(int j=0; j<VLEN_S; j++)
|
||||
tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1);
|
||||
tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
|
||||
}
|
||||
else
|
||||
for(int i=0; i < tileSize_width; i+=VLEN_S)
|
||||
tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S);
|
||||
int tileSize_height = min(TILE_SIZE, src_rows - y);
|
||||
int tileSize_width = min(TILE_SIZE, src_cols -x);
|
||||
|
||||
if ( y+lidy < src_rows )
|
||||
{
|
||||
if(tileSize_width < TILE_SIZE)
|
||||
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
|
||||
*((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0;
|
||||
if( coi > 0 )
|
||||
for(int i=0; i < tileSize_width; i+=VLEN_S)
|
||||
{
|
||||
for(int j=0; j<VLEN_S; j++)
|
||||
tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1);
|
||||
tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
|
||||
}
|
||||
else
|
||||
for(int i=0; i < tileSize_width; i+=VLEN_S)
|
||||
tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S);
|
||||
}
|
||||
|
||||
short8 zero = (short8)(0);
|
||||
short8 full = (short8)(255);
|
||||
if( binary )
|
||||
@ -523,10 +547,9 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
|
||||
|
||||
F mom[10];
|
||||
__local long m[10][128];
|
||||
if(lidy == 0)
|
||||
if(lidy < 128)
|
||||
for(int i=0; i<10; i++)
|
||||
for(int j=0; j<128; j++)
|
||||
m[i][j]=0;
|
||||
m[i][lidy]=0;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
long lm[10] = {0};
|
||||
int8 x0 = (int8)(0);
|
||||
@ -637,10 +660,10 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
|
||||
__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step,
|
||||
__global F* dst_m,
|
||||
int dst_cols, int dst_step, int blocky,
|
||||
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
|
||||
int depth, int cn, int coi, int binary, const int TILE_SIZE)
|
||||
{
|
||||
float tmp_coi[4]; // get the coi data
|
||||
float4 tmp[64] ;
|
||||
@ -654,33 +677,30 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
|
||||
int y = wgidy*TILE_SIZE; // real Y index of pixel
|
||||
int x = wgidx*TILE_SIZE; // real X index of pixel
|
||||
int kcn = (cn==2)?2:4;
|
||||
src_step /= sizeof(*src_data);
|
||||
int rstep = min(src_step, TILE_SIZE);
|
||||
tileSize_height = min(TILE_SIZE, src_rows - y);
|
||||
tileSize_width = min(TILE_SIZE, src_cols -x);
|
||||
int rstep = min(src_step/4, TILE_SIZE);
|
||||
int tileSize_height = min(TILE_SIZE, src_rows - y);
|
||||
int tileSize_width = min(TILE_SIZE, src_cols -x);
|
||||
int maxIdx = mul24(src_rows, src_cols);
|
||||
int yOff = (y+lidy)*src_step;
|
||||
int index;
|
||||
if(tileSize_width < TILE_SIZE && yOff < src_rows)
|
||||
for(int i = tileSize_width; i < rstep && (yOff+x+i) < maxIdx; i++ )
|
||||
*(src_data+yOff+x+i) = 0;
|
||||
if( coi > 0 )
|
||||
for(int i=0; i < tileSize_width; i+=VLEN_F)
|
||||
{
|
||||
#pragma unroll
|
||||
for(int j=0; j<4; j++)
|
||||
|
||||
if ( y+lidy < src_rows )
|
||||
{
|
||||
if(tileSize_width < TILE_SIZE)
|
||||
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
|
||||
*((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0;
|
||||
if( coi > 0 )
|
||||
for(int i=0; i < tileSize_width; i+=VLEN_F)
|
||||
{
|
||||
index = yOff+(x+i+j)*kcn+coi-1;
|
||||
if (index < maxIdx)
|
||||
tmp_coi[j] = *(src_data+index);
|
||||
else
|
||||
tmp_coi[j] = 0;
|
||||
for(int j=0; j<4; j++)
|
||||
tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1);
|
||||
tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
|
||||
}
|
||||
tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
|
||||
}
|
||||
else
|
||||
for(int i=0; i < tileSize_width && (yOff+x+i) < maxIdx; i+=VLEN_F)
|
||||
tmp[i/VLEN_F] = (*(__global float4 *)(src_data+yOff+x+i));
|
||||
else
|
||||
for(int i=0; i < tileSize_width; i+=VLEN_F)
|
||||
tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3));
|
||||
}
|
||||
|
||||
float4 zero = (float4)(0);
|
||||
float4 full = (float4)(255);
|
||||
if( binary )
|
||||
@ -688,10 +708,9 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
|
||||
tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero;
|
||||
F mom[10];
|
||||
__local F m[10][128];
|
||||
if(lidy == 0)
|
||||
if(lidy < 128)
|
||||
for(int i = 0; i < 10; i ++)
|
||||
for(int j = 0; j < 128; j ++)
|
||||
m[i][j] = 0;
|
||||
m[i][lidy] = 0;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
F lm[10] = {0};
|
||||
F4 x0 = (F4)(0);
|
||||
@ -729,185 +748,6 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
|
||||
m[0][lidy-bheight] = x0.s0; // m00
|
||||
}
|
||||
|
||||
else if(lidy < bheight)
|
||||
{
|
||||
lm[9] = ((F)py) * sy; // m03
|
||||
lm[8] = ((F)x1.s0) * sy; // m12
|
||||
lm[7] = ((F)x2.s0) * lidy; // m21
|
||||
lm[6] = x3.s0; // m30
|
||||
lm[5] = x0.s0 * sy; // m02
|
||||
lm[4] = x1.s0 * lidy; // m11
|
||||
lm[3] = x2.s0; // m20
|
||||
lm[2] = py; // m01
|
||||
lm[1] = x1.s0; // m10
|
||||
lm[0] = x0.s0; // m00
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
|
||||
{
|
||||
if(lidy < j)
|
||||
for( int i = 0; i < 10; i++ )
|
||||
lm[i] = lm[i] + m[i][lidy];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lidy >= j/2&&lidy < j)
|
||||
for( int i = 0; i < 10; i++ )
|
||||
m[i][lidy-j/2] = lm[i];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if(lidy == 0&&lidx == 0)
|
||||
{
|
||||
for( int mt = 0; mt < 10; mt++ )
|
||||
mom[mt] = (F)lm[mt];
|
||||
if(binary)
|
||||
{
|
||||
F s = 1./255;
|
||||
for( int mt = 0; mt < 10; mt++ )
|
||||
mom[mt] *= s;
|
||||
}
|
||||
|
||||
F xm = x * mom[0], ym = y * mom[0];
|
||||
|
||||
// accumulate moments computed in each tile
|
||||
dst_step /= sizeof(F);
|
||||
|
||||
int dst_x_off = mad24(wgidy, dst_cols, wgidx);
|
||||
int dst_off = 0;
|
||||
int max_dst_index = 10 * blocky * get_global_size(1);
|
||||
|
||||
// + m00 ( = m00' )
|
||||
dst_off = mad24(DST_ROW_00 * blocky, dst_step, dst_x_off);
|
||||
if (dst_off < max_dst_index)
|
||||
*(dst_m + dst_off) = mom[0];
|
||||
|
||||
// + m10 ( = m10' + x*m00' )
|
||||
dst_off = mad24(DST_ROW_10 * blocky, dst_step, dst_x_off);
|
||||
if (dst_off < max_dst_index)
|
||||
*(dst_m + dst_off) = mom[1] + xm;
|
||||
|
||||
// + m01 ( = m01' + y*m00' )
|
||||
dst_off = mad24(DST_ROW_01 * blocky, dst_step, dst_x_off);
|
||||
if (dst_off < max_dst_index)
|
||||
*(dst_m + dst_off) = mom[2] + ym;
|
||||
|
||||
// + m20 ( = m20' + 2*x*m10' + x*x*m00' )
|
||||
dst_off = mad24(DST_ROW_20 * blocky, dst_step, dst_x_off);
|
||||
if (dst_off < max_dst_index)
|
||||
*(dst_m + dst_off) = mom[3] + x * (mom[1] * 2 + xm);
|
||||
|
||||
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
|
||||
dst_off = mad24(DST_ROW_11 * blocky, dst_step, dst_x_off);
|
||||
if (dst_off < max_dst_index)
|
||||
*(dst_m + dst_off) = mom[4] + x * (mom[2] + ym) + y * mom[1];
|
||||
|
||||
// + m02 ( = m02' + 2*y*m01' + y*y*m00' )
|
||||
dst_off = mad24(DST_ROW_02 * blocky, dst_step, dst_x_off);
|
||||
if (dst_off < max_dst_index)
|
||||
*(dst_m + dst_off) = mom[5] + y * (mom[2] * 2 + ym);
|
||||
|
||||
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
|
||||
dst_off = mad24(DST_ROW_30 * blocky, dst_step, dst_x_off);
|
||||
if (dst_off < max_dst_index)
|
||||
*(dst_m + dst_off) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
|
||||
|
||||
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
|
||||
dst_off = mad24(DST_ROW_21 * blocky, dst_step, dst_x_off);
|
||||
if (dst_off < max_dst_index)
|
||||
*(dst_m + dst_off) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
|
||||
|
||||
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
|
||||
dst_off = mad24(DST_ROW_12 * blocky, dst_step, dst_x_off);
|
||||
if (dst_off < max_dst_index)
|
||||
*(dst_m + dst_off) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
|
||||
|
||||
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
|
||||
dst_off = mad24(DST_ROW_03 * blocky, dst_step, dst_x_off);
|
||||
if (dst_off < max_dst_index)
|
||||
*(dst_m + dst_off) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
|
||||
__global F* dst_m,
|
||||
int dst_cols, int dst_step, int blocky,
|
||||
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
|
||||
{
|
||||
F tmp_coi[4]; // get the coi data
|
||||
F4 tmp[64];
|
||||
int VLEN_D = 4; // length of vetor
|
||||
int gidy = get_global_id(0);
|
||||
int gidx = get_global_id(1);
|
||||
int wgidy = get_group_id(0);
|
||||
int wgidx = get_group_id(1);
|
||||
int lidy = get_local_id(0);
|
||||
int lidx = get_local_id(1);
|
||||
int y = wgidy*TILE_SIZE; // real Y index of pixel
|
||||
int x = wgidx*TILE_SIZE; // real X index of pixel
|
||||
int kcn = (cn==2)?2:4;
|
||||
int rstep = min(src_step/8, TILE_SIZE);
|
||||
tileSize_height = min(TILE_SIZE, src_rows - y);
|
||||
tileSize_width = min(TILE_SIZE, src_cols - x);
|
||||
|
||||
if(tileSize_width < TILE_SIZE)
|
||||
for(int i = tileSize_width; i < rstep; i++ )
|
||||
*((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0;
|
||||
if( coi > 0 )
|
||||
for(int i=0; i < tileSize_width; i+=VLEN_D)
|
||||
{
|
||||
for(int j=0; j<4; j++)
|
||||
tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
|
||||
tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
|
||||
}
|
||||
else
|
||||
for(int i=0; i < tileSize_width; i+=VLEN_D)
|
||||
tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
|
||||
F4 zero = (F4)(0);
|
||||
F4 full = (F4)(255);
|
||||
if( binary )
|
||||
for(int i=0; i < tileSize_width; i+=VLEN_D)
|
||||
tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
|
||||
F mom[10];
|
||||
__local F m[10][128];
|
||||
if(lidy == 0)
|
||||
for(int i=0; i<10; i++)
|
||||
for(int j=0; j<128; j++)
|
||||
m[i][j]=0;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
F lm[10] = {0};
|
||||
F4 x0 = (F4)(0);
|
||||
F4 x1 = (F4)(0);
|
||||
F4 x2 = (F4)(0);
|
||||
F4 x3 = (F4)(0);
|
||||
for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D )
|
||||
{
|
||||
F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
|
||||
F4 p = tmp[xt/VLEN_D];
|
||||
F4 xp = v_xt * p, xxp = xp * v_xt;
|
||||
x0 += p;
|
||||
x1 += xp;
|
||||
x2 += xxp;
|
||||
x3 += xxp *v_xt;
|
||||
}
|
||||
x0.s0 += x0.s1 + x0.s2 + x0.s3;
|
||||
x1.s0 += x1.s1 + x1.s2 + x1.s3;
|
||||
x2.s0 += x2.s1 + x2.s2 + x2.s3;
|
||||
x3.s0 += x3.s1 + x3.s2 + x3.s3;
|
||||
|
||||
F py = lidy * x0.s0, sy = lidy*lidy;
|
||||
int bheight = min(tileSize_height, TILE_SIZE/2);
|
||||
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
|
||||
{
|
||||
m[9][lidy-bheight] = ((F)py) * sy; // m03
|
||||
m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12
|
||||
m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21
|
||||
m[6][lidy-bheight] = x3.s0; // m30
|
||||
m[5][lidy-bheight] = x0.s0 * sy; // m02
|
||||
m[4][lidy-bheight] = x1.s0 * lidy; // m11
|
||||
m[3][lidy-bheight] = x2.s0; // m20
|
||||
m[2][lidy-bheight] = py; // m01
|
||||
m[1][lidy-bheight] = x1.s0; // m10
|
||||
m[0][lidy-bheight] = x0.s0; // m00
|
||||
}
|
||||
|
||||
else if(lidy < bheight)
|
||||
{
|
||||
lm[9] = ((F)py) * sy; // m03
|
||||
@ -922,6 +762,164 @@ __kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, in
|
||||
lm[0] = x0.s0; // m00
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
|
||||
{
|
||||
if(lidy < j)
|
||||
for( int i = 0; i < 10; i++ )
|
||||
lm[i] = lm[i] + m[i][lidy];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lidy >= j/2&&lidy < j)
|
||||
for( int i = 0; i < 10; i++ )
|
||||
m[i][lidy-j/2] = lm[i];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if(lidy == 0&&lidx == 0)
|
||||
{
|
||||
for( int mt = 0; mt < 10; mt++ )
|
||||
mom[mt] = (F)lm[mt];
|
||||
if(binary)
|
||||
{
|
||||
F s = 1./255;
|
||||
for( int mt = 0; mt < 10; mt++ )
|
||||
mom[mt] *= s;
|
||||
}
|
||||
|
||||
F xm = x * mom[0], ym = y * mom[0];
|
||||
|
||||
// accumulate moments computed in each tile
|
||||
dst_step /= sizeof(F);
|
||||
|
||||
// + m00 ( = m00' )
|
||||
*(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
|
||||
|
||||
// + m10 ( = m10' + x*m00' )
|
||||
*(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
|
||||
|
||||
// + m01 ( = m01' + y*m00' )
|
||||
*(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
|
||||
|
||||
// + m20 ( = m20' + 2*x*m10' + x*x*m00' )
|
||||
*(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
|
||||
|
||||
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
|
||||
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
|
||||
|
||||
// + m02 ( = m02' + 2*y*m01' + y*y*m00' )
|
||||
*(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
|
||||
|
||||
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
|
||||
*(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
|
||||
|
||||
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
|
||||
*(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
|
||||
|
||||
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
|
||||
*(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
|
||||
|
||||
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
|
||||
*(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step,
|
||||
__global F* dst_m,
|
||||
int dst_cols, int dst_step, int blocky,
|
||||
int depth, int cn, int coi, int binary, const int TILE_SIZE)
|
||||
{
|
||||
F tmp_coi[4]; // get the coi data
|
||||
F4 tmp[64];
|
||||
int VLEN_D = 4; // length of vetor
|
||||
int gidy = get_global_id(0);
|
||||
int gidx = get_global_id(1);
|
||||
int wgidy = get_group_id(0);
|
||||
int wgidx = get_group_id(1);
|
||||
int lidy = get_local_id(0);
|
||||
int lidx = get_local_id(1);
|
||||
int y = wgidy*TILE_SIZE; // real Y index of pixel
|
||||
int x = wgidx*TILE_SIZE; // real X index of pixel
|
||||
int kcn = (cn==2)?2:4;
|
||||
int rstep = min(src_step/8, TILE_SIZE);
|
||||
int tileSize_height = min(TILE_SIZE, src_rows - y);
|
||||
int tileSize_width = min(TILE_SIZE, src_cols - x);
|
||||
|
||||
if ( y+lidy < src_rows )
|
||||
{
|
||||
if(tileSize_width < TILE_SIZE)
|
||||
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
|
||||
*((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0;
|
||||
if( coi > 0 )
|
||||
for(int i=0; i < tileSize_width; i+=VLEN_D)
|
||||
{
|
||||
for(int j=0; j<4 && ((x+i+j)*kcn+coi-1)<src_cols; j++)
|
||||
tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
|
||||
tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
|
||||
}
|
||||
else
|
||||
for(int i=0; i < tileSize_width && (x+i+3) < src_cols; i+=VLEN_D)
|
||||
tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
|
||||
}
|
||||
|
||||
F4 zero = (F4)(0);
|
||||
F4 full = (F4)(255);
|
||||
if( binary )
|
||||
for(int i=0; i < tileSize_width; i+=VLEN_D)
|
||||
tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
|
||||
F mom[10];
|
||||
__local F m[10][128];
|
||||
if(lidy < 128)
|
||||
for(int i=0; i<10; i++)
|
||||
m[i][lidy]=0;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
F lm[10] = {0};
|
||||
F4 x0 = (F4)(0);
|
||||
F4 x1 = (F4)(0);
|
||||
F4 x2 = (F4)(0);
|
||||
F4 x3 = (F4)(0);
|
||||
for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D )
|
||||
{
|
||||
F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
|
||||
F4 p = tmp[xt/VLEN_D];
|
||||
F4 xp = v_xt * p, xxp = xp * v_xt;
|
||||
x0 += p;
|
||||
x1 += xp;
|
||||
x2 += xxp;
|
||||
x3 += xxp *v_xt;
|
||||
}
|
||||
x0.s0 += x0.s1 + x0.s2 + x0.s3;
|
||||
x1.s0 += x1.s1 + x1.s2 + x1.s3;
|
||||
x2.s0 += x2.s1 + x2.s2 + x2.s3;
|
||||
x3.s0 += x3.s1 + x3.s2 + x3.s3;
|
||||
|
||||
F py = lidy * x0.s0, sy = lidy*lidy;
|
||||
int bheight = min(tileSize_height, TILE_SIZE/2);
|
||||
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
|
||||
{
|
||||
m[9][lidy-bheight] = ((F)py) * sy; // m03
|
||||
m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12
|
||||
m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21
|
||||
m[6][lidy-bheight] = x3.s0; // m30
|
||||
m[5][lidy-bheight] = x0.s0 * sy; // m02
|
||||
m[4][lidy-bheight] = x1.s0 * lidy; // m11
|
||||
m[3][lidy-bheight] = x2.s0; // m20
|
||||
m[2][lidy-bheight] = py; // m01
|
||||
m[1][lidy-bheight] = x1.s0; // m10
|
||||
m[0][lidy-bheight] = x0.s0; // m00
|
||||
}
|
||||
else if(lidy < bheight)
|
||||
{
|
||||
lm[9] = ((F)py) * sy; // m03
|
||||
lm[8] = ((F)x1.s0) * sy; // m12
|
||||
lm[7] = ((F)x2.s0) * lidy; // m21
|
||||
lm[6] = x3.s0; // m30
|
||||
lm[5] = x0.s0 * sy; // m02
|
||||
lm[4] = x1.s0 * lidy; // m11
|
||||
lm[3] = x2.s0; // m20
|
||||
lm[2] = py; // m01
|
||||
lm[1] = x1.s0; // m10
|
||||
lm[0] = x0.s0; // m00
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
|
||||
{
|
||||
if(lidy < j)
|
||||
|
@ -162,8 +162,8 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
|
||||
int y_tex;
|
||||
int x_tex = X - radius;
|
||||
|
||||
if (x_tex >= cwidth)
|
||||
return;
|
||||
//if (x_tex >= cwidth)
|
||||
// return;
|
||||
|
||||
for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
|
||||
{
|
||||
|
@ -45,12 +45,12 @@ TEST_P(MomentsTest, Mat)
|
||||
{
|
||||
if(test_contours)
|
||||
{
|
||||
Mat src = imread( workdir + "../cpp/pic3.png", 1 );
|
||||
Mat src_gray, canny_output;
|
||||
cvtColor( src, src_gray, CV_BGR2GRAY );
|
||||
Mat src = imread( workdir + "../cpp/pic3.png", IMREAD_GRAYSCALE );
|
||||
ASSERT_FALSE(src.empty());
|
||||
Mat canny_output;
|
||||
vector<vector<Point> > contours;
|
||||
vector<Vec4i> hierarchy;
|
||||
Canny( src_gray, canny_output, 100, 200, 3 );
|
||||
Canny( src, canny_output, 100, 200, 3 );
|
||||
findContours( canny_output, contours, hierarchy, CV_RETR_TREE, CV_CHAIN_APPROX_SIMPLE, Point(0, 0) );
|
||||
for( size_t i = 0; i < contours.size(); i++ )
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user