diff --git a/modules/ocl/src/moments.cpp b/modules/ocl/src/moments.cpp index d6baba207c..cb16fb136d 100644 --- a/modules/ocl/src/moments.cpp +++ b/modules/ocl/src/moments.cpp @@ -16,7 +16,7 @@ // Third party copyrights are property of their respective owners. // // @Authors -// Sen Liu, sen@multicorewareinc.com +// Sen Liu, swjtuls1987@126.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -277,8 +277,8 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary ) blocky = size.height/TILE_SIZE; else blocky = size.height/TILE_SIZE + 1; - cv::ocl::oclMat dst_m(blocky * 10, blockx, CV_64FC1); - cl_mem sum = openCLCreateBuffer(src.clCxt,CL_MEM_READ_WRITE,10*sizeof(double)); + oclMat dst_m(blocky * 10, blockx, CV_64FC1); + oclMat sum(1, 10, CV_64FC1); int tile_width = std::min(size.width,TILE_SIZE); int tile_height = std::min(size.height,TILE_SIZE); size_t localThreads[3] = { tile_height, 1, 1}; @@ -288,19 +288,16 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary ) args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows )); args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols )); args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.width )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.height )); args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data )); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.cols )); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step )); args.push_back( make_pair( sizeof(cl_int) , (void *)&blocky )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&type )); args.push_back( make_pair( sizeof(cl_int) , (void *)&depth )); args.push_back( make_pair( sizeof(cl_int) , (void *)&cn )); args.push_back( make_pair( sizeof(cl_int) , (void *)&coi )); args.push_back( make_pair( sizeof(cl_int) , (void *)&binary )); args.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE )); - openCLExecuteKernel(dst_m.clCxt, &moments, "CvMoments", globalThreads, localThreads, args, -1, depth); + openCLExecuteKernel(Context::getContext(), &moments, "CvMoments", globalThreads, localThreads, args, -1, depth); size_t localThreadss[3] = { 128, 1, 1}; size_t globalThreadss[3] = { 128, 1, 1}; @@ -309,25 +306,23 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary ) args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_height )); args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_width )); args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&sum )); + args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&sum.data )); args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data )); args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step )); - openCLExecuteKernel(dst_m.clCxt, &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1); - double* dstsum = new double[10]; - memset(dstsum,0,10*sizeof(double)); - openCLReadBuffer(dst_m.clCxt,sum,(void *)dstsum,10*sizeof(double)); - mom->m00 = dstsum[0]; - mom->m10 = dstsum[1]; - mom->m01 = dstsum[2]; - mom->m20 = dstsum[3]; - mom->m11 = dstsum[4]; - mom->m02 = dstsum[5]; - mom->m30 = dstsum[6]; - mom->m21 = dstsum[7]; - mom->m12 = dstsum[8]; - mom->m03 = dstsum[9]; - delete [] dstsum; - openCLSafeCall(clReleaseMemObject(sum)); + openCLExecuteKernel(Context::getContext(), &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1); + + Mat dstsum(sum); + mom->m00 = dstsum.at(0, 0); + mom->m10 = dstsum.at(0, 1); + mom->m01 = dstsum.at(0, 2); + mom->m20 = dstsum.at(0, 3); + mom->m11 = dstsum.at(0, 4); + mom->m02 = dstsum.at(0, 5); + mom->m30 = dstsum.at(0, 6); + mom->m21 = dstsum.at(0, 7); + mom->m12 = dstsum.at(0, 8); + mom->m03 = dstsum.at(0, 9); + icvCompleteMomentState( mom ); } diff --git a/modules/ocl/src/opencl/moments.cl b/modules/ocl/src/opencl/moments.cl index 2378f4f849..71313017a9 100644 --- a/modules/ocl/src/opencl/moments.cl +++ b/modules/ocl/src/opencl/moments.cl @@ -173,10 +173,10 @@ __kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_widt sum[i] = dst_sum[i][0]; } -__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, +__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step, __global F* dst_m, int dst_cols, int dst_step, int blocky, - int type, int depth, int cn, int coi, int binary, int TILE_SIZE) + int depth, int cn, int coi, int binary, int TILE_SIZE) { uchar tmp_coi[16]; // get the coi data uchar16 tmp[16]; @@ -192,35 +192,43 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col int x = wgidx*TILE_SIZE; // vector length of uchar int kcn = (cn==2)?2:4; int rstep = min(src_step, TILE_SIZE); - tileSize_height = min(TILE_SIZE, src_rows - y); - tileSize_width = min(TILE_SIZE, src_cols - x); + int tileSize_height = min(TILE_SIZE, src_rows - y); + int tileSize_width = min(TILE_SIZE, src_cols - x); + + if ( y+lidy < src_rows ) + { + if( tileSize_width < TILE_SIZE ) + for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) + *((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0; + + if( coi > 0 ) //channel of interest + for(int i = 0; i < tileSize_width; i += VLEN_C) + { + for(int j=0; j 0 ) //channel of interest - for(int i = 0; i < tileSize_width; i += VLEN_C) - { - for(int j=0; j TILE_SIZE && tileSize_width < TILE_SIZE) - for(int i=tileSize_width; i < rstep; i++ ) - *((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0; - if( coi > 0 ) - for(int i=0; i < tileSize_width; i+=VLEN_US) - { - for(int j=0; j TILE_SIZE && tileSize_width < TILE_SIZE) + for(int i=tileSize_width; i < rstep && (x+i) < src_cols; i++ ) + *((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0; + if( coi > 0 ) + for(int i=0; i < tileSize_width; i+=VLEN_US) + { + for(int j=0; j= 1; j = j/2 ) { if(lidy < j) for( int i = 0; i < 10; i++ ) lm[i] = lm[i] + m[i][lidy]; - barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) + { if(lidy >= j/2&&lidy < j) for( int i = 0; i < 10; i++ ) m[i][lidy-j/2] = lm[i]; - barrier(CLK_LOCAL_MEM_FENCE); } + barrier(CLK_LOCAL_MEM_FENCE); + if(lidy == 0&&lidx == 0) { for(int mt = 0; mt < 10; mt++ ) @@ -482,10 +501,10 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col } } -__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, +__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, __global F* dst_m, int dst_cols, int dst_step, int blocky, - int type, int depth, int cn, int coi, int binary, const int TILE_SIZE) + int depth, int cn, int coi, int binary, const int TILE_SIZE) { short tmp_coi[8]; // get the coi data short8 tmp[32]; @@ -500,21 +519,26 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols int x = wgidx*TILE_SIZE; // real X index of pixel int kcn = (cn==2)?2:4; int rstep = min(src_step/2, TILE_SIZE); - tileSize_height = min(TILE_SIZE, src_rows - y); - tileSize_width = min(TILE_SIZE, src_cols -x); - if(tileSize_width < TILE_SIZE) - for(int i = tileSize_width; i < rstep; i++ ) - *((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0; - if( coi > 0 ) - for(int i=0; i < tileSize_width; i+=VLEN_S) - { - for(int j=0; j 0 ) + for(int i=0; i < tileSize_width; i+=VLEN_S) + { + for(int j=0; j 0 ) - for(int i=0; i < tileSize_width; i+=VLEN_F) - { -#pragma unroll - for(int j=0; j<4; j++) + + if ( y+lidy < src_rows ) + { + if(tileSize_width < TILE_SIZE) + for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) + *((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0; + if( coi > 0 ) + for(int i=0; i < tileSize_width; i+=VLEN_F) { - index = yOff+(x+i+j)*kcn+coi-1; - if (index < maxIdx) - tmp_coi[j] = *(src_data+index); - else - tmp_coi[j] = 0; + for(int j=0; j<4; j++) + tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1); + tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); } - tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); - } - else - for(int i=0; i < tileSize_width && (yOff+x+i) < maxIdx; i+=VLEN_F) - tmp[i/VLEN_F] = (*(__global float4 *)(src_data+yOff+x+i)); + else + for(int i=0; i < tileSize_width; i+=VLEN_F) + tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3)); + } + float4 zero = (float4)(0); float4 full = (float4)(255); if( binary ) @@ -688,10 +708,9 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero; F mom[10]; __local F m[10][128]; - if(lidy == 0) + if(lidy < 128) for(int i = 0; i < 10; i ++) - for(int j = 0; j < 128; j ++) - m[i][j] = 0; + m[i][lidy] = 0; barrier(CLK_LOCAL_MEM_FENCE); F lm[10] = {0}; F4 x0 = (F4)(0); @@ -729,185 +748,6 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols m[0][lidy-bheight] = x0.s0; // m00 } - else if(lidy < bheight) - { - lm[9] = ((F)py) * sy; // m03 - lm[8] = ((F)x1.s0) * sy; // m12 - lm[7] = ((F)x2.s0) * lidy; // m21 - lm[6] = x3.s0; // m30 - lm[5] = x0.s0 * sy; // m02 - lm[4] = x1.s0 * lidy; // m11 - lm[3] = x2.s0; // m20 - lm[2] = py; // m01 - lm[1] = x1.s0; // m10 - lm[0] = x0.s0; // m00 - } - barrier(CLK_LOCAL_MEM_FENCE); - for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) - { - if(lidy < j) - for( int i = 0; i < 10; i++ ) - lm[i] = lm[i] + m[i][lidy]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lidy >= j/2&&lidy < j) - for( int i = 0; i < 10; i++ ) - m[i][lidy-j/2] = lm[i]; - barrier(CLK_LOCAL_MEM_FENCE); - } - if(lidy == 0&&lidx == 0) - { - for( int mt = 0; mt < 10; mt++ ) - mom[mt] = (F)lm[mt]; - if(binary) - { - F s = 1./255; - for( int mt = 0; mt < 10; mt++ ) - mom[mt] *= s; - } - - F xm = x * mom[0], ym = y * mom[0]; - - // accumulate moments computed in each tile - dst_step /= sizeof(F); - - int dst_x_off = mad24(wgidy, dst_cols, wgidx); - int dst_off = 0; - int max_dst_index = 10 * blocky * get_global_size(1); - - // + m00 ( = m00' ) - dst_off = mad24(DST_ROW_00 * blocky, dst_step, dst_x_off); - if (dst_off < max_dst_index) - *(dst_m + dst_off) = mom[0]; - - // + m10 ( = m10' + x*m00' ) - dst_off = mad24(DST_ROW_10 * blocky, dst_step, dst_x_off); - if (dst_off < max_dst_index) - *(dst_m + dst_off) = mom[1] + xm; - - // + m01 ( = m01' + y*m00' ) - dst_off = mad24(DST_ROW_01 * blocky, dst_step, dst_x_off); - if (dst_off < max_dst_index) - *(dst_m + dst_off) = mom[2] + ym; - - // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - dst_off = mad24(DST_ROW_20 * blocky, dst_step, dst_x_off); - if (dst_off < max_dst_index) - *(dst_m + dst_off) = mom[3] + x * (mom[1] * 2 + xm); - - // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - dst_off = mad24(DST_ROW_11 * blocky, dst_step, dst_x_off); - if (dst_off < max_dst_index) - *(dst_m + dst_off) = mom[4] + x * (mom[2] + ym) + y * mom[1]; - - // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) - dst_off = mad24(DST_ROW_02 * blocky, dst_step, dst_x_off); - if (dst_off < max_dst_index) - *(dst_m + dst_off) = mom[5] + y * (mom[2] * 2 + ym); - - // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - dst_off = mad24(DST_ROW_30 * blocky, dst_step, dst_x_off); - if (dst_off < max_dst_index) - *(dst_m + dst_off) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); - - // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - dst_off = mad24(DST_ROW_21 * blocky, dst_step, dst_x_off); - if (dst_off < max_dst_index) - *(dst_m + dst_off) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; - - // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - dst_off = mad24(DST_ROW_12 * blocky, dst_step, dst_x_off); - if (dst_off < max_dst_index) - *(dst_m + dst_off) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; - - // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - dst_off = mad24(DST_ROW_03 * blocky, dst_step, dst_x_off); - if (dst_off < max_dst_index) - *(dst_m + dst_off) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); - } -} - -__kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, - __global F* dst_m, - int dst_cols, int dst_step, int blocky, - int type, int depth, int cn, int coi, int binary, const int TILE_SIZE) -{ - F tmp_coi[4]; // get the coi data - F4 tmp[64]; - int VLEN_D = 4; // length of vetor - int gidy = get_global_id(0); - int gidx = get_global_id(1); - int wgidy = get_group_id(0); - int wgidx = get_group_id(1); - int lidy = get_local_id(0); - int lidx = get_local_id(1); - int y = wgidy*TILE_SIZE; // real Y index of pixel - int x = wgidx*TILE_SIZE; // real X index of pixel - int kcn = (cn==2)?2:4; - int rstep = min(src_step/8, TILE_SIZE); - tileSize_height = min(TILE_SIZE, src_rows - y); - tileSize_width = min(TILE_SIZE, src_cols - x); - - if(tileSize_width < TILE_SIZE) - for(int i = tileSize_width; i < rstep; i++ ) - *((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0; - if( coi > 0 ) - for(int i=0; i < tileSize_width; i+=VLEN_D) - { - for(int j=0; j<4; j++) - tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1); - tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); - } - else - for(int i=0; i < tileSize_width; i+=VLEN_D) - tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3)); - F4 zero = (F4)(0); - F4 full = (F4)(255); - if( binary ) - for(int i=0; i < tileSize_width; i+=VLEN_D) - tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero; - F mom[10]; - __local F m[10][128]; - if(lidy == 0) - for(int i=0; i<10; i++) - for(int j=0; j<128; j++) - m[i][j]=0; - barrier(CLK_LOCAL_MEM_FENCE); - F lm[10] = {0}; - F4 x0 = (F4)(0); - F4 x1 = (F4)(0); - F4 x2 = (F4)(0); - F4 x3 = (F4)(0); - for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D ) - { - F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3); - F4 p = tmp[xt/VLEN_D]; - F4 xp = v_xt * p, xxp = xp * v_xt; - x0 += p; - x1 += xp; - x2 += xxp; - x3 += xxp *v_xt; - } - x0.s0 += x0.s1 + x0.s2 + x0.s3; - x1.s0 += x1.s1 + x1.s2 + x1.s3; - x2.s0 += x2.s1 + x2.s2 + x2.s3; - x3.s0 += x3.s1 + x3.s2 + x3.s3; - - F py = lidy * x0.s0, sy = lidy*lidy; - int bheight = min(tileSize_height, TILE_SIZE/2); - if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) - { - m[9][lidy-bheight] = ((F)py) * sy; // m03 - m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12 - m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21 - m[6][lidy-bheight] = x3.s0; // m30 - m[5][lidy-bheight] = x0.s0 * sy; // m02 - m[4][lidy-bheight] = x1.s0 * lidy; // m11 - m[3][lidy-bheight] = x2.s0; // m20 - m[2][lidy-bheight] = py; // m01 - m[1][lidy-bheight] = x1.s0; // m10 - m[0][lidy-bheight] = x0.s0; // m00 - } - else if(lidy < bheight) { lm[9] = ((F)py) * sy; // m03 @@ -922,6 +762,164 @@ __kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, in lm[0] = x0.s0; // m00 } barrier(CLK_LOCAL_MEM_FENCE); + for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) + { + if(lidy < j) + for( int i = 0; i < 10; i++ ) + lm[i] = lm[i] + m[i][lidy]; + barrier(CLK_LOCAL_MEM_FENCE); + if(lidy >= j/2&&lidy < j) + for( int i = 0; i < 10; i++ ) + m[i][lidy-j/2] = lm[i]; + barrier(CLK_LOCAL_MEM_FENCE); + } + if(lidy == 0&&lidx == 0) + { + for( int mt = 0; mt < 10; mt++ ) + mom[mt] = (F)lm[mt]; + if(binary) + { + F s = 1./255; + for( int mt = 0; mt < 10; mt++ ) + mom[mt] *= s; + } + + F xm = x * mom[0], ym = y * mom[0]; + + // accumulate moments computed in each tile + dst_step /= sizeof(F); + + // + m00 ( = m00' ) + *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; + + // + m10 ( = m10' + x*m00' ) + *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; + + // + m01 ( = m01' + y*m00' ) + *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; + + // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) + *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); + + // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) + *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; + + // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) + *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); + + // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) + *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); + + // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') + *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; + + // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') + *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; + + // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) + *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); + } +} + +__kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step, + __global F* dst_m, + int dst_cols, int dst_step, int blocky, + int depth, int cn, int coi, int binary, const int TILE_SIZE) +{ + F tmp_coi[4]; // get the coi data + F4 tmp[64]; + int VLEN_D = 4; // length of vetor + int gidy = get_global_id(0); + int gidx = get_global_id(1); + int wgidy = get_group_id(0); + int wgidx = get_group_id(1); + int lidy = get_local_id(0); + int lidx = get_local_id(1); + int y = wgidy*TILE_SIZE; // real Y index of pixel + int x = wgidx*TILE_SIZE; // real X index of pixel + int kcn = (cn==2)?2:4; + int rstep = min(src_step/8, TILE_SIZE); + int tileSize_height = min(TILE_SIZE, src_rows - y); + int tileSize_width = min(TILE_SIZE, src_cols - x); + + if ( y+lidy < src_rows ) + { + if(tileSize_width < TILE_SIZE) + for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) + *((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0; + if( coi > 0 ) + for(int i=0; i < tileSize_width; i+=VLEN_D) + { + for(int j=0; j<4 && ((x+i+j)*kcn+coi-1)= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) + { + m[9][lidy-bheight] = ((F)py) * sy; // m03 + m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12 + m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21 + m[6][lidy-bheight] = x3.s0; // m30 + m[5][lidy-bheight] = x0.s0 * sy; // m02 + m[4][lidy-bheight] = x1.s0 * lidy; // m11 + m[3][lidy-bheight] = x2.s0; // m20 + m[2][lidy-bheight] = py; // m01 + m[1][lidy-bheight] = x1.s0; // m10 + m[0][lidy-bheight] = x0.s0; // m00 + } + else if(lidy < bheight) + { + lm[9] = ((F)py) * sy; // m03 + lm[8] = ((F)x1.s0) * sy; // m12 + lm[7] = ((F)x2.s0) * lidy; // m21 + lm[6] = x3.s0; // m30 + lm[5] = x0.s0 * sy; // m02 + lm[4] = x1.s0 * lidy; // m11 + lm[3] = x2.s0; // m20 + lm[2] = py; // m01 + lm[1] = x1.s0; // m10 + lm[0] = x0.s0; // m00 + } + barrier(CLK_LOCAL_MEM_FENCE); + for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) { if(lidy < j) diff --git a/modules/ocl/test/test_moments.cpp b/modules/ocl/test/test_moments.cpp index 98c66def31..86f4779d68 100644 --- a/modules/ocl/test/test_moments.cpp +++ b/modules/ocl/test/test_moments.cpp @@ -45,12 +45,12 @@ TEST_P(MomentsTest, Mat) { if(test_contours) { - Mat src = imread( workdir + "../cpp/pic3.png", 1 ); - Mat src_gray, canny_output; - cvtColor( src, src_gray, CV_BGR2GRAY ); + Mat src = imread( workdir + "../cpp/pic3.png", IMREAD_GRAYSCALE ); + ASSERT_FALSE(src.empty()); + Mat canny_output; vector > contours; vector hierarchy; - Canny( src_gray, canny_output, 100, 200, 3 ); + Canny( src, canny_output, 100, 200, 3 ); findContours( canny_output, contours, hierarchy, CV_RETR_TREE, CV_CHAIN_APPROX_SIMPLE, Point(0, 0) ); for( size_t i = 0; i < contours.size(); i++ ) {