mirror of
https://github.com/opencv/opencv.git
synced 2025-01-18 22:44:02 +08:00
improved performance of moments (on 720p or larger images)
This commit is contained in:
parent
e97dd57dc7
commit
48c7378c8f
@ -365,7 +365,7 @@ Moments::Moments( double _m00, double _m10, double _m01, double _m20, double _m1
|
||||
|
||||
static bool ocl_moments( InputArray _src, Moments& m)
|
||||
{
|
||||
const int TILE_SIZE = 16;
|
||||
const int TILE_SIZE = 32;
|
||||
const int K = 10;
|
||||
ocl::Kernel k("moments", ocl::imgproc::moments_oclsrc, format("-D TILE_SIZE=%d", TILE_SIZE));
|
||||
if( k.empty() )
|
||||
@ -378,10 +378,10 @@ static bool ocl_moments( InputArray _src, Moments& m)
|
||||
int ntiles = xtiles*ytiles;
|
||||
UMat umbuf(1, ntiles*K, CV_32S);
|
||||
|
||||
size_t globalsize[] = {xtiles, ytiles};
|
||||
size_t globalsize[] = {xtiles, sz.height}, localsize[] = {1, TILE_SIZE};
|
||||
bool ok = k.args(ocl::KernelArg::ReadOnly(src),
|
||||
ocl::KernelArg::PtrWriteOnly(umbuf),
|
||||
xtiles).run(2, globalsize, 0, true);
|
||||
xtiles).run(2, globalsize, localsize, true);
|
||||
if(!ok)
|
||||
return false;
|
||||
Mat mbuf = umbuf.getMat(ACCESS_READ);
|
||||
|
@ -1,32 +1,31 @@
|
||||
/* See LICENSE file in the root OpenCV directory */
|
||||
|
||||
#if TILE_SIZE > 16
|
||||
#error "TILE SIZE should be <= 16"
|
||||
#if TILE_SIZE != 32
|
||||
#error "TILE SIZE should be 32"
|
||||
#endif
|
||||
|
||||
__kernel void moments(__global const uchar* src, int src_step, int src_offset,
|
||||
int src_rows, int src_cols, __global int* mom0, int xtiles)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
int x_min = x*TILE_SIZE;
|
||||
int y_min = y*TILE_SIZE;
|
||||
int x0 = get_global_id(0);
|
||||
int y0 = get_group_id(1);
|
||||
int x, y = get_local_id(1);
|
||||
int x_min = x0*TILE_SIZE;
|
||||
int ypix = y0*TILE_SIZE + y;
|
||||
__local int mom[TILE_SIZE][10];
|
||||
|
||||
if( x_min < src_cols && y_min < src_rows )
|
||||
if( x_min < src_cols && y0*TILE_SIZE < src_rows )
|
||||
{
|
||||
int x_max = min(src_cols - x_min, TILE_SIZE);
|
||||
int y_max = min(src_rows - y_min, TILE_SIZE);
|
||||
int m00=0, m10=0, m01=0, m20=0, m11=0, m02=0, m30=0, m21=0, m12=0, m03=0;
|
||||
__global const uchar* ptr = src + src_offset + y_min*src_step + x_min;
|
||||
__global int* mom = mom0 + (xtiles*y + x)*10;
|
||||
x = x_max & -4;
|
||||
|
||||
for( y = 0; y < y_max; y++, ptr += src_step )
|
||||
if( ypix < src_rows )
|
||||
{
|
||||
int x_max = min(src_cols - x_min, TILE_SIZE);
|
||||
__global const uchar* ptr = src + src_offset + ypix*src_step + x_min;
|
||||
int4 S = (int4)(0,0,0,0), p;
|
||||
|
||||
#define SUM_ELEM(elem, ofs) \
|
||||
(int4)(1, (ofs), ((ofs)*(ofs)), ((ofs)*(ofs)*(ofs)))*elem
|
||||
(int4)(1, (ofs), (ofs)*(ofs), (ofs)*(ofs)*(ofs))*elem
|
||||
|
||||
x = x_max & -4;
|
||||
if( x_max >= 4 )
|
||||
{
|
||||
p = convert_int4(vload4(0, ptr));
|
||||
@ -51,6 +50,30 @@ __kernel void moments(__global const uchar* src, int src_step, int src_offset,
|
||||
}
|
||||
}
|
||||
|
||||
if( x_max >= 20 )
|
||||
{
|
||||
p = convert_int4(vload4(0, ptr+16));
|
||||
S += SUM_ELEM(p.s0, 16) + SUM_ELEM(p.s1, 17) + SUM_ELEM(p.s2, 18) + SUM_ELEM(p.s3, 19);
|
||||
|
||||
if( x_max >= 24 )
|
||||
{
|
||||
p = convert_int4(vload4(0, ptr+20));
|
||||
S += SUM_ELEM(p.s0, 20) + SUM_ELEM(p.s1, 21) + SUM_ELEM(p.s2, 22) + SUM_ELEM(p.s3, 23);
|
||||
|
||||
if( x_max >= 28 )
|
||||
{
|
||||
p = convert_int4(vload4(0, ptr+24));
|
||||
S += SUM_ELEM(p.s0, 24) + SUM_ELEM(p.s1, 25) + SUM_ELEM(p.s2, 26) + SUM_ELEM(p.s3, 27);
|
||||
|
||||
if( x_max >= 32 )
|
||||
{
|
||||
p = convert_int4(vload4(0, ptr+28));
|
||||
S += SUM_ELEM(p.s0, 28) + SUM_ELEM(p.s1, 29) + SUM_ELEM(p.s2, 30) + SUM_ELEM(p.s3, 31);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if( x < x_max )
|
||||
{
|
||||
int ps = ptr[x];
|
||||
@ -68,27 +91,57 @@ __kernel void moments(__global const uchar* src, int src_step, int src_offset,
|
||||
}
|
||||
|
||||
int sy = y*y;
|
||||
m00 += S.s0;
|
||||
m10 += S.s1;
|
||||
m01 += y*S.s0;
|
||||
m20 += S.s2;
|
||||
m11 += y*S.s1;
|
||||
m02 += sy*S.s0;
|
||||
m30 += S.s3;
|
||||
m21 += y*S.s2;
|
||||
m12 += sy*S.s1;
|
||||
m03 += y*sy*S.s0;
|
||||
}
|
||||
|
||||
mom[0] = m00;
|
||||
mom[1] = m10;
|
||||
mom[2] = m01;
|
||||
mom[3] = m20;
|
||||
mom[4] = m11;
|
||||
mom[5] = m02;
|
||||
mom[6] = m30;
|
||||
mom[7] = m21;
|
||||
mom[8] = m12;
|
||||
mom[9] = m03;
|
||||
mom[y][0] = S.s0;
|
||||
mom[y][1] = S.s1;
|
||||
mom[y][2] = y*S.s0;
|
||||
mom[y][3] = S.s2;
|
||||
mom[y][4] = y*S.s1;
|
||||
mom[y][5] = sy*S.s0;
|
||||
mom[y][6] = S.s3;
|
||||
mom[y][7] = y*S.s2;
|
||||
mom[y][8] = sy*S.s1;
|
||||
mom[y][9] = y*sy*S.s0;
|
||||
}
|
||||
else
|
||||
mom[y][0] = mom[y][1] = mom[y][2] = mom[y][3] = mom[y][4] =
|
||||
mom[y][5] = mom[y][6] = mom[y][7] = mom[y][8] = mom[y][9] = 0;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
#define REDUCE(d) \
|
||||
if( y < d ) \
|
||||
{ \
|
||||
mom[y][0] += mom[y+d][0]; \
|
||||
mom[y][1] += mom[y+d][1]; \
|
||||
mom[y][2] += mom[y+d][2]; \
|
||||
mom[y][3] += mom[y+d][3]; \
|
||||
mom[y][4] += mom[y+d][4]; \
|
||||
mom[y][5] += mom[y+d][5]; \
|
||||
mom[y][6] += mom[y+d][6]; \
|
||||
mom[y][7] += mom[y+d][7]; \
|
||||
mom[y][8] += mom[y+d][8]; \
|
||||
mom[y][9] += mom[y+d][9]; \
|
||||
} \
|
||||
barrier(CLK_LOCAL_MEM_FENCE)
|
||||
|
||||
REDUCE(16);
|
||||
REDUCE(8);
|
||||
REDUCE(4);
|
||||
REDUCE(2);
|
||||
|
||||
if( y == 0 )
|
||||
{
|
||||
__global int* momout = mom0 + (y0*xtiles + x0)*10;
|
||||
momout[0] = mom[0][0] + mom[1][0];
|
||||
momout[1] = mom[0][1] + mom[1][1];
|
||||
momout[2] = mom[0][2] + mom[1][2];
|
||||
momout[3] = mom[0][3] + mom[1][3];
|
||||
momout[4] = mom[0][4] + mom[1][4];
|
||||
momout[5] = mom[0][5] + mom[1][5];
|
||||
momout[6] = mom[0][6] + mom[1][6];
|
||||
momout[7] = mom[0][7] + mom[1][7];
|
||||
momout[8] = mom[0][8] + mom[1][8];
|
||||
momout[9] = mom[0][9] + mom[1][9];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user