Use fma only for Intel devices

This commit is contained in:
Alexander Karsakov 2014-06-23 10:18:11 +04:00
parent eeddda4701
commit d5c99a07b6
2 changed files with 69 additions and 58 deletions

View File

@ -80,7 +80,13 @@
#define SRC(_x,_y) convertToFT(loadpix(srcData + mad24(_y, src_step, PIXSIZE * _x))) #define SRC(_x,_y) convertToFT(loadpix(srcData + mad24(_y, src_step, PIXSIZE * _x)))
#if kercn == 4 #if kercn == 4
#define SRC4(_x,_y) convert_float4(*(__global const uchar4*)(srcData + mad24(_y, src_step, PIXSIZE * _x))) #define SRC4(_x,_y) convert_float4(vload4(0, srcData + mad24(_y, src_step, PIXSIZE * _x)))
#endif
#ifdef INTEL_DEVICE
#define MAD(x,y,z) fma((x),(y),(z))
#else
#define MAD(x,y,z) mad((x),(y),(z))
#endif #endif
#define noconvert #define noconvert
@ -109,10 +115,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
col = EXTRAPOLATE(x, src_cols); col = EXTRAPOLATE(x, src_cols);
sum = co3* SRC(col, src_y - 2); sum = co3* SRC(col, src_y - 2);
sum = fma(co2, SRC(col, src_y - 1), sum); sum = MAD(co2, SRC(col, src_y - 1), sum);
sum = fma(co1, SRC(col, src_y ), sum); sum = MAD(co1, SRC(col, src_y ), sum);
sum = fma(co2, SRC(col, src_y + 1), sum); sum = MAD(co2, SRC(col, src_y + 1), sum);
sum = fma(co3, SRC(col, src_y + 2), sum); sum = MAD(co3, SRC(col, src_y + 2), sum);
smem[2 + get_local_id(0)] = sum; smem[2 + get_local_id(0)] = sum;
#else #else
@ -120,10 +126,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
{ {
float4 sum4; float4 sum4;
sum4 = co3* SRC4(x, src_y - 2); sum4 = co3* SRC4(x, src_y - 2);
sum4 = fma(co2, SRC4(x, src_y - 1), sum4); sum4 = MAD(co2, SRC4(x, src_y - 1), sum4);
sum4 = fma(co1, SRC4(x, src_y ), sum4); sum4 = MAD(co1, SRC4(x, src_y ), sum4);
sum4 = fma(co2, SRC4(x, src_y + 1), sum4); sum4 = MAD(co2, SRC4(x, src_y + 1), sum4);
sum4 = fma(co3, SRC4(x, src_y + 2), sum4); sum4 = MAD(co3, SRC4(x, src_y + 2), sum4);
vstore4(sum4, get_local_id(0), (__local float*) &smem[2]); vstore4(sum4, get_local_id(0), (__local float*) &smem[2]);
} }
@ -133,10 +139,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
{ {
col = EXTRAPOLATE(x+i, src_cols); col = EXTRAPOLATE(x+i, src_cols);
sum = co3* SRC(col, src_y - 2); sum = co3* SRC(col, src_y - 2);
sum = fma(co2, SRC(col, src_y - 1), sum); sum = MAD(co2, SRC(col, src_y - 1), sum);
sum = fma(co1, SRC(col, src_y ), sum); sum = MAD(co1, SRC(col, src_y ), sum);
sum = fma(co2, SRC(col, src_y + 1), sum); sum = MAD(co2, SRC(col, src_y + 1), sum);
sum = fma(co3, SRC(col, src_y + 2), sum); sum = MAD(co3, SRC(col, src_y + 2), sum);
smem[2 + 4*get_local_id(0)+i] = sum; smem[2 + 4*get_local_id(0)+i] = sum;
} }
@ -147,10 +153,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols); col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
sum = co3* SRC(col, src_y - 2); sum = co3* SRC(col, src_y - 2);
sum = fma(co2, SRC(col, src_y - 1), sum); sum = MAD(co2, SRC(col, src_y - 1), sum);
sum = fma(co1, SRC(col, src_y ), sum); sum = MAD(co1, SRC(col, src_y ), sum);
sum = fma(co2, SRC(col, src_y + 1), sum); sum = MAD(co2, SRC(col, src_y + 1), sum);
sum = fma(co3, SRC(col, src_y + 2), sum); sum = MAD(co3, SRC(col, src_y + 2), sum);
smem[get_local_id(0)] = sum; smem[get_local_id(0)] = sum;
} }
@ -160,10 +166,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols); col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
sum = co3* SRC(col, src_y - 2); sum = co3* SRC(col, src_y - 2);
sum = fma(co2, SRC(col, src_y - 1), sum); sum = MAD(co2, SRC(col, src_y - 1), sum);
sum = fma(co1, SRC(col, src_y ), sum); sum = MAD(co1, SRC(col, src_y ), sum);
sum = fma(co2, SRC(col, src_y + 1), sum); sum = MAD(co2, SRC(col, src_y + 1), sum);
sum = fma(co3, SRC(col, src_y + 2), sum); sum = MAD(co3, SRC(col, src_y + 2), sum);
smem[LOCAL_SIZE + get_local_id(0)] = sum; smem[LOCAL_SIZE + get_local_id(0)] = sum;
} }
@ -174,10 +180,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
col = EXTRAPOLATE(x, src_cols); col = EXTRAPOLATE(x, src_cols);
sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows)); sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
sum = fma(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum); sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum);
sum = fma(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum); sum = MAD(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum);
sum = fma(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum); sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum);
sum = fma(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum); sum = MAD(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum);
smem[2 + get_local_id(0)] = sum; smem[2 + get_local_id(0)] = sum;
#else #else
@ -185,10 +191,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
{ {
float4 sum4; float4 sum4;
sum4 = co3* SRC4(x, EXTRAPOLATE(src_y - 2, src_rows)); sum4 = co3* SRC4(x, EXTRAPOLATE(src_y - 2, src_rows));
sum4 = fma(co2, SRC4(x, EXTRAPOLATE(src_y - 1, src_rows)), sum4); sum4 = MAD(co2, SRC4(x, EXTRAPOLATE(src_y - 1, src_rows)), sum4);
sum4 = fma(co1, SRC4(x, EXTRAPOLATE(src_y , src_rows)), sum4); sum4 = MAD(co1, SRC4(x, EXTRAPOLATE(src_y , src_rows)), sum4);
sum4 = fma(co2, SRC4(x, EXTRAPOLATE(src_y + 1, src_rows)), sum4); sum4 = MAD(co2, SRC4(x, EXTRAPOLATE(src_y + 1, src_rows)), sum4);
sum4 = fma(co3, SRC4(x, EXTRAPOLATE(src_y + 2, src_rows)), sum4); sum4 = MAD(co3, SRC4(x, EXTRAPOLATE(src_y + 2, src_rows)), sum4);
vstore4(sum4, get_local_id(0), (__local float*) &smem[2]); vstore4(sum4, get_local_id(0), (__local float*) &smem[2]);
} }
@ -198,10 +204,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
{ {
col = EXTRAPOLATE(x+i, src_cols); col = EXTRAPOLATE(x+i, src_cols);
sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows)); sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
sum = fma(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum); sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum);
sum = fma(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum); sum = MAD(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum);
sum = fma(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum); sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum);
sum = fma(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum); sum = MAD(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum);
smem[2 + 4*get_local_id(0)+i] = sum; smem[2 + 4*get_local_id(0)+i] = sum;
} }
@ -212,10 +218,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols); col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows)); sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
sum = fma(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum); sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum);
sum = fma(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum); sum = MAD(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum);
sum = fma(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum); sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum);
sum = fma(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum); sum = MAD(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum);
smem[get_local_id(0)] = sum; smem[get_local_id(0)] = sum;
} }
@ -225,10 +231,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols); col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows)); sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
sum = fma(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum); sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum);
sum = fma(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum); sum = MAD(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum);
sum = fma(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum); sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum);
sum = fma(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum); sum = MAD(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum);
smem[LOCAL_SIZE + get_local_id(0)] = sum; smem[LOCAL_SIZE + get_local_id(0)] = sum;
} }
@ -241,15 +247,20 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
{ {
const int tid2 = get_local_id(0) * 2; const int tid2 = get_local_id(0) * 2;
sum = co3* smem[2 + tid2 + 2]; sum = 0.f;
#if cn == 1 #if cn == 1
#if fdepth <= 5
sum = sum + dot(vload4(0, (__local float*) (&smem)+tid2), (float4)(co3, co2, co1, co2)); sum = sum + dot(vload4(0, (__local float*) (&smem)+tid2), (float4)(co3, co2, co1, co2));
#else #else
sum = fma(co3, smem[2 + tid2 - 2], sum); sum = sum + dot(vload4(0, (__local double*) (&smem)+tid2), (double4)(co3, co2, co1, co2));
sum = fma(co2, smem[2 + tid2 - 1], sum);
sum = fma(co1, smem[2 + tid2 ], sum);
sum = fma(co2, smem[2 + tid2 + 1], sum);
#endif #endif
#else
sum = MAD(co3, smem[2 + tid2 - 2], sum);
sum = MAD(co2, smem[2 + tid2 - 1], sum);
sum = MAD(co1, smem[2 + tid2 ], sum);
sum = MAD(co2, smem[2 + tid2 + 1], sum);
#endif
sum = MAD(co3, smem[2 + tid2 + 2], sum);
const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
@ -260,10 +271,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
int tid4 = get_local_id(0) * 4; int tid4 = get_local_id(0) * 4;
sum = co3* smem[2 + tid4 + 2]; sum = co3* smem[2 + tid4 + 2];
sum = fma(co3, smem[2 + tid4 - 2], sum); sum = MAD(co3, smem[2 + tid4 - 2], sum);
sum = fma(co2, smem[2 + tid4 - 1], sum); sum = MAD(co2, smem[2 + tid4 - 1], sum);
sum = fma(co1, smem[2 + tid4 ], sum); sum = MAD(co1, smem[2 + tid4 ], sum);
sum = fma(co2, smem[2 + tid4 + 1], sum); sum = MAD(co2, smem[2 + tid4 + 1], sum);
int dst_x = (get_group_id(0) * LOCAL_SIZE + tid4) / 2; int dst_x = (get_group_id(0) * LOCAL_SIZE + tid4) / 2;
@ -274,10 +285,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
dst_x += 1; dst_x += 1;
sum = co3* smem[2 + tid4 + 2]; sum = co3* smem[2 + tid4 + 2];
sum = fma(co3, smem[2 + tid4 - 2], sum); sum = MAD(co3, smem[2 + tid4 - 2], sum);
sum = fma(co2, smem[2 + tid4 - 1], sum); sum = MAD(co2, smem[2 + tid4 - 1], sum);
sum = fma(co1, smem[2 + tid4 ], sum); sum = MAD(co1, smem[2 + tid4 ], sum);
sum = fma(co2, smem[2 + tid4 + 1], sum); sum = MAD(co2, smem[2 + tid4 + 1], sum);
if (dst_x < dst_cols) if (dst_x < dst_cols)
storepix(convertToT(sum), dstData + mad24(y, dst_step, dst_x * PIXSIZE)); storepix(convertToT(sum), dstData + mad24(y, dst_step, dst_x * PIXSIZE));

View File

@ -424,19 +424,19 @@ static bool ocl_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, in
int float_depth = depth == CV_64F ? CV_64F : CV_32F; int float_depth = depth == CV_64F ? CV_64F : CV_32F;
const int local_size = 256; const int local_size = 256;
int kercn = 1; int kercn = 1;
if (depth == CV_8U && cn == 1 && float_depth == CV_32F) if (depth == CV_8U && float_depth == CV_32F && cn == 1 && ocl::Device::getDefault().isIntel())
kercn = 4; kercn = 4;
const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
"BORDER_REFLECT_101" }; "BORDER_REFLECT_101" };
char cvt[2][50]; char cvt[2][50];
String buildOptions = format( String buildOptions = format(
"-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s " "-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s "
"-D T1=%s -D cn=%d -D kercn=%d -D %s -D LOCAL_SIZE=%d", "-D T1=%s -D cn=%d -D kercn=%d -D fdepth=%d -D %s -D LOCAL_SIZE=%d",
ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, cn)), ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, cn)),
ocl::convertTypeStr(float_depth, depth, cn, cvt[0]), ocl::convertTypeStr(float_depth, depth, cn, cvt[0]),
ocl::convertTypeStr(depth, float_depth, cn, cvt[1]), ocl::convertTypeStr(depth, float_depth, cn, cvt[1]),
doubleSupport ? " -D DOUBLE_SUPPORT" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "", ocl::typeToStr(depth),
ocl::typeToStr(depth), cn, kercn, borderMap[borderType], local_size cn, kercn, float_depth, borderMap[borderType], local_size
); );
ocl::Kernel k("pyrDown", ocl::imgproc::pyr_down_oclsrc, buildOptions); ocl::Kernel k("pyrDown", ocl::imgproc::pyr_down_oclsrc, buildOptions);
if (k.empty()) if (k.empty())