ocl::blendLinear: blend four elements per work-item

The BlendLinear_* kernels now load and store uchar4/float4 vectors, so the
host launches a quarter as many work-items along x, in 256x1 work-groups.

The x extent is rounded up, (cols * channels + 3) / 4, instead of truncated.
The single-channel kernels guard with "idx << 2 < cols", which only has an
effect when a work-item exists for a partial trailing vector; with truncation
the last 1-3 elements of a row whose width is not a multiple of 4 could be
left unblended. For 4-channel input cols * channels is always a multiple of 4,
so the work-item count is unchanged there.

Review notes (to confirm against the full sources):
"istep >> 2" and "wstep >> 2" presume the row steps are multiples of 4
elements, and the trailing vector of a single-channel row touches up to 3
elements past cols, which presumes the row padding covers them.
Whitespace of context and removed lines was reconstructed, and the index
lines keep the blob ids of the original patch; run "git apply --check".

diff --git a/modules/ocl/src/blend.cpp b/modules/ocl/src/blend.cpp
index 40db57e869..75463b807f 100644
--- a/modules/ocl/src/blend.cpp
+++ b/modules/ocl/src/blend.cpp
@@ -77,8 +77,10 @@ void cv::ocl::blendLinear(const oclMat &img1, const oclMat &img2, const oclMat &
     int cols = img1.cols;
     int istep = img1.step1();
     int wstep = weights1.step1();
-    size_t globalSize[] = {cols * channels, rows, 1};
-    size_t localSize[] = {16, 16, 1};
+    // Each work-item handles 4 elements along x; round the count up so the last
+    // 1-3 elements of a row still get a work-item (the kernels guard the tail).
+    size_t globalSize[] = {(cols * channels + 3) / 4, rows, 1};
+    size_t localSize[] = {256, 1, 1};
 
     vector< pair<size_t, const void *> > args;
 
diff --git a/modules/ocl/src/kernels/blend_linear.cl b/modules/ocl/src/kernels/blend_linear.cl
index 3baaaa8f8d..06bde2f5c1 100644
--- a/modules/ocl/src/kernels/blend_linear.cl
+++ b/modules/ocl/src/kernels/blend_linear.cl
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Liu Liujun, liujun@multicorewareinc.com
+//    Liu Liujun, liujun@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -43,11 +43,11 @@
 //
 //M*/
 __kernel void BlendLinear_C1_D0(
-    __global uchar *dst,
-    __global uchar *img1,
-    __global uchar *img2,
-    __global float *weight1,
-    __global float *weight2,
+    __global uchar4 *dst,
+    __global uchar4 *img1,
+    __global uchar4 *img2,
+    __global float4 *weight1,
+    __global float4 *weight2,
     int rows,
     int cols,
     int istep,
@@ -56,47 +56,21 @@ __kernel void BlendLinear_C1_D0(
 {
     int idx = get_global_id(0);
     int idy = get_global_id(1);
-    if (idx < cols && idy < rows)
+    // idx counts uchar4 vectors, so idx << 2 is the column of the first element blended here
+    if (idx << 2 < cols && idy < rows)
     {
-        int pos = mad24(idy,istep,idx);
-        int wpos = mad24(idy,wstep,idx);
-        float w1 = weight1[wpos];
-        float w2 = weight2[wpos];
-        dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
-
+        int pos = mad24(idy,istep >> 2,idx);
+        int wpos = mad24(idy,wstep >> 2,idx);
+        float4 w1 = weight1[wpos], w2 = weight2[wpos];
+        dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
+            convert_float4(img2[pos]) * w2) / (w1 + w2 + 1e-5f));
     }
 }
 
 __kernel void BlendLinear_C4_D0(
-    __global uchar *dst,
-    __global uchar *img1,
-    __global uchar *img2,
-    __global float *weight1,
-    __global float *weight2,
-    int rows,
-    int cols,
-    int istep,
-    int wstep
-    )
-{
-    int idx = get_global_id(0);
-    int idy = get_global_id(1);
-    int x = idx / 4;
-    int y = idy;
-    if (x < cols && y < rows)
-    {
-        int pos = mad24(idy,istep,idx);
-        int wpos = mad24(idy,wstep,x);
-        float w1 = weight1[wpos];
-        float w2 = weight2[wpos];
-        dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
-    }
-}
-
-__kernel void BlendLinear_C1_D5(
-    __global float *dst,
-    __global float *img1,
-    __global float *img2,
+    __global uchar4 *dst,
+    __global uchar4 *img1,
+    __global uchar4 *img2,
     __global float *weight1,
     __global float *weight2,
     int rows,
@@ -109,18 +83,45 @@ __kernel void BlendLinear_C1_D5(
     int idy = get_global_id(1);
     if (idx < cols && idy < rows)
     {
-        int pos = mad24(idy,istep,idx);
-        int wpos = mad24(idy,wstep,idx);
+        // image rows are indexed in uchar4 units (istep >> 2); the weights stay scalar, one per idx
+        int pos = mad24(idy,istep >> 2,idx);
+        int wpos = mad24(idy,wstep, idx);
         float w1 = weight1[wpos];
         float w2 = weight2[wpos];
+        dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
+            convert_float4(img2[pos]) * w2) / (w1 + w2 + 1e-5f));
+    }
+}
+
+
+__kernel void BlendLinear_C1_D5(
+    __global float4 *dst,
+    __global float4 *img1,
+    __global float4 *img2,
+    __global float4 *weight1,
+    __global float4 *weight2,
+    int rows,
+    int cols,
+    int istep,
+    int wstep
+    )
+{
+    int idx = get_global_id(0);
+    int idy = get_global_id(1);
+    // idx counts float4 vectors, so idx << 2 is the column of the first float blended here
+    if (idx << 2 < cols && idy < rows)
+    {
+        int pos = mad24(idy,istep >> 2,idx);
+        int wpos = mad24(idy,wstep >> 2,idx);
+        float4 w1 = weight1[wpos], w2 = weight2[wpos];
         dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
     }
 }
 
 __kernel void BlendLinear_C4_D5(
-    __global float *dst,
-    __global float *img1,
-    __global float *img2,
+    __global float4 *dst,
+    __global float4 *img1,
+    __global float4 *img2,
     __global float *weight1,
     __global float *weight2,
     int rows,
@@ -131,12 +132,11 @@ __kernel void BlendLinear_C4_D5(
 {
     int idx = get_global_id(0);
     int idy = get_global_id(1);
-    int x = idx / 4;
-    int y = idy;
-    if (x < cols && y < rows)
+    // one work-item per float4 element; the scalar weights are indexed by the same idx
+    if (idx < cols && idy < rows)
     {
-        int pos = mad24(idy,istep,idx);
-        int wpos = mad24(idy,wstep,x);
+        int pos = mad24(idy,istep >> 2,idx);
+        int wpos = mad24(idy,wstep, idx);
         float w1 = weight1[wpos];
         float w2 = weight2[wpos];
         dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);