mirror of
https://github.com/opencv/opencv.git
synced 2025-07-23 13:09:37 +08:00
5x5 gaussian blur optimization
Add new 5x5 gaussian blur kernel for CV_8UC1 format, it is 50% ~ 70% faster than current ocl kernel in the perf test. Signed-off-by: Li Peng <peng.li@intel.com>
This commit is contained in:
parent
c48d7f86ba
commit
396921dd23
198
modules/imgproc/src/opencl/gaussianBlur5x5.cl
Normal file
198
modules/imgproc/src/opencl/gaussianBlur5x5.cl
Normal file
@ -0,0 +1,198 @@
|
|||||||
|
// This file is part of OpenCV project.
|
||||||
|
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||||
|
// of this distribution and at http://opencv.org/license.html.
|
||||||
|
|
||||||
|
#define DIG(a) a,
|
||||||
|
__constant float kx[] = { KERNEL_MATRIX_X };
|
||||||
|
__constant float ky[] = { KERNEL_MATRIX_Y };
|
||||||
|
|
||||||
|
#define OP(y, x) (convert_float4(arr[y * 5 + x]) * ky[y] * kx[x])
|
||||||
|
|
||||||
|
#define FILL_ARR(s1, s2, n, e1, e2) \
|
||||||
|
arr[5 * n + 0] = row_s ? (uchar4)(s1, s2, line[n].s23) : (uchar4)(line[n].s0123); \
|
||||||
|
arr[5 * n + 1] = row_s ? (uchar4)(s2, line[n].s234) : (uchar4)(line[n].s1234); \
|
||||||
|
arr[5 * n + 2] = (uchar4)(line[n].s2345); \
|
||||||
|
arr[5 * n + 3] = row_e ? (uchar4)(line[n].s345, e1) : (uchar4)(line[n].s3456); \
|
||||||
|
arr[5 * n + 4] = row_e ? (uchar4)(line[n].s45, e1, e2) : (uchar4)(line[n].s4567);
|
||||||
|
|
||||||
|
__kernel void gaussianBlur5x5_8UC1_cols4(__global const uchar* src, int src_step,
|
||||||
|
__global uint* dst, int dst_step, int rows, int cols)
|
||||||
|
{
|
||||||
|
int x = get_global_id(0) * 4;
|
||||||
|
int y = get_global_id(1);
|
||||||
|
|
||||||
|
if (x >= cols || y >= rows) return;
|
||||||
|
|
||||||
|
uchar8 line[5];
|
||||||
|
int offset, src_index;
|
||||||
|
|
||||||
|
src_index = x + (y - 2) * src_step - 2;
|
||||||
|
offset = max(0, src_index + 2 * src_step);
|
||||||
|
line[2] = vload8(0, src + offset);
|
||||||
|
if (offset == 0) line[2] = (uchar8)(0, 0, line[2].s0123, line[2].s45);
|
||||||
|
|
||||||
|
#if defined BORDER_CONSTANT || defined BORDER_REPLICATE
|
||||||
|
uchar8 tmp;
|
||||||
|
#ifdef BORDER_CONSTANT
|
||||||
|
tmp = (uchar8)0;
|
||||||
|
#elif defined BORDER_REPLICATE
|
||||||
|
tmp = line[2];
|
||||||
|
#endif
|
||||||
|
line[0] = line[1] = tmp;
|
||||||
|
if (y > 1)
|
||||||
|
{
|
||||||
|
offset = max(0, src_index);
|
||||||
|
line[0] = vload8(0, src + offset);
|
||||||
|
if (offset == 0) line[0] = (uchar8)(0, 0, line[0].s0123, line[0].s45);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (y > 0)
|
||||||
|
{
|
||||||
|
offset = max(0, src_index + src_step);
|
||||||
|
line[1] = vload8(0, src + offset);
|
||||||
|
if (offset == 0) line[1] = (uchar8)(0, 0, line[1].s0123, line[1].s45);
|
||||||
|
}
|
||||||
|
|
||||||
|
line[3] = (y == (rows - 1)) ? tmp : vload8(0, src + src_index + 3 * src_step);
|
||||||
|
line[4] = (y >= (rows - 2)) ? tmp : vload8(0, src + src_index + 4 * src_step);
|
||||||
|
#elif BORDER_REFLECT
|
||||||
|
int t;
|
||||||
|
t = (y <= 1) ? (abs(y - 1) - y + 2) : 0;
|
||||||
|
offset = max(0, src_index + t * src_step);
|
||||||
|
line[0] = vload8(0, src + offset);
|
||||||
|
if (offset == 0) line[0] = (uchar8)(0, 0, line[0].s0123, line[0].s45);
|
||||||
|
|
||||||
|
if (y == 0)
|
||||||
|
line[1] = line[2];
|
||||||
|
else
|
||||||
|
{
|
||||||
|
offset = max(0, src_index + 1 * src_step);
|
||||||
|
line[1] = vload8(0, src + offset);
|
||||||
|
if (offset == 0) line[1] = (uchar8)(0, 0, line[1].s0123, line[0].s45);
|
||||||
|
}
|
||||||
|
|
||||||
|
line[3] = (y == (rows - 1)) ? line[2] : vload8(0, src + src_index + 3 * src_step);
|
||||||
|
|
||||||
|
t = (y >= (rows - 2)) ? (abs(y - (rows - 1)) - (y - (rows - 2)) + 2) : 4;
|
||||||
|
line[4] = vload8(0, src + src_index + t * src_step);
|
||||||
|
#elif BORDER_REFLECT_101
|
||||||
|
if (y == 1)
|
||||||
|
line[0] = line[2];
|
||||||
|
else
|
||||||
|
{
|
||||||
|
offset = (y == 0) ? (src_index + 4 * src_step) : max(0, src_index);
|
||||||
|
line[0] = vload8(0, src + offset);
|
||||||
|
if (offset == 0) line[0] = (uchar8)(0, 0, line[0].s0123, line[0].s45);
|
||||||
|
}
|
||||||
|
|
||||||
|
offset = (y == 0) ? (src_index + 3 * src_step) : max(0, src_index + 1 * src_step);
|
||||||
|
line[1] = vload8(0, src + offset);
|
||||||
|
if (offset == 0) line[1] = (uchar8)(0, 0, line[1].s0123, line[1].s45);
|
||||||
|
|
||||||
|
line[3] = vload8(0, src + src_index + ((y == (rows - 1)) ? 1 : 3) * src_step);
|
||||||
|
if (y == (rows - 2))
|
||||||
|
line[4] = line[2];
|
||||||
|
else
|
||||||
|
{
|
||||||
|
line[4] = vload8(0, src + src_index + ((y == (rows - 1)) ? 1 : 4) * src_step);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
bool row_s = (x == 0);
|
||||||
|
bool row_e = ((x + 4) == cols);
|
||||||
|
uchar4 arr[25];
|
||||||
|
uchar s, e;
|
||||||
|
|
||||||
|
#ifdef BORDER_CONSTANT
|
||||||
|
s = e = 0;
|
||||||
|
|
||||||
|
FILL_ARR(s, s, 0, e, e);
|
||||||
|
FILL_ARR(s, s, 1, e, e);
|
||||||
|
FILL_ARR(s, s, 2, e, e);
|
||||||
|
FILL_ARR(s, s, 3, e, e);
|
||||||
|
FILL_ARR(s, s, 4, e, e);
|
||||||
|
#elif defined BORDER_REPLICATE
|
||||||
|
s = line[0].s2;
|
||||||
|
e = line[0].s5;
|
||||||
|
FILL_ARR(s, s, 0, e, e);
|
||||||
|
|
||||||
|
s = line[1].s2;
|
||||||
|
e = line[1].s5;
|
||||||
|
FILL_ARR(s, s, 1, e, e);
|
||||||
|
|
||||||
|
s = line[2].s2;
|
||||||
|
e = line[2].s5;
|
||||||
|
FILL_ARR(s, s, 2, e, e);
|
||||||
|
|
||||||
|
s = line[3].s2;
|
||||||
|
e = line[3].s5;
|
||||||
|
FILL_ARR(s, s, 3, e, e);
|
||||||
|
|
||||||
|
s = line[4].s2;
|
||||||
|
e = line[4].s5;
|
||||||
|
FILL_ARR(s, s, 4, e, e);
|
||||||
|
#elif BORDER_REFLECT
|
||||||
|
uchar s1, s2;
|
||||||
|
uchar e1, e2;
|
||||||
|
|
||||||
|
s1 = line[0].s3;
|
||||||
|
s2 = line[0].s2;
|
||||||
|
e1 = line[0].s5;
|
||||||
|
e2 = line[0].s4;
|
||||||
|
FILL_ARR(s1, s2, 0, e1, e2);
|
||||||
|
|
||||||
|
s1 = line[1].s3;
|
||||||
|
s2 = line[1].s2;
|
||||||
|
e1 = line[1].s5;
|
||||||
|
e2 = line[1].s4;
|
||||||
|
FILL_ARR(s1, s2, 1, e1, e2);
|
||||||
|
|
||||||
|
s1 = line[2].s3;
|
||||||
|
s2 = line[2].s2;
|
||||||
|
e1 = line[2].s5;
|
||||||
|
e2 = line[2].s4;
|
||||||
|
FILL_ARR(s1, s2, 2, e1, e2);
|
||||||
|
|
||||||
|
s1 = line[3].s3;
|
||||||
|
s2 = line[3].s2;
|
||||||
|
e1 = line[3].s5;
|
||||||
|
e2 = line[3].s4;
|
||||||
|
FILL_ARR(s1, s2, 3, e1, e2);
|
||||||
|
|
||||||
|
s1 = line[4].s3;
|
||||||
|
s2 = line[4].s2;
|
||||||
|
e1 = line[4].s5;
|
||||||
|
e2 = line[4].s4;
|
||||||
|
FILL_ARR(s1, s2, 4, e1, e2);
|
||||||
|
#elif BORDER_REFLECT_101
|
||||||
|
s = line[0].s4;
|
||||||
|
e = line[0].s3;
|
||||||
|
FILL_ARR(s, e, 0, s, e);
|
||||||
|
|
||||||
|
s = line[1].s4;
|
||||||
|
e = line[1].s3;
|
||||||
|
FILL_ARR(s, e, 1, s, e);
|
||||||
|
|
||||||
|
s = line[2].s4;
|
||||||
|
e = line[2].s3;
|
||||||
|
FILL_ARR(s, e, 2, s, e);
|
||||||
|
|
||||||
|
s = line[3].s4;
|
||||||
|
e = line[3].s3;
|
||||||
|
FILL_ARR(s, e, 3, s, e);
|
||||||
|
|
||||||
|
s = line[4].s4;
|
||||||
|
e = line[4].s3;
|
||||||
|
FILL_ARR(s, e, 4, s, e);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
float4 sum;
|
||||||
|
sum = OP(0, 0) + OP(0, 1) + OP(0, 2) + OP(0, 3) + OP(0, 4) +
|
||||||
|
OP(1, 0) + OP(1, 1) + OP(1, 2) + OP(1, 3) + OP(1, 4) +
|
||||||
|
OP(2, 0) + OP(2, 1) + OP(2, 2) + OP(2, 3) + OP(2, 4) +
|
||||||
|
OP(3, 0) + OP(3, 1) + OP(3, 2) + OP(3, 3) + OP(3, 4) +
|
||||||
|
OP(4, 0) + OP(4, 1) + OP(4, 2) + OP(4, 3) + OP(4, 4);
|
||||||
|
|
||||||
|
int dst_index = (x / 4) + y * (dst_step / 4);
|
||||||
|
dst[dst_index] = as_uint(convert_uchar4_sat_rte(sum));
|
||||||
|
}
|
@ -2135,15 +2135,16 @@ namespace cv
|
|||||||
{
|
{
|
||||||
#ifdef HAVE_OPENCL
|
#ifdef HAVE_OPENCL
|
||||||
|
|
||||||
static bool ocl_GaussianBlur3x3_8UC1(InputArray _src, OutputArray _dst, int ddepth,
|
static bool ocl_GaussianBlur_8UC1(InputArray _src, OutputArray _dst, Size ksize, int ddepth,
|
||||||
InputArray _kernelX, InputArray _kernelY, int borderType)
|
InputArray _kernelX, InputArray _kernelY, int borderType)
|
||||||
{
|
{
|
||||||
const ocl::Device & dev = ocl::Device::getDefault();
|
const ocl::Device & dev = ocl::Device::getDefault();
|
||||||
int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
|
int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
|
||||||
|
|
||||||
if ( !(dev.isIntel() && (type == CV_8UC1) &&
|
if ( !(dev.isIntel() && (type == CV_8UC1) &&
|
||||||
(_src.offset() == 0) && (_src.step() % 4 == 0) &&
|
(_src.offset() == 0) && (_src.step() % 4 == 0) &&
|
||||||
(_src.cols() % 16 == 0) && (_src.rows() % 2 == 0)) )
|
((ksize.width == 5 && (_src.cols() % 4 == 0)) ||
|
||||||
|
(ksize.width == 3 && (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0)))) )
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
Mat kernelX = _kernelX.getMat().reshape(1, 1);
|
Mat kernelX = _kernelX.getMat().reshape(1, 1);
|
||||||
@ -2160,8 +2161,16 @@ static bool ocl_GaussianBlur3x3_8UC1(InputArray _src, OutputArray _dst, int ddep
|
|||||||
size_t globalsize[2] = { 0, 0 };
|
size_t globalsize[2] = { 0, 0 };
|
||||||
size_t localsize[2] = { 0, 0 };
|
size_t localsize[2] = { 0, 0 };
|
||||||
|
|
||||||
globalsize[0] = size.width / 16;
|
if (ksize.width == 3)
|
||||||
globalsize[1] = size.height / 2;
|
{
|
||||||
|
globalsize[0] = size.width / 16;
|
||||||
|
globalsize[1] = size.height / 2;
|
||||||
|
}
|
||||||
|
else if (ksize.width == 5)
|
||||||
|
{
|
||||||
|
globalsize[0] = size.width / 4;
|
||||||
|
globalsize[1] = size.height / 1;
|
||||||
|
}
|
||||||
|
|
||||||
const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
|
const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
|
||||||
char build_opts[1024];
|
char build_opts[1024];
|
||||||
@ -2169,7 +2178,13 @@ static bool ocl_GaussianBlur3x3_8UC1(InputArray _src, OutputArray _dst, int ddep
|
|||||||
ocl::kernelToStr(kernelX, CV_32F, "KERNEL_MATRIX_X").c_str(),
|
ocl::kernelToStr(kernelX, CV_32F, "KERNEL_MATRIX_X").c_str(),
|
||||||
ocl::kernelToStr(kernelY, CV_32F, "KERNEL_MATRIX_Y").c_str());
|
ocl::kernelToStr(kernelY, CV_32F, "KERNEL_MATRIX_Y").c_str());
|
||||||
|
|
||||||
ocl::Kernel kernel("gaussianBlur3x3_8UC1_cols16_rows2", cv::ocl::imgproc::gaussianBlur3x3_oclsrc, build_opts);
|
ocl::Kernel kernel;
|
||||||
|
|
||||||
|
if (ksize.width == 3)
|
||||||
|
kernel.create("gaussianBlur3x3_8UC1_cols16_rows2", cv::ocl::imgproc::gaussianBlur3x3_oclsrc, build_opts);
|
||||||
|
else if (ksize.width == 5)
|
||||||
|
kernel.create("gaussianBlur5x5_8UC1_cols4", cv::ocl::imgproc::gaussianBlur5x5_oclsrc, build_opts);
|
||||||
|
|
||||||
if (kernel.empty())
|
if (kernel.empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
@ -2436,9 +2451,10 @@ void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize,
|
|||||||
createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2);
|
createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2);
|
||||||
|
|
||||||
CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 &&
|
CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 &&
|
||||||
ksize.width == 3 && ksize.height == 3 &&
|
((ksize.width == 3 && ksize.height == 3) ||
|
||||||
|
(ksize.width == 5 && ksize.height == 5)) &&
|
||||||
(size_t)_src.rows() > ky.total() && (size_t)_src.cols() > kx.total(),
|
(size_t)_src.rows() > ky.total() && (size_t)_src.cols() > kx.total(),
|
||||||
ocl_GaussianBlur3x3_8UC1(_src, _dst, CV_MAT_DEPTH(type), kx, ky, borderType));
|
ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType));
|
||||||
|
|
||||||
sepFilter2D(_src, _dst, CV_MAT_DEPTH(type), kx, ky, Point(-1,-1), 0, borderType );
|
sepFilter2D(_src, _dst, CV_MAT_DEPTH(type), kx, ky, Point(-1,-1), 0, borderType );
|
||||||
}
|
}
|
||||||
|
@ -342,7 +342,7 @@ OCL_TEST_P(GaussianBlurTest, Mat)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
PARAM_TEST_CASE(GaussianBlur3x3_cols16_rows2_Base, MatType,
|
PARAM_TEST_CASE(GaussianBlur_multicols_Base, MatType,
|
||||||
int, // kernel size
|
int, // kernel size
|
||||||
Size, // dx, dy
|
Size, // dx, dy
|
||||||
BorderType, // border type
|
BorderType, // border type
|
||||||
@ -372,11 +372,18 @@ PARAM_TEST_CASE(GaussianBlur3x3_cols16_rows2_Base, MatType,
|
|||||||
|
|
||||||
void random_roi()
|
void random_roi()
|
||||||
{
|
{
|
||||||
size = Size(3, 3);
|
size = Size(ksize, ksize);
|
||||||
|
|
||||||
Size roiSize = randomSize(size.width, MAX_VALUE, size.height, MAX_VALUE);
|
Size roiSize = randomSize(size.width, MAX_VALUE, size.height, MAX_VALUE);
|
||||||
roiSize.width = std::max(size.width + 13, roiSize.width & (~0xf));
|
if (ksize == 3)
|
||||||
roiSize.height = std::max(size.height + 1, roiSize.height & (~0x1));
|
{
|
||||||
|
roiSize.width = std::max((size.width + 15) & 0x10, roiSize.width & (~0xf));
|
||||||
|
roiSize.height = std::max(size.height + 1, roiSize.height & (~0x1));
|
||||||
|
}
|
||||||
|
else if (ksize == 5)
|
||||||
|
{
|
||||||
|
roiSize.width = std::max((size.width + 3) & 0x4, roiSize.width & (~0x3));
|
||||||
|
}
|
||||||
|
|
||||||
Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
|
Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
|
||||||
randomSubMat(src, src_roi, roiSize, srcBorder, type, 5, 256);
|
randomSubMat(src, src_roi, roiSize, srcBorder, type, 5, 256);
|
||||||
@ -402,9 +409,9 @@ PARAM_TEST_CASE(GaussianBlur3x3_cols16_rows2_Base, MatType,
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef GaussianBlur3x3_cols16_rows2_Base GaussianBlur3x3_cols16_rows2;
|
typedef GaussianBlur_multicols_Base GaussianBlur_multicols;
|
||||||
|
|
||||||
OCL_TEST_P(GaussianBlur3x3_cols16_rows2, Mat)
|
OCL_TEST_P(GaussianBlur_multicols, Mat)
|
||||||
{
|
{
|
||||||
Size kernelSize(ksize, ksize);
|
Size kernelSize(ksize, ksize);
|
||||||
|
|
||||||
@ -710,9 +717,9 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, GaussianBlurTest, Combine(
|
|||||||
Bool(),
|
Bool(),
|
||||||
Values(1))); // not used
|
Values(1))); // not used
|
||||||
|
|
||||||
OCL_INSTANTIATE_TEST_CASE_P(Filter, GaussianBlur3x3_cols16_rows2, Combine(
|
OCL_INSTANTIATE_TEST_CASE_P(Filter, GaussianBlur_multicols, Combine(
|
||||||
Values((MatType)CV_8UC1),
|
Values((MatType)CV_8UC1),
|
||||||
Values(3), // kernel size
|
Values(3, 5), // kernel size
|
||||||
Values(Size(0, 0)), // not used
|
Values(Size(0, 0)), // not used
|
||||||
FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
|
FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
|
||||||
Values(0.0), // not used
|
Values(0.0), // not used
|
||||||
|
Loading…
Reference in New Issue
Block a user