From ede6d4482b96b7dfe16bd394bc1ccd4d53cf20b5 Mon Sep 17 00:00:00 2001 From: Aaron Kunze Date: Mon, 24 Mar 2014 13:35:56 -0700 Subject: [PATCH] Optimizations to OpenCL bilateral filter. --- modules/imgproc/src/opencl/bilateral.cl | 59 +++++++++++++++++++++-- modules/imgproc/src/smooth.cpp | 40 +++++++++------ modules/imgproc/test/ocl/test_filters.cpp | 32 ++++++++---- 3 files changed, 104 insertions(+), 27 deletions(-) diff --git a/modules/imgproc/src/opencl/bilateral.cl b/modules/imgproc/src/opencl/bilateral.cl index f459cfc850..963d23ec40 100644 --- a/modules/imgproc/src/opencl/bilateral.cl +++ b/modules/imgproc/src/opencl/bilateral.cl @@ -32,28 +32,79 @@ // or tort (including negligence or otherwise) arising in any way out of // the use of this software, even if advised of the possibility of such damage. + + __kernel void bilateral(__global const uchar * src, int src_step, int src_offset, __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, - __constant float * color_weight, __constant float * space_weight, __constant int * space_ofs) + __constant float * space_weight, __constant int * space_ofs) { int x = get_global_id(0); int y = get_global_id(1); - if (y < dst_rows && x < dst_cols) { int src_index = mad24(y + radius, src_step, x + radius + src_offset); int dst_index = mad24(y, dst_step, x + dst_offset); float sum = 0.f, wsum = 0.f; int val0 = convert_int(src[src_index]); - #pragma unroll for (int k = 0; k < maxk; k++ ) { int val = convert_int(src[src_index + space_ofs[k]]); - float w = space_weight[k] * color_weight[abs(val - val0)]; + float w = space_weight[k] * native_exp((float)((val - val0) * (val - val0) * gauss_color_coeff)); sum += (float)(val) * w; wsum += w; } dst[dst_index] = convert_uchar_rtz(sum / wsum + 0.5f); } } + +__kernel void bilateral_float(__global const uchar * src, int src_step, int src_offset, + __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, + __constant float * space_weight, __constant int * space_ofs) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if (y < dst_rows && x < dst_cols) + { + int src_index = mad24(y + radius, src_step, x + radius + src_offset); + int dst_index = mad24(y, dst_step, x + dst_offset); + float sum = 0.f, wsum = 0.f; + float val0 = convert_float(src[src_index]); + #pragma unroll + for (int k = 0; k < maxk; k++ ) + { + float val = convert_float(src[src_index + space_ofs[k]]); + float w = space_weight[k] * native_exp((val - val0) * (val - val0) * gauss_color_coeff); + sum += (float)(val) * w; + wsum += w; + } + dst[dst_index] = convert_uchar_rtz(sum / wsum + 0.5f); + } +} + + +__kernel void bilateral_float4(__global const uchar * src, int src_step, int src_offset, + __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, + __constant float * space_weight, __constant int * space_ofs) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if (y < dst_rows && x < dst_cols / 4 ) + { + int src_index = ((y + radius) * src_step) + x * 4 + (radius + src_offset); + int dst_index = (y * dst_step) + x * 4 + dst_offset ; + float4 sum = 0.f, wsum = 0.f; + float4 val0 = convert_float4(vload4(0, src + src_index)); + #pragma unroll + for (int k = 0; k < maxk; k++ ) + { + float4 val = convert_float4(vload4(0, src + src_index + space_ofs[k])); + float spacew = space_weight[k]; + float4 w = spacew * native_exp((val - val0) * (val - val0) * gauss_color_coeff); + sum += val * w; + wsum += w; + } + sum = sum / wsum + .5f; + vstore4(convert_uchar4_rtz(sum), 0, dst + dst_index); + } +} diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 40687a226c..ae6a708642 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -2210,7 +2210,7 @@ static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d, double sigma_color, double sigma_space, int borderType) { - int type = _src.type(), cn = CV_MAT_CN(type); + int type = _src.type(); int i, j, maxk, radius; if ( type != CV_8UC1 ) @@ -2237,19 +2237,14 @@ static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d, copyMakeBorder(src, temp, radius, radius, radius, radius, borderType); - std::vector _color_weight(cn * 256); std::vector _space_weight(d * d); std::vector _space_ofs(d * d); - float *color_weight = &_color_weight[0]; float *space_weight = &_space_weight[0]; int *space_ofs = &_space_ofs[0]; - // initialize color-related bilateral filter coefficients - for( i = 0; i < 256 * cn; i++ ) - color_weight[i] = (float)std::exp(i * i * gauss_color_coeff); - // initialize space-related bilateral filter coefficients for( i = -radius, maxk = 0; i <= radius; i++ ) + { for( j = -radius; j <= radius; j++ ) { double r = std::sqrt((double)i * i + (double)j * j); @@ -2258,26 +2253,43 @@ static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d, space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff); space_ofs[maxk++] = (int)(i * temp.step + j); } + } - ocl::Kernel k("bilateral", ocl::imgproc::bilateral_oclsrc, - format("-D radius=%d -D maxk=%d", radius, maxk)); + String kernelName("bilateral"); + size_t sizeDiv = 1; + + if ((ocl::Device::getDefault().isIntel()) && + (ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU)) + { + //Intel GPU + if (dst.cols % 4 == 0) + { + kernelName = "bilateral_float4"; + sizeDiv = 4; + } + else + { + kernelName = "bilateral_float"; + } + } + ocl::Kernel k(kernelName.c_str(), ocl::imgproc::bilateral_oclsrc, + format("-D radius=%d -D maxk=%d " + "-D gauss_color_coeff=%f", radius, maxk, + (float)gauss_color_coeff)); if (k.empty()) return false; - Mat mcolor_weight(1, cn * 256, CV_32FC1, color_weight); Mat mspace_weight(1, d * d, CV_32FC1, space_weight); Mat mspace_ofs(1, d * d, CV_32SC1, space_ofs); - UMat ucolor_weight, uspace_weight, uspace_ofs; - mcolor_weight.copyTo(ucolor_weight); + UMat uspace_weight, uspace_ofs; mspace_weight.copyTo(uspace_weight); mspace_ofs.copyTo(uspace_ofs); k.args(ocl::KernelArg::ReadOnlyNoSize(temp), ocl::KernelArg::WriteOnly(dst), - ocl::KernelArg::PtrReadOnly(ucolor_weight), ocl::KernelArg::PtrReadOnly(uspace_weight), ocl::KernelArg::PtrReadOnly(uspace_ofs)); - size_t globalsize[2] = { dst.cols, dst.rows }; + size_t globalsize[2] = { dst.cols / sizeDiv, dst.rows }; return k.run(2, globalsize, NULL, false); } diff --git a/modules/imgproc/test/ocl/test_filters.cpp b/modules/imgproc/test/ocl/test_filters.cpp index fe16fe81d5..37e8961577 100644 --- a/modules/imgproc/test/ocl/test_filters.cpp +++ b/modules/imgproc/test/ocl/test_filters.cpp @@ -62,12 +62,14 @@ PARAM_TEST_CASE(FilterTestBase, MatType, Size, // dx, dy BorderType, // border type double, // optional parameter - bool) // roi or not + bool, // roi or not + int) //width multiplier { int type, borderType, ksize; Size size; double param; bool useRoi; + int widthMultiple; TEST_DECLARE_INPUT_PARAMETER(src) TEST_DECLARE_OUTPUT_PARAMETER(dst) @@ -80,6 +82,7 @@ PARAM_TEST_CASE(FilterTestBase, MatType, borderType = GET_PARAM(3); param = GET_PARAM(4); useRoi = GET_PARAM(5); + widthMultiple = GET_PARAM(6); } void random_roi(int minSize = 1) @@ -88,6 +91,9 @@ PARAM_TEST_CASE(FilterTestBase, MatType, minSize = ksize; Size roiSize = randomSize(minSize, MAX_VALUE); + roiSize.width &= ~((widthMultiple * 2) - 1); + roiSize.width += widthMultiple; + Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0); randomSubMat(src, src_roi, roiSize, srcBorder, type, 5, 256); @@ -320,7 +326,8 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, Bilateral, Combine( Values(Size(0, 0)), // not used FILTER_BORDER_SET_NO_ISOLATED, Values(0.0), // not used - Bool())); + Bool(), + Values(1, 4))); OCL_INSTANTIATE_TEST_CASE_P(Filter, LaplacianTest, Combine( FILTER_TYPES, @@ -328,7 +335,8 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, LaplacianTest, Combine( Values(Size(0, 0)), // not used FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED, Values(1.0, 0.2, 3.0), // kernel scale - Bool())); + Bool(), + Values(1))); // not used OCL_INSTANTIATE_TEST_CASE_P(Filter, SobelTest, Combine( FILTER_TYPES, @@ -336,7 +344,8 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, SobelTest, Combine( Values(Size(1, 0), Size(1, 1), Size(2, 0), Size(2, 1)), // dx, dy FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED, Values(0.0), // not used - Bool())); + Bool(), + Values(1))); // not used OCL_INSTANTIATE_TEST_CASE_P(Filter, ScharrTest, Combine( FILTER_TYPES, @@ -344,7 +353,8 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, ScharrTest, Combine( Values(Size(0, 1), Size(1, 0)), // dx, dy FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED, Values(1.0, 0.2), // kernel scale - Bool())); + Bool(), + Values(1))); // not used OCL_INSTANTIATE_TEST_CASE_P(Filter, GaussianBlurTest, Combine( FILTER_TYPES, @@ -352,7 +362,8 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, GaussianBlurTest, Combine( Values(Size(0, 0)), // not used FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED, Values(0.0), // not used - Bool())); + Bool(), + Values(1))); // not used OCL_INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine( Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, CV_64FC1, CV_64FC4), @@ -360,7 +371,8 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine( Values(Size(0,0)),//not used Values((BorderType)BORDER_CONSTANT),//not used Values(1.0, 2.0, 3.0), - Bool() ) ); + Bool(), + Values(1))); // not used OCL_INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine( Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, CV_64FC1, CV_64FC4), @@ -368,7 +380,8 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine( Values(Size(0,0)),//not used Values((BorderType)BORDER_CONSTANT),//not used Values(1.0, 2.0, 3.0), - Bool() ) ); + Bool(), + Values(1))); // not used OCL_INSTANTIATE_TEST_CASE_P(Filter, MorphologyEx, Combine( Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, CV_64FC1, CV_64FC4), @@ -376,7 +389,8 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, MorphologyEx, Combine( Values(Size(0,0), Size(0,1), Size(0,2), Size(0,3), Size(0,4), Size(0,5),Size(0,6)),//uses as generator of operations Values((BorderType)BORDER_CONSTANT),//not used Values(1.0, 2.0, 3.0), - Bool() ) ); + Bool(), + Values(1))); // not used } } // namespace cvtest::ocl