diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt
index d28d6b9046..9731694e59 100644
--- a/modules/imgproc/CMakeLists.txt
+++ b/modules/imgproc/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(the_description "Image Processing")
 ocv_add_dispatched_file(accum SSE4_1 AVX AVX2)
+ocv_add_dispatched_file(bilateral_filter SSE2 AVX2)
 ocv_add_dispatched_file(filter SSE2 SSE4_1 AVX2)
 ocv_add_dispatched_file(color_hsv SSE2 SSE4_1 AVX2)
 ocv_add_dispatched_file(color_rgb SSE2 SSE4_1 AVX2)
diff --git a/modules/imgproc/src/bilateral_filter.dispatch.cpp b/modules/imgproc/src/bilateral_filter.dispatch.cpp
index e9181f2182..a27ebb18f5 100644
--- a/modules/imgproc/src/bilateral_filter.dispatch.cpp
+++ b/modules/imgproc/src/bilateral_filter.dispatch.cpp
@@ -48,493 +48,14 @@
 #include "opencv2/core/hal/intrin.hpp"
 #include "opencl_kernels_imgproc.hpp"
 
+#include "bilateral_filter.simd.hpp"
+#include "bilateral_filter.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
+
 /****************************************************************************************\
                                    Bilateral Filtering
 \****************************************************************************************/
 
-namespace cv
-{
-
-class BilateralFilter_8u_Invoker :
-    public ParallelLoopBody
-{
-public:
-    BilateralFilter_8u_Invoker(Mat& _dest, const Mat& _temp, int _radius, int _maxk,
-        int* _space_ofs, float *_space_weight, float *_color_weight) :
-        temp(&_temp), dest(&_dest), radius(_radius),
-        maxk(_maxk), space_ofs(_space_ofs), space_weight(_space_weight), color_weight(_color_weight)
-    {
-    }
-
-    virtual void operator() (const Range& range) const CV_OVERRIDE
-    {
-        int i, j, cn = dest->channels(), k;
-        Size size = dest->size();
-
-        for( i = range.start; i < range.end; i++ )
-        {
-            const uchar* sptr = temp->ptr(i+radius) + radius*cn;
-            uchar* dptr = dest->ptr(i);
-
-            if( cn == 1 )
-            {
-                AutoBuffer<float> buf(alignSize(size.width, CV_SIMD_WIDTH) + size.width + CV_SIMD_WIDTH - 1);
-                memset(buf.data(), 0, buf.size() * sizeof(float));
-                float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH);
-                float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH);
-                k = 0;
-                for(; k <= maxk-4; k+=4)
-                {
-                    const uchar* ksptr0 = sptr + space_ofs[k];
-                    const uchar* ksptr1 = sptr + space_ofs[k+1];
-                    const uchar* ksptr2 = sptr + space_ofs[k+2];
-                    const uchar* ksptr3 = sptr + space_ofs[k+3];
-                    j = 0;
-#if CV_SIMD
-                    v_float32 kweight0 = vx_setall_f32(space_weight[k]);
-                    v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
-                    v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
-                    v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
-                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
-                    {
-                        v_uint32 rval = vx_load_expand_q(sptr + j);
-
-                        v_uint32 val = vx_load_expand_q(ksptr0 + j);
-                        v_float32 w = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-                        v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
-                        v_float32 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j));
-
-                        val = vx_load_expand_q(ksptr1 + j);
-                        w = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-                        v_wsum += w;
-                        v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
-
-                        val = vx_load_expand_q(ksptr2 + j);
-                        w = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-                        v_wsum += w;
-                        v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
-
-                        val = vx_load_expand_q(ksptr3 + j);
-                        w = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-                        v_wsum += w;
-                        v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
-
-                        v_store_aligned(wsum + j, v_wsum);
-                        v_store_aligned(sum + j, v_sum);
-                    }
-#endif
-#if CV_SIMD128
-                    v_float32x4 kweight4 = v_load(space_weight + k);
-#endif
-                    for (; j < size.width; j++)
-                    {
-#if CV_SIMD128
-                        v_uint32x4 rval = v_setall_u32(sptr[j]);
-                        v_uint32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]);
-                        v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-                        wsum[j] += v_reduce_sum(w);
-                        sum[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(val)) * w);
-#else
-                        int rval = sptr[j];
-
-                        int val = ksptr0[j];
-                        float w = space_weight[k] * color_weight[std::abs(val - rval)];
-                        wsum[j] += w;
-                        sum[j] += val * w;
-
-                        val = ksptr1[j];
-                        w = space_weight[k+1] * color_weight[std::abs(val - rval)];
-                        wsum[j] += w;
-                        sum[j] += val * w;
-
-                        val = ksptr2[j];
-                        w = space_weight[k+2] * color_weight[std::abs(val - rval)];
-                        wsum[j] += w;
-                        sum[j] += val * w;
-
-                        val = ksptr3[j];
-                        w = space_weight[k+3] * color_weight[std::abs(val - rval)];
-                        wsum[j] += w;
-                        sum[j] += val * w;
-#endif
-                    }
-                }
-                for(; k < maxk; k++)
-                {
-                    const uchar* ksptr = sptr + space_ofs[k];
-                    j = 0;
-#if CV_SIMD
-                    v_float32 kweight = vx_setall_f32(space_weight[k]);
-                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
-                    {
-                        v_uint32 val = vx_load_expand_q(ksptr + j);
-                        v_float32 w = kweight * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, vx_load_expand_q(sptr + j))));
-                        v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
-                        v_store_aligned(sum + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j)));
-                    }
-#endif
-                    for (; j < size.width; j++)
-                    {
-                        int val = ksptr[j];
-                        float w = space_weight[k] * color_weight[std::abs(val - sptr[j])];
-                        wsum[j] += w;
-                        sum[j] += val * w;
-                    }
-                }
-                j = 0;
-#if CV_SIMD
-                for (; j <= size.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes)
-                    v_pack_u_store(dptr + j, v_pack(v_round(vx_load_aligned(sum + j) / vx_load_aligned(wsum + j)),
-                                                    v_round(vx_load_aligned(sum + j + v_float32::nlanes) / vx_load_aligned(wsum + j + v_float32::nlanes))));
-#endif
-                for (; j < size.width; j++)
-                {
-                    // overflow is not possible here => there is no need to use cv::saturate_cast
-                    CV_DbgAssert(fabs(wsum[j]) > 0);
-                    dptr[j] = (uchar)cvRound(sum[j]/wsum[j]);
-                }
-            }
-            else
-            {
-                assert( cn == 3 );
-                AutoBuffer<float> buf(alignSize(size.width, CV_SIMD_WIDTH)*3 + size.width + CV_SIMD_WIDTH - 1);
-                memset(buf.data(), 0, buf.size() * sizeof(float));
-                float *sum_b = alignPtr(buf.data(), CV_SIMD_WIDTH);
-                float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH);
-                float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH);
-                float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH);
-                k = 0;
-                for(; k <= maxk-4; k+=4)
-                {
-                    const uchar* ksptr0 = sptr + space_ofs[k];
-                    const uchar* ksptr1 = sptr + space_ofs[k+1];
-                    const uchar* ksptr2 = sptr + space_ofs[k+2];
-                    const uchar* ksptr3 = sptr + space_ofs[k+3];
-                    const uchar* rsptr = sptr;
-                    j = 0;
-#if CV_SIMD
-                    v_float32 kweight0 = vx_setall_f32(space_weight[k]);
-                    v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
-                    v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
-                    v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
-                    for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, rsptr += 3*v_uint8::nlanes,
-                            ksptr0 += 3*v_uint8::nlanes, ksptr1 += 3*v_uint8::nlanes, ksptr2 += 3*v_uint8::nlanes, ksptr3 += 3*v_uint8::nlanes)
-                    {
-                        v_uint8 kb, kg, kr, rb, rg, rr;
-                        v_load_deinterleave(rsptr, rb, rg, rr);
-
-                        v_load_deinterleave(ksptr0, kb, kg, kr);
-                        v_uint16 val0, val1, val2, val3, val4;
-                        v_expand(v_absdiff(kb, rb), val0, val1);
-                        v_expand(v_absdiff(kg, rg), val2, val3);
-                        val0 += val2; val1 += val3;
-                        v_expand(v_absdiff(kr, rr), val2, val3);
-                        val0 += val2; val1 += val3;
-
-                        v_uint32 vall, valh;
-                        v_expand(val0, vall, valh);
-                        v_float32 w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-                        v_float32 w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-                        v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
-                        v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
-                        v_expand(kb, val0, val2);
-                        v_expand(val0, vall, valh);
-                        v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
-                        v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
-                        v_expand(kg, val0, val3);
-                        v_expand(val0, vall, valh);
-                        v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
-                        v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
-                        v_expand(kr, val0, val4);
-                        v_expand(val0, vall, valh);
-                        v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
-                        v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
-
-                        v_expand(val1, vall, valh);
-                        w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-                        w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-                        v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
-                        v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
-                        v_expand(val2, vall, valh);
-                        v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
-                        v_expand(val3, vall, valh);
-                        v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
-                        v_expand(val4, vall, valh);
-                        v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2*v_float32::nlanes)));
-                        v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3*v_float32::nlanes)));
-
-                        v_load_deinterleave(ksptr1, kb, kg, kr);
-                        v_expand(v_absdiff(kb, rb), val0, val1);
-                        v_expand(v_absdiff(kg, rg), val2, val3);
-                        val0 += val2; val1 += val3;
-                        v_expand(v_absdiff(kr, rr), val2, val3);
-                        val0 += val2; val1 += val3;
-
-                        v_expand(val0, vall, valh);
-                        w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-                        w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-                        v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
-                        v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
-                        v_expand(kb, val0, val2);
-                        v_expand(val0, vall, valh);
-                        v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
-                        v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
-                        v_expand(kg, val0, val3);
-                        v_expand(val0, vall, valh);
-                        v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
-                        v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
-                        v_expand(kr, val0, val4);
-                        v_expand(val0, vall, valh);
-                        v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
-                        v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
-
-                        v_expand(val1, vall, valh);
-                        w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-                        w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-                        v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
-                        v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
-                        v_expand(val2, vall, valh);
-                        v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
-                        v_expand(val3, vall, valh);
-                        v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
-                        v_expand(val4, vall, valh);
-                        v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
-
-                        v_load_deinterleave(ksptr2, kb, kg, kr);
-                        v_expand(v_absdiff(kb, rb), val0, val1);
-                        v_expand(v_absdiff(kg, rg), val2, val3);
-                        val0 += val2; val1 += val3;
-                        v_expand(v_absdiff(kr, rr), val2, val3);
-                        val0 += val2; val1 += val3;
-
-                        v_expand(val0, vall, valh);
-                        w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-                        w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-                        v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
-                        v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
-                        v_expand(kb, val0, val2);
-                        v_expand(val0, vall, valh);
-                        v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
-                        v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
-                        v_expand(kg, val0, val3);
-                        v_expand(val0, vall, valh);
-                        v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
-                        v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
-                        v_expand(kr, val0, val4);
-                        v_expand(val0, vall, valh);
-                        v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
-                        v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
-
-                        v_expand(val1, vall, valh);
-                        w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-                        w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-                        v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
-                        v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
-                        v_expand(val2, vall, valh);
-                        v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
-                        v_expand(val3, vall, valh);
-                        v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
-                        v_expand(val4, vall, valh);
-                        v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
-
-                        v_load_deinterleave(ksptr3, kb, kg, kr);
-                        v_expand(v_absdiff(kb, rb), val0, val1);
-                        v_expand(v_absdiff(kg, rg), val2, val3);
-                        val0 += val2; val1 += val3;
-                        v_expand(v_absdiff(kr, rr), val2, val3);
-                        val0 += val2; val1 += val3;
-
-                        v_expand(val0, vall, valh);
-                        w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-                        w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-                        v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
-                        v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
-                        v_expand(kb, val0, val2);
-                        v_expand(val0, vall, valh);
-                        v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
-                        v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
-                        v_expand(kg, val0, val3);
-                        v_expand(val0, vall, valh);
-                        v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
-                        v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
-                        v_expand(kr, val0, val4);
-                        v_expand(val0, vall, valh);
-                        v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
-                        v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
-
-                        v_expand(val1, vall, valh);
-                        w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-                        w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-                        v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
-                        v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
-                        v_expand(val2, vall, valh);
-                        v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
-                        v_expand(val3, vall, valh);
-                        v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
-                        v_expand(val4, vall, valh);
-                        v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
-                    }
-#endif
-#if CV_SIMD128
-                    v_float32x4 kweight4 = v_load(space_weight + k);
-#endif
-                    for(; j < size.width; j++, rsptr += 3, ksptr0 += 3, ksptr1 += 3, ksptr2 += 3, ksptr3 += 3)
-                    {
-#if CV_SIMD128
-                        v_uint32x4 rb = v_setall_u32(rsptr[0]);
-                        v_uint32x4 rg = v_setall_u32(rsptr[1]);
-                        v_uint32x4 rr = v_setall_u32(rsptr[2]);
-                        v_uint32x4 b(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]);
-                        v_uint32x4 g(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]);
-                        v_uint32x4 r(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]);
-                        v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(b, rb) + v_absdiff(g, rg) + v_absdiff(r, rr)));
-                        wsum[j] += v_reduce_sum(w);
-                        sum_b[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(b)) * w);
-                        sum_g[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(g)) * w);
-                        sum_r[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(r)) * w);
-#else
-                        int rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
-
-                        int b = ksptr0[0], g = ksptr0[1], r = ksptr0[2];
-                        float w = space_weight[k]*color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
-                        wsum[j] += w;
-                        sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w;
-
-                        b = ksptr1[0]; g = ksptr1[1]; r = ksptr1[2];
-                        w = space_weight[k+1] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
-                        wsum[j] += w;
-                        sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w;
-
-                        b = ksptr2[0]; g = ksptr2[1]; r = ksptr2[2];
-                        w = space_weight[k+2] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
-                        wsum[j] += w;
-                        sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w;
-
-                        b = ksptr3[0]; g = ksptr3[1]; r = ksptr3[2];
-                        w = space_weight[k+3] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
-                        wsum[j] += w;
-                        sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w;
-#endif
-                    }
-                }
-                for(; k < maxk; k++)
-                {
-                    const uchar* ksptr = sptr + space_ofs[k];
-                    const uchar* rsptr = sptr;
-                    j = 0;
-#if CV_SIMD
-                    v_float32 kweight = vx_setall_f32(space_weight[k]);
-                    for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, ksptr += 3*v_uint8::nlanes, rsptr += 3*v_uint8::nlanes)
-                    {
-                        v_uint8 kb, kg, kr, rb, rg, rr;
-                        v_load_deinterleave(ksptr, kb, kg, kr);
-                        v_load_deinterleave(rsptr, rb, rg, rr);
-
-                        v_uint16 b_l, b_h, g_l, g_h, r_l, r_h;
-                        v_expand(v_absdiff(kb, rb), b_l, b_h);
-                        v_expand(v_absdiff(kg, rg), g_l, g_h);
-                        v_expand(v_absdiff(kr, rr), r_l, r_h);
-
-                        v_uint32 val0, val1, val2, val3;
-                        v_expand(b_l + g_l + r_l, val0, val1);
-                        v_expand(b_h + g_h + r_h, val2, val3);
-
-                        v_expand(kb, b_l, b_h);
-                        v_expand(kg, g_l, g_h);
-                        v_expand(kr, r_l, r_h);
-
-                        v_float32 w0 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val0));
-                        v_float32 w1 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val1));
-                        v_float32 w2 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val2));
-                        v_float32 w3 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val3));
-                        v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
-                        v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
-                        v_store_aligned(wsum + j + 2*v_float32::nlanes, w2 + vx_load_aligned(wsum + j + 2*v_float32::nlanes));
-                        v_store_aligned(wsum + j + 3*v_float32::nlanes, w3 + vx_load_aligned(wsum + j + 3*v_float32::nlanes));
-                        v_expand(b_l, val0, val1);
-                        v_expand(b_h, val2, val3);
-                        v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_b + j)));
-                        v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
-                        v_store_aligned(sum_b + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_b + j + 2*v_float32::nlanes)));
-                        v_store_aligned(sum_b + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_b + j + 3*v_float32::nlanes)));
-                        v_expand(g_l, val0, val1);
-                        v_expand(g_h, val2, val3);
-                        v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_g + j)));
-                        v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
-                        v_store_aligned(sum_g + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_g + j + 2*v_float32::nlanes)));
-                        v_store_aligned(sum_g + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_g + j + 3*v_float32::nlanes)));
-                        v_expand(r_l, val0, val1);
-                        v_expand(r_h, val2, val3);
-                        v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_r + j)));
-                        v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
-                        v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_r + j + 2*v_float32::nlanes)));
-                        v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_r + j + 3*v_float32::nlanes)));
-                    }
-#endif
-                    for(; j < size.width; j++, ksptr += 3, rsptr += 3)
-                    {
-                        int b = ksptr[0], g = ksptr[1], r = ksptr[2];
-                        float w = space_weight[k]*color_weight[std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])];
-                        wsum[j] += w;
-                        sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w;
-                    }
-                }
-                j = 0;
-#if CV_SIMD
-                v_float32 v_one = vx_setall_f32(1.f);
-                for(; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, dptr += 3*v_uint8::nlanes)
-                {
-                    v_float32 w0 = v_one / vx_load_aligned(wsum + j);
-                    v_float32 w1 = v_one / vx_load_aligned(wsum + j + v_float32::nlanes);
-                    v_float32 w2 = v_one / vx_load_aligned(wsum + j + 2*v_float32::nlanes);
-                    v_float32 w3 = v_one / vx_load_aligned(wsum + j + 3*v_float32::nlanes);
-
-                    v_store_interleave(dptr, v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_b + j)),
-                                                             v_round(w1 * vx_load_aligned(sum_b + j + v_float32::nlanes))),
-                                                      v_pack(v_round(w2 * vx_load_aligned(sum_b + j + 2*v_float32::nlanes)),
-                                                             v_round(w3 * vx_load_aligned(sum_b + j + 3*v_float32::nlanes)))),
-                                             v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_g + j)),
-                                                             v_round(w1 * vx_load_aligned(sum_g + j + v_float32::nlanes))),
-                                                      v_pack(v_round(w2 * vx_load_aligned(sum_g + j + 2*v_float32::nlanes)),
-                                                             v_round(w3 * vx_load_aligned(sum_g + j + 3*v_float32::nlanes)))),
-                                             v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_r + j)),
-                                                             v_round(w1 * vx_load_aligned(sum_r + j + v_float32::nlanes))),
-                                                      v_pack(v_round(w2 * vx_load_aligned(sum_r + j + 2*v_float32::nlanes)),
-                                                             v_round(w3 * vx_load_aligned(sum_r + j + 3*v_float32::nlanes)))));
-                }
-#endif
-                for(; j < size.width; j++)
-                {
-                    CV_DbgAssert(fabs(wsum[j]) > 0);
-                    wsum[j] = 1.f/wsum[j];
-                    *(dptr++) = (uchar)cvRound(sum_b[j]*wsum[j]);
-                    *(dptr++) = (uchar)cvRound(sum_g[j]*wsum[j]);
-                    *(dptr++) = (uchar)cvRound(sum_r[j]*wsum[j]);
-                }
-            }
-        }
-#if CV_SIMD
-        vx_cleanup();
-#endif
-    }
-
-private:
-    const Mat *temp;
-    Mat *dest;
-    int radius, maxk, *space_ofs;
-    float *space_weight, *color_weight;
-};
+namespace cv {
 
 #ifdef HAVE_OPENCL
@@ -542,6 +63,7 @@ static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,
                                    double sigma_color, double sigma_space, int borderType)
 {
+    CV_INSTRUMENT_REGION();
 #ifdef __ANDROID__
     if (ocl::Device::getDefault().isNVidia())
         return false;
 #endif
@@ -628,16 +150,18 @@ static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,
     size_t globalsize[2] = { (size_t)dst.cols / sizeDiv, (size_t)dst.rows };
     return k.run(2, globalsize, NULL, false);
 }
-
 #endif
+
+
 static void
 bilateralFilter_8u( const Mat& src, Mat& dst, int d,
                     double sigma_color, double sigma_space,
                     int borderType )
 {
+    CV_INSTRUMENT_REGION();
+
     int cn = src.channels();
     int i, j, maxk, radius;
-    Size size = src.size();
 
     CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.data != dst.data );
 
@@ -686,479 +210,18 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d,
         }
     }
 
-    BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight);
-    parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
+    CV_CPU_DISPATCH(bilateralFilterInvoker_8u, (dst, temp, radius, maxk, space_ofs, space_weight, color_weight),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
-class BilateralFilter_32f_Invoker :
-    public ParallelLoopBody
-{
-public:
-
-    BilateralFilter_32f_Invoker(int _cn, int _radius, int _maxk, int *_space_ofs,
-        const Mat& _temp, Mat& _dest, float _scale_index, float *_space_weight, float *_expLUT) :
-        cn(_cn), radius(_radius), maxk(_maxk), space_ofs(_space_ofs),
-        temp(&_temp), dest(&_dest), scale_index(_scale_index), space_weight(_space_weight), expLUT(_expLUT)
-    {
-    }
-
-    virtual void operator() (const Range& range) const CV_OVERRIDE
-    {
-        int i, j, k;
-        Size size = dest->size();
-
-        for( i = range.start; i < range.end; i++ )
-        {
-            const float* sptr = temp->ptr<float>(i+radius) + radius*cn;
-            float* dptr = dest->ptr<float>(i);
-
-            if( cn == 1 )
-            {
-                AutoBuffer<float> buf(alignSize(size.width, CV_SIMD_WIDTH) + size.width + CV_SIMD_WIDTH - 1);
-                memset(buf.data(), 0, buf.size() * sizeof(float));
-                float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH);
-                float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH);
-#if CV_SIMD
-                v_float32 v_one = vx_setall_f32(1.f);
-                v_float32 sindex = vx_setall_f32(scale_index);
-#endif
-                k = 0;
-                for(; k <= maxk - 4; k+=4)
-                {
-                    const float* ksptr0 = sptr + space_ofs[k];
-                    const float* ksptr1 = sptr + space_ofs[k + 1];
-                    const float* ksptr2 = sptr + space_ofs[k + 2];
-                    const float* ksptr3 = sptr + space_ofs[k + 3];
-                    j = 0;
-#if CV_SIMD
-                    v_float32 kweight0 = vx_setall_f32(space_weight[k]);
-                    v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
-                    v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
-                    v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
-                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
-                    {
-                        v_float32 rval = vx_load(sptr + j);
-
-                        v_float32 val = vx_load(ksptr0 + j);
-                        v_float32 knan = v_not_nan(val);
-                        v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
-                        v_int32 idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-                        v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha))) & knan;
-                        v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
-                        v_float32 v_sum = v_muladd(val & knan, w, vx_load_aligned(sum + j));
-
-                        val = vx_load(ksptr1 + j);
-                        knan = v_not_nan(val);
-                        alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
-                        idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-                        w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-                        v_wsum += w;
-                        v_sum = v_muladd(val & knan, w, v_sum);
-
-                        val = vx_load(ksptr2 + j);
-                        knan = v_not_nan(val);
-                        alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
-                        idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-                        w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-                        v_wsum += w;
-                        v_sum = v_muladd(val & knan, w, v_sum);
-
-                        val = vx_load(ksptr3 + j);
-                        knan = v_not_nan(val);
-                        alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
-                        idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-                        w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-                        v_wsum += w;
-                        v_sum = v_muladd(val & knan, w, v_sum);
-
-                        v_store_aligned(wsum + j, v_wsum);
-                        v_store_aligned(sum + j, v_sum);
-                    }
-#endif
-#if CV_SIMD128
-                    v_float32x4 v_one4 = v_setall_f32(1.f);
-                    v_float32x4 sindex4 = v_setall_f32(scale_index);
-                    v_float32x4 kweight4 = v_load(space_weight + k);
-#endif
-                    for (; j < size.width; j++)
-                    {
-#if CV_SIMD128
-                        v_float32x4 rval = v_setall_f32(sptr[j]);
-                        v_float32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]);
-                        v_float32x4 knan = v_not_nan(val);
-                        v_float32x4 alpha = (v_absdiff(val, rval) * sindex4) & v_not_nan(rval) & knan;
-                        v_int32x4 idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-                        v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan;
-                        wsum[j] += v_reduce_sum(w);
-                        sum[j] += v_reduce_sum((val & knan) * w);
-#else
-                        float rval = sptr[j];
-
-                        float val = ksptr0[j];
-                        float alpha = std::abs(val - rval) * scale_index;
-                        int idx = cvFloor(alpha);
-                        alpha -= idx;
-                        if (!cvIsNaN(val))
-                        {
-                            float w = space_weight[k] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
-                            wsum[j] += w;
-                            sum[j] += val * w;
-                        }
-
-                        val = ksptr1[j];
-                        alpha = std::abs(val - rval) * scale_index;
-                        idx = cvFloor(alpha);
-                        alpha -= idx;
-                        if (!cvIsNaN(val))
-                        {
-                            float w = space_weight[k+1] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
-                            wsum[j] += w;
-                            sum[j] += val * w;
-                        }
-
-                        val = ksptr2[j];
-                        alpha = std::abs(val - rval) * scale_index;
-                        idx = cvFloor(alpha);
-                        alpha -= idx;
-                        if (!cvIsNaN(val))
-                        {
-                            float w = space_weight[k+2] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
-                            wsum[j] += w;
-                            sum[j] += val * w;
-                        }
-
-                        val = ksptr3[j];
-                        alpha = std::abs(val - rval) * scale_index;
-                        idx = cvFloor(alpha);
-                        alpha -= idx;
-                        if (!cvIsNaN(val))
-                        {
-                            float w = space_weight[k+3] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
-                            wsum[j] += w;
-                            sum[j] += val * w;
-                        }
-#endif
-                    }
-                }
-                for(; k < maxk; k++)
-                {
-                    const float* ksptr = sptr + space_ofs[k];
-                    j = 0;
-#if CV_SIMD
-                    v_float32 kweight = vx_setall_f32(space_weight[k]);
-                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
-                    {
-                        v_float32 val = vx_load(ksptr + j);
-                        v_float32 rval = vx_load(sptr + j);
-                        v_float32 knan = v_not_nan(val);
-                        v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
-                        v_int32 idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-
-                        v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha))) & knan;
-                        v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
-                        v_store_aligned(sum + j, v_muladd(val & knan, w, vx_load_aligned(sum + j)));
-                    }
-#endif
-                    for (; j < size.width; j++)
-                    {
-                        float val = ksptr[j];
-                        float rval = sptr[j];
-                        float alpha = std::abs(val - rval) * scale_index;
-                        int idx = cvFloor(alpha);
-                        alpha -= idx;
-                        if (!cvIsNaN(val))
-                        {
-                            float w = space_weight[k] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
-                            wsum[j] += w;
-                            sum[j] += val * w;
-                        }
-                    }
-                }
-                j = 0;
-#if CV_SIMD
-                for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
-                {
-                    v_float32 v_val = vx_load(sptr + j);
-                    v_store(dptr + j, (vx_load_aligned(sum + j) + (v_val & v_not_nan(v_val))) / (vx_load_aligned(wsum + j) + (v_one & v_not_nan(v_val))));
-                }
-#endif
-                for (; j < size.width; j++)
-                {
-                    CV_DbgAssert(fabs(wsum[j]) >= 0);
-                    dptr[j] = cvIsNaN(sptr[j]) ? sum[j] / wsum[j] : (sum[j] + sptr[j]) / (wsum[j] + 1.f);
-                }
-            }
-            else
-            {
-                CV_Assert( cn == 3 );
-                AutoBuffer<float> buf(alignSize(size.width, CV_SIMD_WIDTH)*3 + size.width + CV_SIMD_WIDTH - 1);
-                memset(buf.data(), 0, buf.size() * sizeof(float));
-                float *sum_b = alignPtr(buf.data(), CV_SIMD_WIDTH);
-                float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH);
-                float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH);
-                float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH);
-#if CV_SIMD
-                v_float32 v_one = vx_setall_f32(1.f);
-                v_float32 sindex = vx_setall_f32(scale_index);
-#endif
-                k = 0;
-                for (; k <= maxk-4; k+=4)
-                {
-                    const float* ksptr0 = sptr + space_ofs[k];
-                    const float* ksptr1 = sptr + space_ofs[k+1];
-                    const float* ksptr2 = sptr + space_ofs[k+2];
-                    const float* ksptr3 = sptr + space_ofs[k+3];
-                    const float* rsptr = sptr;
-                    j = 0;
-#if CV_SIMD
-                    v_float32 kweight0 = vx_setall_f32(space_weight[k]);
-                    v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
-                    v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
-                    v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
-                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, rsptr += 3 * v_float32::nlanes,
-                            ksptr0 += 3 * v_float32::nlanes, ksptr1 += 3 * v_float32::nlanes, ksptr2 += 3 * v_float32::nlanes, ksptr3 += 3 * v_float32::nlanes)
-                    {
-                        v_float32 kb, kg, kr, rb, rg, rr;
-                        v_load_deinterleave(rsptr, rb, rg, rr);
-
-                        v_load_deinterleave(ksptr0, kb, kg, kr);
-                        v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
-                        v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
-                        v_int32 idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-                        v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-                        v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
-                        v_float32 v_sum_b = v_muladd(kb & knan, w, vx_load_aligned(sum_b + j));
-                        v_float32 v_sum_g = v_muladd(kg & knan, w, vx_load_aligned(sum_g + j));
-                        v_float32 v_sum_r = v_muladd(kr & knan, w, vx_load_aligned(sum_r + j));
-
-                        v_load_deinterleave(ksptr1, kb, kg, kr);
-                        knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
-                        alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
-                        idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-                        w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-                        v_wsum += w;
-                        v_sum_b = v_muladd(kb & knan, w, v_sum_b);
-                        v_sum_g = v_muladd(kg & knan, w, v_sum_g);
-                        v_sum_r = v_muladd(kr & knan, w, v_sum_r);
-
-                        v_load_deinterleave(ksptr2, kb, kg, kr);
-                        knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
-                        alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
-                        idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-                        w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-                        v_wsum += w;
-                        v_sum_b = v_muladd(kb & knan, w, v_sum_b);
-                        v_sum_g = v_muladd(kg & knan, w, v_sum_g);
-                        v_sum_r = v_muladd(kr & knan, w, v_sum_r);
-
-                        v_load_deinterleave(ksptr3, kb, kg, kr);
-                        knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
-                        alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
-                        idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-                        w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-                        v_wsum += w;
-                        v_sum_b = v_muladd(kb & knan, w, v_sum_b);
-                        v_sum_g = v_muladd(kg & knan, w, v_sum_g);
-                        v_sum_r = v_muladd(kr & knan, w, v_sum_r);
-
-                        v_store_aligned(wsum + j, v_wsum);
-                        v_store_aligned(sum_b + j, v_sum_b);
-                        v_store_aligned(sum_g + j, v_sum_g);
-                        v_store_aligned(sum_r + j, v_sum_r);
-                    }
-#endif
-#if CV_SIMD128
-                    v_float32x4 v_one4 = v_setall_f32(1.f);
-                    v_float32x4 sindex4 = v_setall_f32(scale_index);
-                    v_float32x4 kweight4 = v_load(space_weight + k);
-#endif
-                    for (; j < size.width; j++, rsptr += 3, ksptr0 += 3, ksptr1 += 3, ksptr2 += 3, ksptr3 += 3)
-                    {
-#if CV_SIMD128
-                        v_float32x4 rb = v_setall_f32(rsptr[0]);
-                        v_float32x4 rg = v_setall_f32(rsptr[1]);
-                        v_float32x4 rr = v_setall_f32(rsptr[2]);
-                        v_float32x4 kb(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]);
-                        v_float32x4 kg(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]);
-                        v_float32x4 kr(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]);
-                        v_float32x4 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
-                        v_float32x4 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex4) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
-                        v_int32x4 idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-                        v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan;
-                        wsum[j] += v_reduce_sum(w);
-                        sum_b[j] += v_reduce_sum((kb & knan) * w);
-                        sum_g[j] += v_reduce_sum((kg & knan) * w);
-                        sum_r[j] += v_reduce_sum((kr & knan) * w);
-#else
-                        float rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
-                        bool r_NAN = cvIsNaN(rb) || cvIsNaN(rg) || cvIsNaN(rr);
-
-                        float b = ksptr0[0], g = ksptr0[1], r = ksptr0[2];
-                        bool v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
-                        float alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
-                        int idx = cvFloor(alpha);
-                        alpha -= idx;
-                        if (!v_NAN)
-                        {
-                            float w = space_weight[k] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
-                            wsum[j] += w;
-                            sum_b[j] += b*w;
-                            sum_g[j] += g*w;
-                            sum_r[j] += r*w;
-                        }
-
-                        b = ksptr1[0]; g = ksptr1[1]; r = ksptr1[2];
-                        v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
-                        alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
-                        idx = cvFloor(alpha);
-                        alpha -= idx;
-                        if (!v_NAN)
-                        {
-                            float w = space_weight[k+1] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
-                            wsum[j] += w;
-                            sum_b[j] += b*w;
-                            sum_g[j] += g*w;
-                            sum_r[j] += r*w;
-                        }
-
-                        b = ksptr2[0]; g = ksptr2[1]; r = ksptr2[2];
-                        v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
-                        alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
-                        idx = cvFloor(alpha);
-                        alpha -= idx;
-                        if (!v_NAN)
-                        {
-                            float w = space_weight[k+2] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
-                            wsum[j] += w;
-                            sum_b[j] += b*w;
-                            sum_g[j] += g*w;
-                            sum_r[j] += r*w;
-                        }
-
-                        b = ksptr3[0]; g = ksptr3[1]; r = ksptr3[2];
-                        v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
-                        alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
-                        idx = cvFloor(alpha);
-                        alpha -= idx;
-                        if (!v_NAN)
-                        {
-                            float w = space_weight[k+3] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
-                            wsum[j] += w;
-                            sum_b[j] += b*w;
-                            sum_g[j] += g*w;
-                            sum_r[j] += r*w;
-                        }
-#endif
-                    }
-                }
-                for (; k < maxk; k++)
-                {
-                    const float* ksptr = sptr + space_ofs[k];
-                    const float* rsptr = sptr;
-                    j = 0;
-#if CV_SIMD
-                    v_float32 kweight = vx_setall_f32(space_weight[k]);
-                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, ksptr += 3*v_float32::nlanes, rsptr += 3*v_float32::nlanes)
-                    {
-                        v_float32 kb, kg, kr, rb, rg, rr;
-                        v_load_deinterleave(ksptr, kb, kg, kr);
-                        v_load_deinterleave(rsptr, rb, rg, rr);
-
-                        v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
-                        v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
-                        v_int32 idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-
-                        v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-                        v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
-                        v_store_aligned(sum_b + j, v_muladd(kb & knan, w, vx_load_aligned(sum_b + j)));
-                        v_store_aligned(sum_g + j, v_muladd(kg & knan, w, vx_load_aligned(sum_g + j)));
-                        v_store_aligned(sum_r + j, v_muladd(kr & knan, w, vx_load_aligned(sum_r + j)));
-                    }
-#endif
-                    for (; j < size.width; j++, ksptr += 3, rsptr += 3)
-                    {
-                        float b = ksptr[0], g = ksptr[1], r = ksptr[2];
-                        bool v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
-                        float rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
-                        bool r_NAN = cvIsNaN(rb) || cvIsNaN(rg) || cvIsNaN(rr);
-                        float alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
-                        int idx = cvFloor(alpha);
-                        alpha -= idx;
-                        if (!v_NAN)
-                        {
-                            float w = space_weight[k] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
-                            wsum[j] += w;
-                            sum_b[j] += b*w;
-                            sum_g[j] += g*w;
-                            sum_r[j] += r*w;
-                        }
-                    }
-                }
-                j = 0;
-#if CV_SIMD
-                for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, sptr += 3*v_float32::nlanes, dptr += 3*v_float32::nlanes)
-                {
-                    v_float32 b, g, r;
-                    v_load_deinterleave(sptr, b, g, r);
-                    v_float32 mask = v_not_nan(b) & v_not_nan(g) & v_not_nan(r);
-                    v_float32 w = v_one / (vx_load_aligned(wsum + j) + (v_one & mask));
-                    v_store_interleave(dptr, (vx_load_aligned(sum_b + j) + (b & mask)) * w, (vx_load_aligned(sum_g + j) + (g & mask)) * w, (vx_load_aligned(sum_r + j) + (r & mask)) * w);
-                }
-#endif
-                for (; j < size.width; j++)
-                {
-                    CV_DbgAssert(fabs(wsum[j]) >= 0);
-                    float b = *(sptr++);
-                    float g = *(sptr++);
-                    float r = *(sptr++);
-                    if (cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r))
-                    {
-                        wsum[j] = 1.f / wsum[j];
-                        *(dptr++) = sum_b[j] * wsum[j];
-                        *(dptr++) = sum_g[j] * wsum[j];
-                        *(dptr++) = sum_r[j] * wsum[j];
-                    }
-                    else
-                    {
-                        wsum[j] = 1.f / (wsum[j] + 1.f);
-                        *(dptr++) = (sum_b[j] + b) * wsum[j];
-                        *(dptr++) = (sum_g[j] + g) * wsum[j];
-                        *(dptr++) = (sum_r[j] + r) * wsum[j];
-                    }
-                }
-            }
-        }
-#if CV_SIMD
-        vx_cleanup();
-#endif
-    }
-
-private:
-    int cn, radius, maxk, *space_ofs;
-    const Mat* temp;
-    Mat *dest;
-    float scale_index, *space_weight, *expLUT;
-};
-
-
 static void
 bilateralFilter_32f( const Mat& src, Mat& dst, int d,
                      double sigma_color, double sigma_space,
                      int borderType )
 {
+    CV_INSTRUMENT_REGION();
+
     int cn = src.channels();
     int i, j, maxk, radius;
     double minValSrc=-1, maxValSrc=1;
@@ -1166,7 +229,6 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d,
     int kExpNumBins = 0;
     float lastExpVal = 1.f;
     float len, scale_index;
-    Size size = src.size();
 
     CV_Assert( (src.type() == CV_32FC1 || src.type() == CV_32FC3) && src.data != dst.data );
 
@@ -1236,9 +298,8 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d,
         }
 
     // parallel_for usage
-
-    BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT);
-    parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
+    CV_CPU_DISPATCH(bilateralFilterInvoker_32f, (cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
 #ifdef HAVE_IPP
@@ -1339,9 +400,7 @@
 }
 #endif
 
-}
-
-void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d,
+void bilateralFilter( InputArray _src, OutputArray _dst, int d,
                       double sigmaColor, double sigmaSpace,
                       int borderType )
 {
@@ -1365,4 +424,4 @@ void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d,
         "Bilateral filtering is only implemented for 8u and 32f images" );
 }
 
-/* End of file. */
+} // namespace
diff --git a/modules/imgproc/src/bilateral_filter.simd.hpp b/modules/imgproc/src/bilateral_filter.simd.hpp
index e9181f2182..65abcd4e40 100644
--- a/modules/imgproc/src/bilateral_filter.simd.hpp
+++ b/modules/imgproc/src/bilateral_filter.simd.hpp
@@ -43,18 +43,25 @@
 
 #include "precomp.hpp"
 
-#include <vector>
-
 #include "opencv2/core/hal/intrin.hpp"
-#include "opencl_kernels_imgproc.hpp"
 
 /****************************************************************************************\
                                    Bilateral Filtering
 \****************************************************************************************/
 
-namespace cv
-{
+namespace cv {
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+// forward declarations
+void bilateralFilterInvoker_8u(
+        Mat& dst, const Mat& temp, int radius, int maxk,
+        int* space_ofs, float *space_weight, float *color_weight);
+void bilateralFilterInvoker_32f(
+        int cn, int radius, int maxk, int *space_ofs,
+        const Mat& temp, Mat& dst, float scale_index, float *space_weight, float *expLUT);
+#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
+namespace {
 class BilateralFilter_8u_Invoker :
     public ParallelLoopBody
 {
@@ -68,6 +75,8 @@ public:
 
     virtual void operator() (const Range& range) const CV_OVERRIDE
     {
+        CV_INSTRUMENT_REGION();
+
         int i, j, cn = dest->channels(), k;
         Size size = dest->size();
 
@@ -536,161 +545,20 @@ private:
     float *space_weight, *color_weight;
 };
 
-#ifdef HAVE_OPENCL
+} // namespace anon
 
-static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,
-                                   double sigma_color, double sigma_space,
-                                   int borderType)
+void bilateralFilterInvoker_8u(
+        Mat& dst, const Mat& temp, int radius, int maxk,
+        int* space_ofs, float *space_weight, float *color_weight)
 {
-#ifdef __ANDROID__
-    if (ocl::Device::getDefault().isNVidia())
-        return false;
-#endif
-
-    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
-    int i, j, maxk, radius;
-
-    if (depth != CV_8U || cn > 4)
-        return false;
-
-    if (sigma_color <= 0)
-        sigma_color = 1;
-    if (sigma_space <= 0)
-        sigma_space = 1;
-
-    double gauss_color_coeff = -0.5 / (sigma_color * sigma_color);
-    double gauss_space_coeff = -0.5 / (sigma_space * sigma_space);
-
-    if ( d <= 0 )
-        radius = cvRound(sigma_space * 1.5);
-    else
-        radius = d / 2;
-    radius = MAX(radius, 1);
-    d = radius * 2 + 1;
-
-    UMat src = _src.getUMat(), dst = _dst.getUMat(), temp;
-    if (src.u == dst.u)
-        return false;
-
-    copyMakeBorder(src, temp, radius, radius, radius, radius, borderType);
-    std::vector<float> _space_weight(d * d);
-    std::vector<int> _space_ofs(d * d);
-    float * const space_weight = &_space_weight[0];
-    int * const space_ofs = &_space_ofs[0];
-
-    // initialize space-related bilateral filter coefficients
-    for( i = -radius, maxk = 0; i <= radius; i++ )
-        for( j = -radius; j <= radius; j++ )
-        {
-            double r = std::sqrt((double)i * i + (double)j * j);
-            if ( r > radius )
-                continue;
-            space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
-            space_ofs[maxk++] = (int)(i * temp.step + j * cn);
-        }
-
-    char cvt[3][40];
-    String cnstr = cn > 1 ? format("%d", cn) : "";
-    String kernelName("bilateral");
-    size_t sizeDiv = 1;
-    if ((ocl::Device::getDefault().isIntel()) &&
-        (ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU))
-    {
-            //Intel GPU
-            if (dst.cols % 4 == 0 && cn == 1) // For single channel x4 sized images.
-            {
-                kernelName = "bilateral_float4";
-                sizeDiv = 4;
-            }
-    }
-    ocl::Kernel k(kernelName.c_str(), ocl::imgproc::bilateral_oclsrc,
-            format("-D radius=%d -D maxk=%d -D cn=%d -D int_t=%s -D uint_t=uint%s -D convert_int_t=%s"
-            " -D uchar_t=%s -D float_t=%s -D convert_float_t=%s -D convert_uchar_t=%s -D gauss_color_coeff=(float)%f",
-            radius, maxk, cn, ocl::typeToStr(CV_32SC(cn)), cnstr.c_str(),
-            ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]),
-            ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)),
-            ocl::convertTypeStr(CV_32S, CV_32F, cn, cvt[1]),
-            ocl::convertTypeStr(CV_32F, CV_8U, cn, cvt[2]), gauss_color_coeff));
-    if (k.empty())
-        return false;
-
-    Mat mspace_weight(1, d * d, CV_32FC1, space_weight);
-    Mat mspace_ofs(1, d * d, CV_32SC1, space_ofs);
-    UMat ucolor_weight, uspace_weight, uspace_ofs;
-
-    mspace_weight.copyTo(uspace_weight);
-    mspace_ofs.copyTo(uspace_ofs);
-
-    k.args(ocl::KernelArg::ReadOnlyNoSize(temp), ocl::KernelArg::WriteOnly(dst),
-           ocl::KernelArg::PtrReadOnly(uspace_weight),
-           ocl::KernelArg::PtrReadOnly(uspace_ofs));
-
-    size_t globalsize[2] = { (size_t)dst.cols / sizeDiv, (size_t)dst.rows };
-    return k.run(2, globalsize, NULL, false);
-}
-
-#endif
-
-static void
-bilateralFilter_8u( const Mat& src, Mat& dst, int d,
-                    double sigma_color, double sigma_space,
-                    int borderType )
-{
-    int cn = src.channels();
-    int i, j, maxk, radius;
-    Size size = src.size();
-
-    CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.data != dst.data );
-
-    if( sigma_color <= 0 )
-        sigma_color = 1;
-    if( sigma_space <= 0 )
-        sigma_space = 1;
-
-    double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
-    double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
-
-    if( d <= 0 )
-        radius = cvRound(sigma_space*1.5);
-    else
-        radius = d/2;
-    radius = MAX(radius, 1);
-    d = radius*2 + 1;
-
-    Mat temp;
-    copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
-
-    std::vector<float> _color_weight(cn*256);
-    std::vector<float> _space_weight(d*d);
-    std::vector<int> _space_ofs(d*d);
-    float* color_weight = &_color_weight[0];
-    float* space_weight = &_space_weight[0];
-    int* space_ofs = &_space_ofs[0];
-
-    // initialize color-related bilateral filter coefficients
-
-    for( i = 0; i < 256*cn; i++ )
-        color_weight[i] = (float)std::exp(i*i*gauss_color_coeff);
-
-    // initialize space-related bilateral filter coefficients
-    for( i = -radius, maxk = 0; i <= radius; i++ )
-    {
-        j = -radius;
-
-        for( ; j <= radius; j++ )
-        {
-            double r = std::sqrt((double)i*i + (double)j*j);
-            if( r > radius )
-                continue;
-            space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
-            space_ofs[maxk++] = (int)(i*temp.step + j*cn);
-        }
-    }
-
+    CV_INSTRUMENT_REGION();
     BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight);
-    parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
+    parallel_for_(Range(0, dst.rows), body, dst.total()/(double)(1<<16));
 }
 
+namespace {
+
 class BilateralFilter_32f_Invoker :
     public ParallelLoopBody
 {
@@ -705,6 +573,8 @@ public:
 
     virtual void operator() (const Range& range) const CV_OVERRIDE
     {
+        CV_INSTRUMENT_REGION();
+
         int i, j, k;
         Size size = dest->size();
 
@@ -1153,216 +1023,18 @@ private:
     float scale_index, *space_weight, *expLUT;
 };
 
+} // namespace anon
 
-static void
-bilateralFilter_32f( const Mat& src, Mat& dst, int d,
-                     double sigma_color, double sigma_space,
-                     int borderType )
-{
-    int cn = src.channels();
-    int i, j, maxk, radius;
-    double minValSrc=-1, maxValSrc=1;
-    const int kExpNumBinsPerChannel = 1 << 12;
-    int kExpNumBins = 0;
-    float lastExpVal = 1.f;
-    float len, scale_index;
-    Size size = src.size();
-
-    CV_Assert( (src.type() == CV_32FC1 || src.type() == CV_32FC3) && src.data != dst.data );
-
-    if( sigma_color <= 0 )
-        sigma_color = 1;
-    if( sigma_space <= 0 )
-        sigma_space = 1;
-
-    double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
-    double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
-
-    if( d <= 0 )
-        radius = cvRound(sigma_space*1.5);
-    else
-        radius = d/2;
-    radius = MAX(radius, 1);
-    d = radius*2 + 1;
-    // compute the min/max range for the input image (even if multichannel)
-
-    minMaxLoc( src.reshape(1), &minValSrc, &maxValSrc );
-    if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON)
-    {
-        src.copyTo(dst);
-        return;
-    }
-
-    // temporary copy of the image with borders for easy processing
-    Mat temp;
-    copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
-
-    // allocate lookup tables
-    std::vector<float> _space_weight(d*d);
-    std::vector<int> _space_ofs(d*d);
-    float* space_weight = &_space_weight[0];
-    int* space_ofs = &_space_ofs[0];
-
-    // assign a length which is slightly more than needed
-    len = (float)(maxValSrc - minValSrc) * cn;
-    kExpNumBins = kExpNumBinsPerChannel * cn;
-    std::vector<float> _expLUT(kExpNumBins+2);
-    float* expLUT = &_expLUT[0];
-
-    scale_index = kExpNumBins/len;
-
-    // initialize the exp LUT
-    for( i = 0; i < kExpNumBins+2; i++ )
-    {
-        if( lastExpVal > 0.f )
-        {
-            double val = i / scale_index;
-            expLUT[i] = (float)std::exp(val * val * gauss_color_coeff);
-            lastExpVal = expLUT[i];
-        }
-        else
-            expLUT[i] = 0.f;
-    }
-
-    // initialize space-related bilateral filter coefficients
-    for( i = -radius, maxk = 0; i <= radius; i++ )
-        for( j = -radius; j <= radius; j++ )
-        {
-            double r = std::sqrt((double)i*i + (double)j*j);
-            if( r > radius || ( i == 0 && j == 0 ) )
-                continue;
-            space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
-            space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn);
-        }
-
-    // parallel_for usage
-
-    BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT);
-    parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
-}
-
-#ifdef HAVE_IPP
-#define IPP_BILATERAL_PARALLEL 1
-
-#ifdef HAVE_IPP_IW
-class ipp_bilateralFilterParallel: public ParallelLoopBody
-{
-public:
-    ipp_bilateralFilterParallel(::ipp::IwiImage &_src, ::ipp::IwiImage &_dst, int _radius, Ipp32f _valSquareSigma, Ipp32f _posSquareSigma, ::ipp::IwiBorderType _borderType, bool *_ok):
-        src(_src), dst(_dst)
-    {
-        pOk = _ok;
-
-        radius          = _radius;
-        valSquareSigma  = _valSquareSigma;
-        posSquareSigma  = _posSquareSigma;
-        borderType      = _borderType;
-
-        *pOk = true;
-    }
-    ~ipp_bilateralFilterParallel() {}
-
-    virtual void operator() (const Range& range) const CV_OVERRIDE
-    {
-        if(*pOk == false)
-            return;
-
-        try
-        {
-            ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, dst.m_size.width, range.end - range.start);
-            CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBilateral, src, dst, radius, valSquareSigma, posSquareSigma, ::ipp::IwDefault(), borderType, tile);
-        }
-        catch(const ::ipp::IwException &)
-        {
-            *pOk = false;
-            return;
-        }
-    }
-private:
-    ::ipp::IwiImage &src;
-    ::ipp::IwiImage &dst;
-
-    int                  radius;
-    Ipp32f               valSquareSigma;
-    Ipp32f               posSquareSigma;
-    ::ipp::IwiBorderType borderType;
-
-    bool  *pOk;
-    const ipp_bilateralFilterParallel& operator= (const ipp_bilateralFilterParallel&);
-};
-#endif
-
-static bool ipp_bilateralFilter(Mat &src, Mat &dst, int d, double sigmaColor, double sigmaSpace, int borderType)
-{
-#ifdef HAVE_IPP_IW
-    CV_INSTRUMENT_REGION_IPP();
-
-    int         radius         = IPP_MAX(((d <= 0)?cvRound(sigmaSpace*1.5):d/2), 1);
-    Ipp32f      valSquareSigma = (Ipp32f)((sigmaColor <= 0)?1:sigmaColor*sigmaColor);
-    Ipp32f      posSquareSigma = (Ipp32f)((sigmaSpace <= 0)?1:sigmaSpace*sigmaSpace);
-
-    // Acquire data and begin processing
-    try
-    {
-        ::ipp::IwiImage      iwSrc = ippiGetImage(src);
-        ::ipp::IwiImage      iwDst = ippiGetImage(dst);
-        ::ipp::IwiBorderSize borderSize(radius);
-        ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize));
-        if(!ippBorder)
-            return false;
-
-        const int threads = ippiSuggestThreadsNum(iwDst, 2);
-        if(IPP_BILATERAL_PARALLEL && threads > 1) {
-            bool  ok      = true;
-            Range range(0, (int)iwDst.m_size.height);
-            ipp_bilateralFilterParallel invoker(iwSrc, iwDst, radius, valSquareSigma, posSquareSigma, ippBorder, &ok);
-            if(!ok)
-                return false;
-
-            parallel_for_(range, invoker, threads*4);
-
-            if(!ok)
-                return false;
-        } else {
-            CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBilateral, iwSrc, iwDst, radius, valSquareSigma, posSquareSigma, ::ipp::IwDefault(), ippBorder);
-        }
-    }
-    catch (const ::ipp::IwException &)
-    {
-        return false;
-    }
-    return true;
-#else
-    CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(d); CV_UNUSED(sigmaColor); CV_UNUSED(sigmaSpace); CV_UNUSED(borderType);
-    return false;
-#endif
-}
-#endif
-
-}
-
-void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d,
-                      double sigmaColor, double sigmaSpace,
-                      int borderType )
+void bilateralFilterInvoker_32f(
+        int cn, int radius, int maxk, int *space_ofs,
+        const Mat& temp, Mat& dst, float scale_index, float *space_weight, float *expLUT)
 {
     CV_INSTRUMENT_REGION();
 
-    _dst.create( _src.size(), _src.type() );
-
-    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
-               ocl_bilateralFilter_8u(_src, _dst, d, sigmaColor, sigmaSpace, borderType))
-
-    Mat src = _src.getMat(), dst = _dst.getMat();
-
-    CV_IPP_RUN_FAST(ipp_bilateralFilter(src, dst, d, sigmaColor, sigmaSpace, borderType));
-
-    if( src.depth() == CV_8U )
-        bilateralFilter_8u( src, dst, d, sigmaColor, sigmaSpace, borderType );
-    else if( src.depth() == CV_32F )
-        bilateralFilter_32f( src, dst, d, sigmaColor, sigmaSpace, borderType );
-    else
-        CV_Error( CV_StsUnsupportedFormat,
-        "Bilateral filtering is only implemented for 8u and 32f images" );
+    BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT);
+    parallel_for_(Range(0, dst.rows), body, dst.total()/(double)(1<<16));
 }
 
-/* End of file. */
+#endif
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+} // namespace
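
Editor's note (not part of the patch): the two CV_CPU_DISPATCH call sites above are the heart of this change, so a minimal hand-written sketch of what they resolve to may help readers unfamiliar with OpenCV's dispatch framework. The namespace names below mirror the generated scheme (opt_AVX2, opt_SSE2, cpu_baseline, driven here by ocv_add_dispatched_file(bilateral_filter SSE2 AVX2)), but the dispatcher function itself is illustrative: the real code is produced by the CV_CPU_DISPATCH macro family from the generated bilateral_filter.simd_declarations.hpp, and cv::checkHardwareSupport() merely stands in for the generated CV_CPU_HAS_SUPPORT_* checks.

#include <opencv2/core.hpp>
#include <opencv2/core/utility.hpp>

using cv::Mat;

// Stand-ins for the per-ISA compilations of bilateral_filter.simd.hpp:
// the build compiles that header once per dispatched ISA plus once for the
// baseline, each copy wrapped in its own namespace by
// CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN/END.
namespace opt_AVX2     { void bilateralFilterInvoker_8u(Mat&, const Mat&, int, int, int*, float*, float*) { /* AVX2 build */ } }
namespace opt_SSE2     { void bilateralFilterInvoker_8u(Mat&, const Mat&, int, int, int*, float*, float*) { /* SSE2 build */ } }
namespace cpu_baseline { void bilateralFilterInvoker_8u(Mat&, const Mat&, int, int, int*, float*, float*) { /* baseline build */ } }

// Roughly what CV_CPU_DISPATCH(bilateralFilterInvoker_8u, (...),
// CV_CPU_DISPATCH_MODES_ALL) expands to: call the best variant the host CPU
// supports, falling through to the baseline build.
static void dispatch_bilateralFilterInvoker_8u(Mat& dst, const Mat& temp, int radius, int maxk,
                                               int* space_ofs, float* space_weight, float* color_weight)
{
    if (cv::checkHardwareSupport(CV_CPU_AVX2))
        return opt_AVX2::bilateralFilterInvoker_8u(dst, temp, radius, maxk, space_ofs, space_weight, color_weight);
    if (cv::checkHardwareSupport(CV_CPU_SSE2))
        return opt_SSE2::bilateralFilterInvoker_8u(dst, temp, radius, maxk, space_ofs, space_weight, color_weight);
    cpu_baseline::bilateralFilterInvoker_8u(dst, temp, radius, maxk, space_ofs, space_weight, color_weight);
}

Because the selection happens once per call at the dispatch boundary, the SIMD kernels in bilateral_filter.simd.hpp can be written once against the universal intrinsics (v_float32, v_uint8, ...) and still be compiled and picked up per-ISA without any #ifdef forest at the call site.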