From 6eac8f78b9144cbd311a8e17c3f0ea4f1792b8f4 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin
Date: Sat, 9 Mar 2019 10:56:50 +0000
Subject: [PATCH 1/9] imgproc: copy .simd.hpp

---
 .../src/{bilateral_filter.cpp => bilateral_filter.simd.hpp} | 0
 modules/imgproc/src/{box_filter.cpp => box_filter.simd.hpp} | 0
 modules/imgproc/src/{filter.cpp => filter.simd.hpp} | 0
 modules/imgproc/src/{median_blur.cpp => median_blur.simd.hpp} | 0
 modules/imgproc/src/{morph.cpp => morph.simd.hpp} | 0
 modules/imgproc/src/{smooth.cpp => smooth.simd.hpp} | 0
 6 files changed, 0 insertions(+), 0 deletions(-)
 rename modules/imgproc/src/{bilateral_filter.cpp => bilateral_filter.simd.hpp} (100%)
 rename modules/imgproc/src/{box_filter.cpp => box_filter.simd.hpp} (100%)
 rename modules/imgproc/src/{filter.cpp => filter.simd.hpp} (100%)
 rename modules/imgproc/src/{median_blur.cpp => median_blur.simd.hpp} (100%)
 rename modules/imgproc/src/{morph.cpp => morph.simd.hpp} (100%)
 rename modules/imgproc/src/{smooth.cpp => smooth.simd.hpp} (100%)

diff --git a/modules/imgproc/src/bilateral_filter.cpp b/modules/imgproc/src/bilateral_filter.simd.hpp
similarity index 100%
rename from modules/imgproc/src/bilateral_filter.cpp
rename to modules/imgproc/src/bilateral_filter.simd.hpp
diff --git a/modules/imgproc/src/box_filter.cpp b/modules/imgproc/src/box_filter.simd.hpp
similarity index 100%
rename from modules/imgproc/src/box_filter.cpp
rename to modules/imgproc/src/box_filter.simd.hpp
diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.simd.hpp
similarity index 100%
rename from modules/imgproc/src/filter.cpp
rename to modules/imgproc/src/filter.simd.hpp
diff --git a/modules/imgproc/src/median_blur.cpp b/modules/imgproc/src/median_blur.simd.hpp
similarity index 100%
rename from modules/imgproc/src/median_blur.cpp
rename to modules/imgproc/src/median_blur.simd.hpp
diff --git a/modules/imgproc/src/morph.cpp b/modules/imgproc/src/morph.simd.hpp
similarity index 100%
rename from modules/imgproc/src/morph.cpp
rename to modules/imgproc/src/morph.simd.hpp
diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.simd.hpp
similarity index 100%
rename from modules/imgproc/src/smooth.cpp
rename to modules/imgproc/src/smooth.simd.hpp

From 9dc755408982d4416260d9f7ee6bccc23c2a333d Mon Sep 17 00:00:00 2001
From: Alexander Alekhin
Date: Sat, 9 Mar 2019 10:57:05 +0000
Subject: [PATCH 2/9] imgproc: copy .dispatch.cpp

---
 .../src/{bilateral_filter.cpp => bilateral_filter.dispatch.cpp} | 0
 modules/imgproc/src/{box_filter.cpp => box_filter.dispatch.cpp} | 0
 modules/imgproc/src/{filter.cpp => filter.dispatch.cpp} | 0
 modules/imgproc/src/{median_blur.cpp => median_blur.dispatch.cpp} | 0
 modules/imgproc/src/{morph.cpp => morph.dispatch.cpp} | 0
 modules/imgproc/src/{smooth.cpp => smooth.dispatch.cpp} | 0
 6 files changed, 0 insertions(+), 0 deletions(-)
 rename modules/imgproc/src/{bilateral_filter.cpp => bilateral_filter.dispatch.cpp} (100%)
 rename modules/imgproc/src/{box_filter.cpp => box_filter.dispatch.cpp} (100%)
 rename modules/imgproc/src/{filter.cpp => filter.dispatch.cpp} (100%)
 rename modules/imgproc/src/{median_blur.cpp => median_blur.dispatch.cpp} (100%)
 rename modules/imgproc/src/{morph.cpp => morph.dispatch.cpp} (100%)
 rename modules/imgproc/src/{smooth.cpp => smooth.dispatch.cpp} (100%)

diff --git a/modules/imgproc/src/bilateral_filter.cpp b/modules/imgproc/src/bilateral_filter.dispatch.cpp
similarity index 100%
rename from modules/imgproc/src/bilateral_filter.cpp
rename to
modules/imgproc/src/bilateral_filter.dispatch.cpp diff --git a/modules/imgproc/src/box_filter.cpp b/modules/imgproc/src/box_filter.dispatch.cpp similarity index 100% rename from modules/imgproc/src/box_filter.cpp rename to modules/imgproc/src/box_filter.dispatch.cpp diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.dispatch.cpp similarity index 100% rename from modules/imgproc/src/filter.cpp rename to modules/imgproc/src/filter.dispatch.cpp diff --git a/modules/imgproc/src/median_blur.cpp b/modules/imgproc/src/median_blur.dispatch.cpp similarity index 100% rename from modules/imgproc/src/median_blur.cpp rename to modules/imgproc/src/median_blur.dispatch.cpp diff --git a/modules/imgproc/src/morph.cpp b/modules/imgproc/src/morph.dispatch.cpp similarity index 100% rename from modules/imgproc/src/morph.cpp rename to modules/imgproc/src/morph.dispatch.cpp diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.dispatch.cpp similarity index 100% rename from modules/imgproc/src/smooth.cpp rename to modules/imgproc/src/smooth.dispatch.cpp From 9a8dbfd57fab0b9a7777f4baad0da8d23f8a8756 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sat, 9 Mar 2019 12:21:23 +0000 Subject: [PATCH 3/9] imgproc: dispatch filter.cpp --- modules/imgproc/CMakeLists.txt | 1 + modules/imgproc/src/filter.dispatch.cpp | 2985 +---------------------- modules/imgproc/src/filter.hpp | 2 + modules/imgproc/src/filter.simd.hpp | 1559 ++---------- 4 files changed, 258 insertions(+), 4289 deletions(-) diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt index 6232aa5fab..d3afe151bd 100644 --- a/modules/imgproc/CMakeLists.txt +++ b/modules/imgproc/CMakeLists.txt @@ -1,5 +1,6 @@ set(the_description "Image Processing") ocv_add_dispatched_file(accum SSE4_1 AVX AVX2) +ocv_add_dispatched_file(filter SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_hsv SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_rgb SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_yuv SSE2 SSE4_1 AVX2) diff --git a/modules/imgproc/src/filter.dispatch.cpp b/modules/imgproc/src/filter.dispatch.cpp index 43200218dc..b6f5331028 100644 --- a/modules/imgproc/src/filter.dispatch.cpp +++ b/modules/imgproc/src/filter.dispatch.cpp @@ -47,19 +47,15 @@ #include "opencv2/core/hal/intrin.hpp" #include "filter.hpp" +#include "filter.simd.hpp" +#include "filter.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + /****************************************************************************************\ Base Image Filter \****************************************************************************************/ -#if IPP_VERSION_X100 >= 710 -#define USE_IPP_SEP_FILTERS 1 -#else -#undef USE_IPP_SEP_FILTERS -#endif - -namespace cv -{ +namespace cv { BaseRowFilter::BaseRowFilter() { ksize = anchor = -1; } BaseRowFilter::~BaseRowFilter() {} @@ -163,107 +159,12 @@ void FilterEngine::init( const Ptr& _filter2D, #define VEC_ALIGN CV_MALLOC_ALIGN -int FilterEngine::start(const Size &_wholeSize, const Size &sz, const Point &ofs) +int FilterEngine::start(const Size& _wholeSize, const Size& sz, const Point& ofs) { - int i, j; + CV_INSTRUMENT_REGION(); - wholeSize = _wholeSize; - roi = Rect(ofs, sz); - CV_Assert( roi.x >= 0 && roi.y >= 0 && roi.width >= 0 && roi.height >= 0 && - roi.x + roi.width <= wholeSize.width && - roi.y + roi.height <= wholeSize.height ); - - int esz = (int)getElemSize(srcType); - int bufElemSize = (int)getElemSize(bufType); - const uchar* constVal = 
!constBorderValue.empty() ? &constBorderValue[0] : 0; - - int _maxBufRows = std::max(ksize.height + 3, - std::max(anchor.y, - ksize.height-anchor.y-1)*2+1); - - if( maxWidth < roi.width || _maxBufRows != (int)rows.size() ) - { - rows.resize(_maxBufRows); - maxWidth = std::max(maxWidth, roi.width); - int cn = CV_MAT_CN(srcType); - srcRow.resize(esz*(maxWidth + ksize.width - 1)); - if( columnBorderType == BORDER_CONSTANT ) - { - CV_Assert(constVal != NULL); - constBorderRow.resize(getElemSize(bufType)*(maxWidth + ksize.width - 1 + VEC_ALIGN)); - uchar *dst = alignPtr(&constBorderRow[0], VEC_ALIGN), *tdst; - int n = (int)constBorderValue.size(), N; - N = (maxWidth + ksize.width - 1)*esz; - tdst = isSeparable() ? &srcRow[0] : dst; - - for( i = 0; i < N; i += n ) - { - n = std::min( n, N - i ); - for(j = 0; j < n; j++) - tdst[i+j] = constVal[j]; - } - - if( isSeparable() ) - (*rowFilter)(&srcRow[0], dst, maxWidth, cn); - } - - int maxBufStep = bufElemSize*(int)alignSize(maxWidth + - (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN); - ringBuf.resize(maxBufStep*rows.size()+VEC_ALIGN); - } - - // adjust bufstep so that the used part of the ring buffer stays compact in memory - bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN); - - dx1 = std::max(anchor.x - roi.x, 0); - dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0); - - // recompute border tables - if( dx1 > 0 || dx2 > 0 ) - { - if( rowBorderType == BORDER_CONSTANT ) - { - CV_Assert(constVal != NULL); - int nr = isSeparable() ? 1 : (int)rows.size(); - for( i = 0; i < nr; i++ ) - { - uchar* dst = isSeparable() ? &srcRow[0] : alignPtr(&ringBuf[0],VEC_ALIGN) + bufStep*i; - memcpy( dst, constVal, dx1*esz ); - memcpy( dst + (roi.width + ksize.width - 1 - dx2)*esz, constVal, dx2*esz ); - } - } - else - { - int xofs1 = std::min(roi.x, anchor.x) - roi.x; - - int btab_esz = borderElemSize, wholeWidth = wholeSize.width; - int* btab = (int*)&borderTab[0]; - - for( i = 0; i < dx1; i++ ) - { - int p0 = (borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*btab_esz; - for( j = 0; j < btab_esz; j++ ) - btab[i*btab_esz + j] = p0 + j; - } - - for( i = 0; i < dx2; i++ ) - { - int p0 = (borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*btab_esz; - for( j = 0; j < btab_esz; j++ ) - btab[(i + dx1)*btab_esz + j] = p0 + j; - } - } - } - - rowCount = dstY = 0; - startY = startY0 = std::max(roi.y - anchor.y, 0); - endY = std::min(roi.y + roi.height + ksize.height - anchor.y - 1, wholeSize.height); - if( columnFilter ) - columnFilter->reset(); - if( filter2D ) - filter2D->reset(); - - return startY; + CV_CPU_DISPATCH(FilterEngine__start, (*this, _wholeSize, sz, ofs), + CV_CPU_DISPATCH_MODES_ALL); } @@ -283,126 +184,33 @@ int FilterEngine::remainingOutputRows() const return roi.height - dstY; } -int FilterEngine::proceed( const uchar* src, int srcstep, int count, - uchar* dst, int dststep ) -{ - CV_Assert( wholeSize.width > 0 && wholeSize.height > 0 ); - - const int *btab = &borderTab[0]; - int esz = (int)getElemSize(srcType), btab_esz = borderElemSize; - uchar** brows = &rows[0]; - int bufRows = (int)rows.size(); - int cn = CV_MAT_CN(bufType); - int width = roi.width, kwidth = ksize.width; - int kheight = ksize.height, ay = anchor.y; - int _dx1 = dx1, _dx2 = dx2; - int width1 = roi.width + kwidth - 1; - int xofs1 = std::min(roi.x, anchor.x); - bool isSep = isSeparable(); - bool makeBorder = (_dx1 > 0 || _dx2 > 0) && rowBorderType != BORDER_CONSTANT; - int dy = 
0, i = 0; - - src -= xofs1*esz; - count = std::min(count, remainingInputRows()); - - CV_Assert( src && dst && count > 0 ); - - for(;; dst += dststep*i, dy += i) - { - int dcount = bufRows - ay - startY - rowCount + roi.y; - dcount = dcount > 0 ? dcount : bufRows - kheight + 1; - dcount = std::min(dcount, count); - count -= dcount; - for( ; dcount-- > 0; src += srcstep ) - { - int bi = (startY - startY0 + rowCount) % bufRows; - uchar* brow = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep; - uchar* row = isSep ? &srcRow[0] : brow; - - if( ++rowCount > bufRows ) - { - --rowCount; - ++startY; - } - - memcpy( row + _dx1*esz, src, (width1 - _dx2 - _dx1)*esz ); - - if( makeBorder ) - { - if( btab_esz*(int)sizeof(int) == esz ) - { - const int* isrc = (const int*)src; - int* irow = (int*)row; - - for( i = 0; i < _dx1*btab_esz; i++ ) - irow[i] = isrc[btab[i]]; - for( i = 0; i < _dx2*btab_esz; i++ ) - irow[i + (width1 - _dx2)*btab_esz] = isrc[btab[i+_dx1*btab_esz]]; - } - else - { - for( i = 0; i < _dx1*esz; i++ ) - row[i] = src[btab[i]]; - for( i = 0; i < _dx2*esz; i++ ) - row[i + (width1 - _dx2)*esz] = src[btab[i+_dx1*esz]]; - } - } - - if( isSep ) - (*rowFilter)(row, brow, width, CV_MAT_CN(srcType)); - } - - int max_i = std::min(bufRows, roi.height - (dstY + dy) + (kheight - 1)); - for( i = 0; i < max_i; i++ ) - { - int srcY = borderInterpolate(dstY + dy + i + roi.y - ay, - wholeSize.height, columnBorderType); - if( srcY < 0 ) // can happen only with constant border type - brows[i] = alignPtr(&constBorderRow[0], VEC_ALIGN); - else - { - CV_Assert( srcY >= startY ); - if( srcY >= startY + rowCount ) - break; - int bi = (srcY - startY0) % bufRows; - brows[i] = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep; - } - } - if( i < kheight ) - break; - i -= kheight - 1; - if( isSeparable() ) - (*columnFilter)((const uchar**)brows, dst, dststep, i, roi.width*cn); - else - (*filter2D)((const uchar**)brows, dst, dststep, i, roi.width, cn); - } - - dstY += dy; - CV_Assert( dstY <= roi.height ); - return dy; -} - -void FilterEngine::apply(const Mat& src, Mat& dst, const Size & wsz, const Point & ofs) +int FilterEngine::proceed(const uchar* src, int srcstep, int count, + uchar* dst, int dststep) { CV_INSTRUMENT_REGION(); - CV_Assert( src.type() == srcType && dst.type() == dstType ); + CV_Assert( wholeSize.width > 0 && wholeSize.height > 0 ); - int y = start(src, wsz, ofs); - proceed(src.ptr() + y*src.step, - (int)src.step, - endY - startY, - dst.ptr(), - (int)dst.step ); + CV_CPU_DISPATCH(FilterEngine__proceed, (*this, src, srcstep, count, dst, dststep), + CV_CPU_DISPATCH_MODES_ALL); } +void FilterEngine::apply(const Mat& src, Mat& dst, const Size& wsz, const Point& ofs) +{ + CV_INSTRUMENT_REGION(); + + CV_CheckTypeEQ(src.type(), srcType, ""); + CV_CheckTypeEQ(dst.type(), dstType, ""); + + CV_CPU_DISPATCH(FilterEngine__apply, (*this, src, dst, wsz, ofs), + CV_CPU_DISPATCH_MODES_ALL); } /****************************************************************************************\ * Separable linear filter * \****************************************************************************************/ -int cv::getKernelType(InputArray filter_kernel, Point anchor) +int getKernelType(InputArray filter_kernel, Point anchor) { Mat _kernel = filter_kernel.getMat(); CV_Assert( _kernel.channels() == 1 ); @@ -439,2626 +247,39 @@ int cv::getKernelType(InputArray filter_kernel, Point anchor) } -namespace cv +Ptr getLinearRowFilter( + int srcType, int bufType, + InputArray _kernel, int anchor, + int symmetryType) { + 
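
The three patches so far establish OpenCV's wide-SIMD dispatch convention: the full implementation moves into filter.simd.hpp, which CMake (via the new ocv_add_dispatched_file(filter SSE2 SSE4_1 AVX2) line) compiles once per listed instruction set into a separate namespace, while filter.dispatch.cpp keeps only thin wrappers such as FilterEngine::proceed above that pick the best compiled variant at run time through CV_CPU_DISPATCH. A minimal sketch of that pattern, with hypothetical helper names standing in for the generated machinery:

// Sketch of the CPU-dispatch pattern (hypothetical names; in OpenCV the
// per-ISA namespaces and runtime checks are generated from
// filter.simd_declarations.hpp by ocv_add_dispatched_file).
#include <cstdio>

namespace opt_AVX2     { inline void filter_row() { std::puts("AVX2 build of filter.simd.hpp"); } }
namespace opt_SSE4_1   { inline void filter_row() { std::puts("SSE4.1 build"); } }
namespace cpu_baseline { inline void filter_row() { std::puts("baseline build"); } }

inline bool cpuHas(const char* /*feature*/) { return false; } // stand-in for cv::checkHardwareSupport()

void filter_row_dispatched()
{
    if (cpuHas("AVX2"))   return opt_AVX2::filter_row();   // try the best ISA first
    if (cpuHas("SSE4.1")) return opt_SSE4_1::filter_row();
    cpu_baseline::filter_row();                            // baseline is always compiled in
}
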
CV_INSTRUMENT_REGION(); -struct RowNoVec -{ - RowNoVec() {} - RowNoVec(const Mat&) {} - int operator()(const uchar*, uchar*, int, int) const { return 0; } -}; - -struct ColumnNoVec -{ - ColumnNoVec() {} - ColumnNoVec(const Mat&, int, int, double) {} - int operator()(const uchar**, uchar*, int) const { return 0; } -}; - -struct SymmRowSmallNoVec -{ - SymmRowSmallNoVec() {} - SymmRowSmallNoVec(const Mat&, int) {} - int operator()(const uchar*, uchar*, int, int) const { return 0; } -}; - -struct SymmColumnSmallNoVec -{ - SymmColumnSmallNoVec() {} - SymmColumnSmallNoVec(const Mat&, int, int, double) {} - int operator()(const uchar**, uchar*, int) const { return 0; } -}; - -struct FilterNoVec -{ - FilterNoVec() {} - FilterNoVec(const Mat&, int, double) {} - int operator()(const uchar**, uchar*, int) const { return 0; } -}; - - -#if CV_SIMD - -///////////////////////////////////// 8u-16s & 8u-8u ////////////////////////////////// - -struct RowVec_8u32s -{ - RowVec_8u32s() { smallValues = false; } - RowVec_8u32s( const Mat& _kernel ) - { - kernel = _kernel; - smallValues = true; - int k, ksize = kernel.rows + kernel.cols - 1; - for( k = 0; k < ksize; k++ ) - { - int v = kernel.ptr()[k]; - if( v < SHRT_MIN || v > SHRT_MAX ) - { - smallValues = false; - break; - } - } - } - - int operator()(const uchar* _src, uchar* _dst, int width, int cn) const - { - int i = 0, k, _ksize = kernel.rows + kernel.cols - 1; - int* dst = (int*)_dst; - const int* _kx = kernel.ptr(); - width *= cn; - - if( smallValues ) - { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) - { - const uchar* src = _src + i; - v_int32 s0 = vx_setzero_s32(); - v_int32 s1 = vx_setzero_s32(); - v_int32 s2 = vx_setzero_s32(); - v_int32 s3 = vx_setzero_s32(); - k = 0; - for (; k <= _ksize - 2; k += 2, src += 2 * cn) - { - v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16)); - v_uint8 x0, x1; - v_zip(vx_load(src), vx_load(src + cn), x0, x1); - s0 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f)); - s1 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f)); - s2 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f)); - s3 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f)); - } - if (k < _ksize) - { - v_int32 f = vx_setall_s32(_kx[k]); - v_uint16 x0, x1; - v_expand(vx_load(src), x0, x1); - s0 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f)); - s1 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f)); - s2 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f)); - s3 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f)); - } - v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - v_store(dst + i + 2*v_int32::nlanes, s2); - v_store(dst + i + 3*v_int32::nlanes, s3); - } - if( i <= width - v_uint16::nlanes ) - { - const uchar* src = _src + i; - v_int32 s0 = vx_setzero_s32(); - v_int32 s1 = vx_setzero_s32(); - k = 0; - for( ; k <= _ksize - 2; k += 2, src += 2*cn ) - { - v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16)); - v_uint16 x0, x1; - v_zip(vx_load_expand(src), vx_load_expand(src + cn), x0, x1); - s0 += v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f)); - s1 += v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f)); - } - if( k < _ksize ) - { - v_int32 f = vx_setall_s32(_kx[k]); - v_uint32 x0, x1; - v_expand(vx_load_expand(src), x0, x1); - s0 += 
v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f)); - s1 += v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f)); - } - v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - i += v_uint16::nlanes; - } - if( i <= width - v_uint32::nlanes ) - { - v_int32 d = vx_setzero_s32(); - k = 0; - const uchar* src = _src + i; - for (; k <= _ksize - 2; k += 2, src += 2*cn) - { - v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16)); - v_uint32 x0, x1; - v_zip(vx_load_expand_q(src), vx_load_expand_q(src + cn), x0, x1); - d += v_dotprod(v_pack(v_reinterpret_as_s32(x0), v_reinterpret_as_s32(x1)), v_reinterpret_as_s16(f)); - } - if (k < _ksize) - d += v_dotprod(v_reinterpret_as_s16(vx_load_expand_q(src)), v_reinterpret_as_s16(vx_setall_s32(_kx[k]))); - v_store(dst + i, d); - i += v_uint32::nlanes; - } - } - vx_cleanup(); - return i; - } - - Mat kernel; - bool smallValues; -}; - - -struct SymmRowSmallVec_8u32s -{ - SymmRowSmallVec_8u32s() { smallValues = false; symmetryType = 0; } - SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType ) - { - kernel = _kernel; - symmetryType = _symmetryType; - smallValues = true; - int k, ksize = kernel.rows + kernel.cols - 1; - for( k = 0; k < ksize; k++ ) - { - int v = kernel.ptr()[k]; - if( v < SHRT_MIN || v > SHRT_MAX ) - { - smallValues = false; - break; - } - } - } - - int operator()(const uchar* src, uchar* _dst, int width, int cn) const - { - int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1; - int* dst = (int*)_dst; - bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - const int* kx = kernel.ptr() + _ksize/2; - if( !smallValues ) - return 0; - - src += (_ksize/2)*cn; - width *= cn; - - if( symmetrical ) - { - if( _ksize == 1 ) - return 0; - if( _ksize == 3 ) - { - if( kx[0] == 2 && kx[1] == 1 ) - { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) - { - v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; - v_expand(vx_load(src - cn), x0l, x0h); - v_expand(vx_load(src), x1l, x1h); - v_expand(vx_load(src + cn), x2l, x2h); - x1l = v_add_wrap(v_add_wrap(x1l, x1l), v_add_wrap(x0l, x2l)); - x1h = v_add_wrap(v_add_wrap(x1h, x1h), v_add_wrap(x0h, x2h)); - v_store(dst + i, v_reinterpret_as_s32(v_expand_low(x1l))); - v_store(dst + i + v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x1l))); - v_store(dst + i + 2*v_int32::nlanes, v_reinterpret_as_s32(v_expand_low(x1h))); - v_store(dst + i + 3*v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x1h))); - } - if( i <= width - v_uint16::nlanes ) - { - v_uint16 x = vx_load_expand(src); - x = v_add_wrap(v_add_wrap(x, x), v_add_wrap(vx_load_expand(src - cn), vx_load_expand(src + cn))); - v_store(dst + i, v_reinterpret_as_s32(v_expand_low(x))); - v_store(dst + i + v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x))); - i += v_uint16::nlanes; src += v_uint16::nlanes; - } - if( i <= width - v_uint32::nlanes ) - { - v_uint32 x = vx_load_expand_q(src); - x = (x + x) + vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn); - v_store(dst + i, v_reinterpret_as_s32(x)); - i += v_uint32::nlanes; - } - } - else if( kx[0] == -2 && kx[1] == 1 ) - { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) - { - v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; - v_expand(vx_load(src - cn), x0l, x0h); - v_expand(vx_load(src), x1l, x1h); - v_expand(vx_load(src + cn), x2l, x2h); - x1l = v_sub_wrap(v_add_wrap(x0l, x2l), v_add_wrap(x1l, x1l)); - x1h = v_sub_wrap(v_add_wrap(x0h, x2h), v_add_wrap(x1h, x1h)); - v_store(dst + i, 
v_expand_low(v_reinterpret_as_s16(x1l))); - v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1l))); - v_store(dst + i + 2*v_int32::nlanes, v_expand_low(v_reinterpret_as_s16(x1h))); - v_store(dst + i + 3*v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1h))); - } - if( i <= width - v_uint16::nlanes ) - { - v_uint16 x = vx_load_expand(src); - x = v_sub_wrap(v_add_wrap(vx_load_expand(src - cn), vx_load_expand(src + cn)), v_add_wrap(x, x)); - v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x))); - v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x))); - i += v_uint16::nlanes; src += v_uint16::nlanes; - } - if( i <= width - v_uint32::nlanes ) - { - v_int32 x = v_reinterpret_as_s32(vx_load_expand_q(src)); - x = v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)) - (x + x); - v_store(dst + i, x); - i += v_uint32::nlanes; - } - } - else - { - v_int16 k0 = vx_setall_s16((short)kx[0]); - v_int16 k1 = vx_setall_s16((short)kx[1]); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) - { - v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; - v_expand(vx_load(src - cn), x0l, x0h); - v_expand(vx_load(src), x1l, x1h); - v_expand(vx_load(src + cn), x2l, x2h); - - v_int32 dl, dh; - v_int16 x0, x1; - v_mul_expand(v_reinterpret_as_s16(x1l), k0, dl, dh); - v_zip(v_reinterpret_as_s16(x0l), v_reinterpret_as_s16(x2l), x0, x1); - dl += v_dotprod(x0, k1); - dh += v_dotprod(x1, k1); - v_store(dst + i, dl); - v_store(dst + i + v_int32::nlanes, dh); - - v_mul_expand(v_reinterpret_as_s16(x1h), k0, dl, dh); - v_zip(v_reinterpret_as_s16(x0h), v_reinterpret_as_s16(x2h), x0, x1); - dl += v_dotprod(x0, k1); - dh += v_dotprod(x1, k1); - v_store(dst + i + 2*v_int32::nlanes, dl); - v_store(dst + i + 3*v_int32::nlanes, dh); - } - if ( i <= width - v_uint16::nlanes ) - { - v_int32 dl, dh; - v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, dl, dh); - v_int16 x0, x1; - v_zip(v_reinterpret_as_s16(vx_load_expand(src - cn)), v_reinterpret_as_s16(vx_load_expand(src + cn)), x0, x1); - dl += v_dotprod(x0, k1); - dh += v_dotprod(x1, k1); - v_store(dst + i, dl); - v_store(dst + i + v_int32::nlanes, dh); - i += v_uint16::nlanes; src += v_uint16::nlanes; - } - if ( i <= width - v_uint32::nlanes ) - { - v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]), v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)) * vx_setall_s32(kx[1]))); - i += v_uint32::nlanes; - } - } - } - else if( _ksize == 5 ) - { - if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 ) - { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) - { - v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; - v_expand(vx_load(src - 2*cn), x0l, x0h); - v_expand(vx_load(src), x1l, x1h); - v_expand(vx_load(src + 2*cn), x2l, x2h); - x1l = v_sub_wrap(v_add_wrap(x0l, x2l), v_add_wrap(x1l, x1l)); - x1h = v_sub_wrap(v_add_wrap(x0h, x2h), v_add_wrap(x1h, x1h)); - v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x1l))); - v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1l))); - v_store(dst + i + 2*v_int32::nlanes, v_expand_low(v_reinterpret_as_s16(x1h))); - v_store(dst + i + 3*v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1h))); - } - if( i <= width - v_uint16::nlanes ) - { - v_uint16 x = vx_load_expand(src); - x = v_sub_wrap(v_add_wrap(vx_load_expand(src - 2*cn), vx_load_expand(src + 2*cn)), v_add_wrap(x, x)); - v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x))); - 
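
The branches here hard-code the second-derivative kernels used by the Sobel/Laplacian family ({1, -2, 1} and {1, 0, -2, 0, 1}): because the inputs are 8-bit, every intermediate fits in 16 bits (|a + b - 2c| <= 510), which is why the vector code can stay in cheap wrap-around uint16 arithmetic before widening to int32. A scalar model of one such pass, assuming the row borders were already extrapolated by FilterEngine:

// Scalar model of the kx = {1, 0, -2, 0, 1} row pass above (d = 2*cn);
// the {1, -2, 1} case is identical with d = cn. Borders are assumed
// pre-extrapolated, as FilterEngine guarantees.
static void row_deriv2_u8(const unsigned char* src, int* dst, int width, int cn, int d)
{
    for (int i = 0; i < width * cn; i++)
        dst[i] = src[i - d] + src[i + d] - 2 * src[i]; // stays in [-510, 510]
}
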
v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x))); - i += v_uint16::nlanes; src += v_uint16::nlanes; - } - if( i <= width - v_uint32::nlanes ) - { - v_int32 x = v_reinterpret_as_s32(vx_load_expand_q(src)); - x = v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn) + vx_load_expand_q(src + 2*cn)) - (x + x); - v_store(dst + i, x); - i += v_uint32::nlanes; - } - } - else - { - v_int16 k0 = vx_setall_s16((short)(kx[0])); - v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (kx[2] << 16))); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) - { - v_int32 x0, x1, x2, x3; - v_uint16 x0l, x0h, x1l, x1h, x2l, x2h, x3l, x3h; - v_int16 xl, xh; - - v_expand(vx_load(src), x0l, x0h); - v_mul_expand(v_reinterpret_as_s16(x0l), k0, x0, x1); - v_mul_expand(v_reinterpret_as_s16(x0h), k0, x2, x3); - - v_expand(vx_load(src - cn), x0l, x0h); - v_expand(vx_load(src + cn), x1l, x1h); - v_expand(vx_load(src - 2*cn), x2l, x2h); - v_expand(vx_load(src + 2*cn), x3l, x3h); - v_zip(v_reinterpret_as_s16(x0l + x1l), v_reinterpret_as_s16(x2l + x3l), xl, xh); - x0 += v_dotprod(xl, k12); - x1 += v_dotprod(xh, k12); - v_zip(v_reinterpret_as_s16(x0h + x1h), v_reinterpret_as_s16(x2h + x3h), xl, xh); - x2 += v_dotprod(xl, k12); - x3 += v_dotprod(xh, k12); - - v_store(dst + i, x0); - v_store(dst + i + v_int32::nlanes, x1); - v_store(dst + i + 2*v_int32::nlanes, x2); - v_store(dst + i + 3*v_int32::nlanes, x3); - } - if( i <= width - v_uint16::nlanes ) - { - v_int32 x1, x2; - v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, x1, x2); - - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(vx_load_expand(src - cn) + vx_load_expand(src + cn)), v_reinterpret_as_s16(vx_load_expand(src - 2*cn) + vx_load_expand(src + 2*cn)), xl, xh); - x1 += v_dotprod(xl, k12); - x2 += v_dotprod(xh, k12); - - v_store(dst + i, x1); - v_store(dst + i + v_int32::nlanes, x2); - i += v_uint16::nlanes, src += v_uint16::nlanes; - } - if( i <= width - v_uint32::nlanes ) - { - v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]), - v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]), - v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn) + vx_load_expand_q(src + 2*cn)) * vx_setall_s32(kx[2])))); - i += v_uint32::nlanes; - } - } - } - else - { - v_int16 k0 = vx_setall_s16((short)(kx[0])); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) - { - v_uint8 v_src = vx_load(src); - v_int32 s0, s1, s2, s3; - v_mul_expand(v_reinterpret_as_s16(v_expand_low(v_src)), k0, s0, s1); - v_mul_expand(v_reinterpret_as_s16(v_expand_high(v_src)), k0, s2, s3); - for (k = 1, j = cn; k <= _ksize / 2 - 1; k += 2, j += 2 * cn) - { - v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16))); - - v_uint8 v_src0 = vx_load(src - j); - v_uint8 v_src1 = vx_load(src - j - cn); - v_uint8 v_src2 = vx_load(src + j); - v_uint8 v_src3 = vx_load(src + j + cn); - - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(v_expand_low(v_src0) + v_expand_low(v_src2)), v_reinterpret_as_s16(v_expand_low(v_src1) + v_expand_low(v_src3)), xl, xh); - s0 += v_dotprod(xl, k12); - s1 += v_dotprod(xh, k12); - v_zip(v_reinterpret_as_s16(v_expand_high(v_src0) + v_expand_high(v_src2)), v_reinterpret_as_s16(v_expand_high(v_src1) + v_expand_high(v_src3)), xl, xh); - s2 += v_dotprod(xl, k12); - s3 += v_dotprod(xh, k12); - } - if( k < _ksize / 2 + 1 ) - { - v_int16 k1 = vx_setall_s16((short)(kx[k])); 
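
A recurring trick in the general symmetric path above: two adjacent 16-bit taps are packed into every 32-bit lane via (kx[k] & 0xFFFF) | (kx[k + 1] << 16), and v_zip interleaves the two source vectors to match, so a single v_dotprod consumes two kernel taps per lane. What one lane computes, in scalar form:

// One 32-bit lane of v_dotprod after the v_zip / packed-tap setup:
// two adjacent 16-bit products summed into a 32-bit accumulator.
static int dotprod_lane(short a, short b, int packedTaps)
{
    short k0 = (short)(packedTaps & 0xFFFF); // kx[k]
    short k1 = (short)(packedTaps >> 16);    // kx[k + 1]
    return a * k0 + b * k1;
}
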
- - v_uint8 v_src0 = vx_load(src - j); - v_uint8 v_src1 = vx_load(src + j); - - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(v_expand_low(v_src0)), v_reinterpret_as_s16(v_expand_low(v_src1)), xl, xh); - s0 += v_dotprod(xl, k1); - s1 += v_dotprod(xh, k1); - v_zip(v_reinterpret_as_s16(v_expand_high(v_src0)), v_reinterpret_as_s16(v_expand_high(v_src1)), xl, xh); - s2 += v_dotprod(xl, k1); - s3 += v_dotprod(xh, k1); - } - v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - v_store(dst + i + 2*v_int32::nlanes, s2); - v_store(dst + i + 3*v_int32::nlanes, s3); - } - if( i <= width - v_uint16::nlanes ) - { - v_int32 s0, s1; - v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, s0, s1); - for (k = 1, j = cn; k <= _ksize / 2 - 1; k+=2, j += 2*cn) - { - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(vx_load_expand(src - j) + vx_load_expand(src + j)), v_reinterpret_as_s16(vx_load_expand(src - j - cn) + vx_load_expand(src + j + cn)), xl, xh); - v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k+1] << 16))); - s0 += v_dotprod(xl, k12); - s1 += v_dotprod(xh, k12); - } - if ( k < _ksize / 2 + 1 ) - { - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(vx_load_expand(src - j)), v_reinterpret_as_s16(vx_load_expand(src + j)), xl, xh); - v_int16 k1 = vx_setall_s16((short)(kx[k])); - s0 += v_dotprod(xl, k1); - s1 += v_dotprod(xh, k1); - } - v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - i += v_uint16::nlanes; src += v_uint16::nlanes; - } - if( i <= width - v_uint32::nlanes ) - { - v_int32 s0 = v_reinterpret_as_s32(vx_load_expand_q(src)) * vx_setall_s32(kx[0]); - for( k = 1, j = cn; k < _ksize / 2 + 1; k++, j += cn ) - s0 = v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src - j) + vx_load_expand_q(src + j)), vx_setall_s32(kx[k]), s0); - v_store(dst + i, s0); - i += v_uint32::nlanes; - } - } - } - else - { - if( _ksize == 3 ) - { - if( kx[0] == 0 && kx[1] == 1 ) - { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) - { - v_uint16 x0l, x0h, x2l, x2h; - v_expand(vx_load(src - cn), x0l, x0h); - v_expand(vx_load(src + cn), x2l, x2h); - v_int16 dl = v_reinterpret_as_s16(v_sub_wrap(x2l, x0l)); - v_int16 dh = v_reinterpret_as_s16(v_sub_wrap(x2h, x0h)); - v_store(dst + i, v_expand_low(dl)); - v_store(dst + i + v_int32::nlanes, v_expand_high(dl)); - v_store(dst + i + 2*v_int32::nlanes, v_expand_low(dh)); - v_store(dst + i + 3*v_int32::nlanes, v_expand_high(dh)); - } - if( i <= width - v_uint16::nlanes ) - { - v_int16 dl = v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + cn), vx_load_expand(src - cn))); - v_store(dst + i, v_expand_low(dl)); - v_store(dst + i + v_int32::nlanes, v_expand_high(dl)); - i += v_uint16::nlanes; src += v_uint16::nlanes; - } - if (i <= width - v_uint32::nlanes) - { - v_store(dst + i, v_reinterpret_as_s32(vx_load_expand_q(src + cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - cn))); - i += v_uint32::nlanes; - } - } - else - { - v_int16 k0 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (-kx[1] << 16))); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) - { - v_uint16 x0l, x0h, x2l, x2h; - v_expand(vx_load(src - cn), x0l, x0h); - v_expand(vx_load(src + cn), x2l, x2h); - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(x2l), v_reinterpret_as_s16(x0l), xl, xh); - v_store(dst + i, v_dotprod(xl, k0)); - v_store(dst + i + v_int32::nlanes, v_dotprod(xh, k0)); - v_zip(v_reinterpret_as_s16(x2h), v_reinterpret_as_s16(x0h), xl, xh); - v_store(dst + i + 2*v_int32::nlanes, 
v_dotprod(xl, k0)); - v_store(dst + i + 3*v_int32::nlanes, v_dotprod(xh, k0)); - } - if( i <= width - v_uint16::nlanes ) - { - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(vx_load_expand(src + cn)), v_reinterpret_as_s16(vx_load_expand(src - cn)), xl, xh); - v_store(dst + i, v_dotprod(xl, k0)); - v_store(dst + i + v_int32::nlanes, v_dotprod(xh, k0)); - i += v_uint16::nlanes; src += v_uint16::nlanes; - } - if (i <= width - v_uint32::nlanes) - { - v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]), v_reinterpret_as_s32(vx_load_expand_q(src - cn)) * vx_setall_s32(-kx[1]))); - i += v_uint32::nlanes; - } - } - } - else if( _ksize == 5 ) - { - v_int16 k0 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (kx[2] << 16))); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) - { - v_uint16 x0l, x0h, x1l, x1h, x2l, x2h, x3l, x3h; - v_expand(vx_load(src - cn), x0l, x0h); - v_expand(vx_load(src - 2*cn), x1l, x1h); - v_expand(vx_load(src + cn), x2l, x2h); - v_expand(vx_load(src + 2*cn), x3l, x3h); - v_int16 x0, x1; - v_zip(v_reinterpret_as_s16(v_sub_wrap(x2l, x0l)), v_reinterpret_as_s16(v_sub_wrap(x3l, x1l)), x0, x1); - v_store(dst + i, v_dotprod(x0, k0)); - v_store(dst + i + v_int32::nlanes, v_dotprod(x1, k0)); - v_zip(v_reinterpret_as_s16(v_sub_wrap(x2h, x0h)), v_reinterpret_as_s16(v_sub_wrap(x3h, x1h)), x0, x1); - v_store(dst + i + 2*v_int32::nlanes, v_dotprod(x0, k0)); - v_store(dst + i + 3*v_int32::nlanes, v_dotprod(x1, k0)); - } - if( i <= width - v_uint16::nlanes ) - { - v_int16 x0, x1; - v_zip(v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + cn), vx_load_expand(src - cn))), - v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + 2*cn), vx_load_expand(src - 2*cn))), x0, x1); - v_store(dst + i, v_dotprod(x0, k0)); - v_store(dst + i + v_int32::nlanes, v_dotprod(x1, k0)); - i += v_uint16::nlanes; src += v_uint16::nlanes; - } - if( i <= width - v_uint32::nlanes ) - { - v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - cn)), vx_setall_s32(kx[1]), - (v_reinterpret_as_s32(vx_load_expand_q(src + 2*cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn))) * vx_setall_s32(kx[2]))); - i += v_uint32::nlanes; - } - } - else - { - v_int16 k0 = vx_setall_s16((short)(kx[0])); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) - { - v_uint8 v_src = vx_load(src); - v_int32 s0, s1, s2, s3; - v_mul_expand(v_reinterpret_as_s16(v_expand_low(v_src)), k0, s0, s1); - v_mul_expand(v_reinterpret_as_s16(v_expand_high(v_src)), k0, s2, s3); - for( k = 1, j = cn; k <= _ksize / 2 - 1; k += 2, j += 2 * cn ) - { - v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16))); - - v_uint8 v_src0 = vx_load(src - j); - v_uint8 v_src1 = vx_load(src - j - cn); - v_uint8 v_src2 = vx_load(src + j); - v_uint8 v_src3 = vx_load(src + j + cn); - - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(v_sub_wrap(v_expand_low(v_src2), v_expand_low(v_src0))), v_reinterpret_as_s16(v_sub_wrap(v_expand_low(v_src3), v_expand_low(v_src1))), xl, xh); - s0 += v_dotprod(xl, k12); - s1 += v_dotprod(xh, k12); - v_zip(v_reinterpret_as_s16(v_sub_wrap(v_expand_high(v_src2), v_expand_high(v_src0))), v_reinterpret_as_s16(v_sub_wrap(v_expand_high(v_src3), v_expand_high(v_src1))), xl, xh); - s2 += v_dotprod(xl, k12); - s3 += v_dotprod(xh, k12); - } - if( k < _ksize / 2 + 1 ) - { - v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | 
(-kx[k] << 16))); - v_uint8 v_src0 = vx_load(src - j); - v_uint8 v_src1 = vx_load(src + j); - - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(v_expand_low(v_src1)), v_reinterpret_as_s16(v_expand_low(v_src0)), xl, xh); - s0 += v_dotprod(xl, k12); - s1 += v_dotprod(xh, k12); - v_zip(v_reinterpret_as_s16(v_expand_high(v_src1)), v_reinterpret_as_s16(v_expand_high(v_src0)), xl, xh); - s2 += v_dotprod(xl, k12); - s3 += v_dotprod(xh, k12); - } - v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - v_store(dst + i + 2*v_int32::nlanes, s2); - v_store(dst + i + 3*v_int32::nlanes, s3); - } - if( i <= width - v_uint16::nlanes ) - { - v_int32 s0, s1; - v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, s0, s1); - for( k = 1, j = cn; k <= _ksize / 2 - 1; k += 2, j += 2 * cn ) - { - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + j), vx_load_expand(src - j))), v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + j + cn), vx_load_expand(src - j - cn))), xl, xh); - v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16))); - s0 += v_dotprod(xl, k12); - s1 += v_dotprod(xh, k12); - } - if( k < _ksize / 2 + 1 ) - { - v_int16 k1 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (-kx[k] << 16))); - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(vx_load_expand(src + j)), v_reinterpret_as_s16(vx_load_expand(src - j)), xl, xh); - s0 += v_dotprod(xl, k1); - s1 += v_dotprod(xh, k1); - } - v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - i += v_uint16::nlanes; src += v_uint16::nlanes; - } - if( i <= width - v_uint32::nlanes ) - { - v_int32 s0 = v_reinterpret_as_s32(vx_load_expand_q(src)) * vx_setall_s32(kx[0]); - for (k = 1, j = cn; k < _ksize / 2 + 1; k++, j += cn) - s0 = v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + j)) - v_reinterpret_as_s32(vx_load_expand_q(src - j)), vx_setall_s32(kx[k]), s0); - v_store(dst + i, s0); - i += v_uint32::nlanes; - } - } - } - - vx_cleanup(); - return i; - } - - Mat kernel; - int symmetryType; - bool smallValues; -}; - - -struct SymmColumnVec_32s8u -{ - SymmColumnVec_32s8u() { symmetryType=0; delta = 0; } - SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta) - { - symmetryType = _symmetryType; - _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); - delta = (float)(_delta/(1 << _bits)); - CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); - } - - int operator()(const uchar** _src, uchar* dst, int width) const - { - int _ksize = kernel.rows + kernel.cols - 1; - if( _ksize == 1 ) - return 0; - int ksize2 = _ksize/2; - const float* ky = kernel.ptr() + ksize2; - int i = 0, k; - bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - const int** src = (const int**)_src; - - v_float32 d4 = vx_setall_f32(delta); - if( symmetrical ) - { - v_float32 f0 = vx_setall_f32(ky[0]); - v_float32 f1 = vx_setall_f32(ky[1]); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) - { - const int* S = src[0] + i; - v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4); - v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4); - v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S + 2*v_int32::nlanes)), f0, d4); - v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S + 3*v_int32::nlanes)), f0, d4); - const int* S0 = src[1] + i; - const int* S1 = src[-1] + i; - s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f1, s0); - s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f1, s1); - s2 = 
v_muladd(v_cvt_f32(vx_load(S0 + 2 * v_int32::nlanes) + vx_load(S1 + 2 * v_int32::nlanes)), f1, s2); - s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3 * v_int32::nlanes) + vx_load(S1 + 3 * v_int32::nlanes)), f1, s3); - for( k = 2; k <= ksize2; k++ ) - { - v_float32 f = vx_setall_f32(ky[k]); - S0 = src[k] + i; - S1 = src[-k] + i; - s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0); - s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1); - s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) + vx_load(S1 + 2*v_int32::nlanes)), f, s2); - s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) + vx_load(S1 + 3*v_int32::nlanes)), f, s3); - } - v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3)))); - } - if( i <= width - v_uint16::nlanes ) - { - const int* S = src[0] + i; - v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4); - v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4); - const int* S0 = src[1] + i; - const int* S1 = src[-1] + i; - s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f1, s0); - s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f1, s1); - for( k = 2; k <= ksize2; k++ ) - { - v_float32 f = vx_setall_f32(ky[k]); - S0 = src[k] + i; - S1 = src[-k] + i; - s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0); - s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1); - } - v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1))); - i += v_uint16::nlanes; - } -#if CV_SIMD_WIDTH > 16 - while( i <= width - v_int32x4::nlanes ) -#else - if( i <= width - v_int32x4::nlanes ) -#endif - { - v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[0] + i)), v_setall_f32(ky[0]), v_setall_f32(delta)); - s0 = v_muladd(v_cvt_f32(v_load(src[1] + i) + v_load(src[-1] + i)), v_setall_f32(ky[1]), s0); - for( k = 2; k <= ksize2; k++ ) - s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) + v_load(src[-k] + i)), v_setall_f32(ky[k]), s0); - v_int32x4 s32 = v_round(s0); - v_int16x8 s16 = v_pack(s32, s32); - *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0(); - i += v_int32x4::nlanes; - } - } - else - { - v_float32 f1 = vx_setall_f32(ky[1]); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) - { - const int* S0 = src[1] + i; - const int* S1 = src[-1] + i; - v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f1, d4); - v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f1, d4); - v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2 * v_int32::nlanes) - vx_load(S1 + 2 * v_int32::nlanes)), f1, d4); - v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3 * v_int32::nlanes) - vx_load(S1 + 3 * v_int32::nlanes)), f1, d4); - for ( k = 2; k <= ksize2; k++ ) - { - v_float32 f = vx_setall_f32(ky[k]); - S0 = src[k] + i; - S1 = src[-k] + i; - s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0); - s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1); - s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) - vx_load(S1 + 2*v_int32::nlanes)), f, s2); - s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) - vx_load(S1 + 3*v_int32::nlanes)), f, s3); - } - v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3)))); - } - if( i <= width - v_uint16::nlanes ) - { - const int* S0 = src[1] + i; - const int* S1 = src[-1] + i; - v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), 
f1, d4); - v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f1, d4); - for ( k = 2; k <= ksize2; k++ ) - { - v_float32 f = vx_setall_f32(ky[k]); - S0 = src[k] + i; - S1 = src[-k] + i; - s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0); - s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1); - } - v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1))); - i += v_uint16::nlanes; - } -#if CV_SIMD_WIDTH > 16 - while( i <= width - v_int32x4::nlanes ) -#else - if( i <= width - v_int32x4::nlanes ) -#endif - { - v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[1] + i) - v_load(src[-1] + i)), v_setall_f32(ky[1]), v_setall_f32(delta)); - for (k = 2; k <= ksize2; k++) - s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) - v_load(src[-k] + i)), v_setall_f32(ky[k]), s0); - v_int32x4 s32 = v_round(s0); - v_int16x8 s16 = v_pack(s32, s32); - *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0(); - i += v_int32x4::nlanes; - } - } - - vx_cleanup(); - return i; - } - - int symmetryType; - float delta; - Mat kernel; -}; - - -struct SymmColumnSmallVec_32s16s -{ - SymmColumnSmallVec_32s16s() { symmetryType=0; delta = 0; } - SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta) - { - symmetryType = _symmetryType; - _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); - delta = (float)(_delta/(1 << _bits)); - CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); - } - - int operator()(const uchar** _src, uchar* _dst, int width) const - { - int ksize2 = (kernel.rows + kernel.cols - 1)/2; - const float* ky = kernel.ptr() + ksize2; - int i = 0; - bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - const int** src = (const int**)_src; - const int *S0 = src[-1], *S1 = src[0], *S2 = src[1]; - short* dst = (short*)_dst; - - v_float32 df4 = vx_setall_f32(delta); - int d = cvRound(delta); - v_int16 d8 = vx_setall_s16((short)d); - if( symmetrical ) - { - if( ky[0] == 2 && ky[1] == 1 ) - { - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) - { - v_int32 s0 = vx_load(S1 + i); - v_int32 s1 = vx_load(S1 + i + v_int32::nlanes); - v_int32 s2 = vx_load(S1 + i + 2*v_int32::nlanes); - v_int32 s3 = vx_load(S1 + i + 3*v_int32::nlanes); - v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + (s0 + s0), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + (s1 + s1)) + d8); - v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes) + (s2 + s2), - vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes) + (s3 + s3)) + d8); - } - if( i <= width - v_int16::nlanes ) - { - v_int32 sl = vx_load(S1 + i); - v_int32 sh = vx_load(S1 + i + v_int32::nlanes); - v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + (sh + sh)) + d8); - i += v_int16::nlanes; - } - if( i <= width - v_int32::nlanes ) - { - v_int32 s = vx_load(S1 + i); - v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + vx_setall_s32(d) + (s + s)); - i += v_int32::nlanes; - } - } - else if( ky[0] == -2 && ky[1] == 1 ) - { - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) - { - v_int32 s0 = vx_load(S1 + i); - v_int32 s1 = vx_load(S1 + i + v_int32::nlanes); - v_int32 s2 = vx_load(S1 + i + 2*v_int32::nlanes); - v_int32 s3 = vx_load(S1 + i + 3*v_int32::nlanes); - v_store(dst + i, 
v_pack(vx_load(S0 + i) + vx_load(S2 + i) - (s0 + s0), - vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) - (s1 + s1)) + d8); - v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes) - (s2 + s2), - vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes) - (s3 + s3)) + d8); - } - if( i <= width - v_int16::nlanes ) - { - v_int32 sl = vx_load(S1 + i); - v_int32 sh = vx_load(S1 + i + v_int32::nlanes); - v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) - (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) - (sh + sh)) + d8); - i += v_int16::nlanes; - } - if( i <= width - v_int32::nlanes ) - { - v_int32 s = vx_load(S1 + i); - v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + vx_setall_s32(d) - (s + s)); - i += v_int32::nlanes; - } - } -#if CV_NEON - else if( ky[0] == (float)((int)ky[0]) && ky[1] == (float)((int)ky[1]) ) - { - v_int32 k0 = vx_setall_s32((int)ky[0]), k1 = vx_setall_s32((int)ky[1]); - v_int32 d4 = vx_setall_s32(d); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) - { - v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)), - v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4)))); - v_store(dst + i + v_int16::nlanes, v_pack(v_muladd(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 2*v_int32::nlanes), k0, d4)), - v_muladd(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 3*v_int32::nlanes), k0, d4)))); - } - if( i <= width - v_int16::nlanes ) - { - v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)), - v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4)))); - i += v_int16::nlanes; - } - if( i <= width - v_int32::nlanes ) - { - v_pack_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4))); - i += v_int32::nlanes; - } - } -#endif - else - { - v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) - { - v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), - v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4))))); - v_store(dst + i + v_int16::nlanes, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 2*v_int32::nlanes)), k0, df4))), - v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 3*v_int32::nlanes)), k0, df4))))); - } - if( i <= width - v_int16::nlanes ) - { - v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), - v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4))))); - i += v_int16::nlanes; - } - if( i <= width - v_int32::nlanes ) - { - 
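
These 32s-to-16s column filters complete a fixed-point pipeline: the row pass accumulated integer taps scaled by (1 << _bits), and the constructor's convertTo(kernel, CV_32F, 1./(1 << _bits)) folds the inverse scale (and the same rescaling of delta) into float column taps. A minimal scalar model of the symmetric 3-tap case, with the saturating v_pack of the real code reduced to a cast:

#include <cmath>

// s_above / s_center / s_below are int32 row sums carrying a (1 << bits)
// scale from the integer row kernel; ky0/ky1 are the rescaled float taps
// (ky_int / (1 << bits)), delta likewise. The vector code saturates via v_pack.
static short column3_32s16s(int s_above, int s_center, int s_below,
                            float ky0, float ky1, float delta)
{
    return (short)std::lround(s_center * ky0 + (s_above + s_below) * ky1 + delta);
}
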
v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4)))); - i += v_int32::nlanes; - } - } - } - else - { - if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] ) - { - if( ky[1] < 0 ) - std::swap(S0, S2); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) - { - v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i), vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)) + d8); - v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S2 + i + 2*v_int32::nlanes) - vx_load(S0 + i + 2*v_int32::nlanes), vx_load(S2 + i + 3*v_int32::nlanes) - vx_load(S0 + i + 3*v_int32::nlanes)) + d8); - } - if( i <= width - v_int16::nlanes ) - { - v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i), vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)) + d8); - i += v_int16::nlanes; - } - if( i <= width - v_int32::nlanes ) - { - v_pack_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + vx_setall_s32(d)); - i += v_int32::nlanes; - } - } - else - { - v_float32 k1 = vx_setall_f32(ky[1]); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) - { - v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)), - v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4)))); - v_store(dst + i + v_int16::nlanes, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + 2*v_int32::nlanes) - vx_load(S0 + i + 2*v_int32::nlanes)), k1, df4)), - v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + 3*v_int32::nlanes) - vx_load(S0 + i + 3*v_int32::nlanes)), k1, df4)))); - } - if( i <= width - v_int16::nlanes ) - { - v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)), - v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4)))); - i += v_int16::nlanes; - } - if( i <= width - v_int32::nlanes ) - { - v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4))); - i += v_int32::nlanes; - } - } - } - - vx_cleanup(); - return i; - } - - int symmetryType; - float delta; - Mat kernel; -}; - - -/////////////////////////////////////// 16s ////////////////////////////////// - -struct RowVec_16s32f -{ - RowVec_16s32f() {} - RowVec_16s32f( const Mat& _kernel ) - { - kernel = _kernel; - } - - int operator()(const uchar* _src, uchar* _dst, int width, int cn) const - { - int i = 0, k, _ksize = kernel.rows + kernel.cols - 1; - float* dst = (float*)_dst; - const float* _kx = kernel.ptr(); - width *= cn; - - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) - { - const short* src = (const short*)_src + i; - v_float32 s0 = vx_setzero_f32(); - v_float32 s1 = vx_setzero_f32(); - v_float32 s2 = vx_setzero_f32(); - v_float32 s3 = vx_setzero_f32(); - for( k = 0; k < _ksize; k++, src += cn ) - { - v_float32 f = vx_setall_f32(_kx[k]); - v_int16 xl = vx_load(src); - v_int16 xh = vx_load(src + v_int16::nlanes); - s0 = v_muladd(v_cvt_f32(v_expand_low(xl)), f, s0); - s1 = v_muladd(v_cvt_f32(v_expand_high(xl)), f, s1); - s2 = v_muladd(v_cvt_f32(v_expand_low(xh)), f, s2); - s3 = v_muladd(v_cvt_f32(v_expand_high(xh)), f, s3); - } - v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); - } - if( i <= width - v_int16::nlanes ) - { - const short* src = (const short*)_src + i; - v_float32 s0 = vx_setzero_f32(); - 
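
Note the antisymmetric fast path above: when ky = {-1, 0, +1} up to sign (fabs(ky[1]) == 1 && ky[1] == -ky[-1]), the column filter degenerates to a difference of two row sums, so the code simply swaps the S0/S2 row pointers for the negated variant instead of multiplying. In scalar terms:

// Scalar equivalent of the |ky[1]| == 1 antisymmetric column branch; for
// ky[1] < 0 the pointers were swapped beforehand (std::swap(S0, S2)), and
// the vector code saturates via v_pack where this sketch merely casts.
static short column_diff(const int* S0, const int* S2, int i, int delta)
{
    return (short)(S2[i] - S0[i] + delta);
}
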
v_float32 s1 = vx_setzero_f32(); - for( k = 0; k < _ksize; k++, src += cn ) - { - v_float32 f = vx_setall_f32(_kx[k]); - v_int16 x = vx_load(src); - s0 = v_muladd(v_cvt_f32(v_expand_low(x)), f, s0); - s1 = v_muladd(v_cvt_f32(v_expand_high(x)), f, s1); - } - v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - i += v_int16::nlanes; - } - if( i <= width - v_float32::nlanes ) - { - const short* src = (const short*)_src + i; - v_float32 s0 = vx_setzero_f32(); - for( k = 0; k < _ksize; k++, src += cn ) - s0 = v_muladd(v_cvt_f32(vx_load_expand(src)), vx_setall_f32(_kx[k]), s0); - v_store(dst + i, s0); - i += v_float32::nlanes; - } - vx_cleanup(); - return i; - } - - Mat kernel; -}; - - -struct SymmColumnVec_32f16s -{ - SymmColumnVec_32f16s() { symmetryType=0; delta = 0; } - SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta) - { - symmetryType = _symmetryType; - kernel = _kernel; - delta = (float)_delta; - CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); - } - - int operator()(const uchar** _src, uchar* _dst, int width) const - { - int _ksize = kernel.rows + kernel.cols - 1; - if( _ksize == 1 ) - return 0; - int ksize2 = _ksize / 2; - const float* ky = kernel.ptr() + ksize2; - int i = 0, k; - bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - const float** src = (const float**)_src; - short* dst = (short*)_dst; - - v_float32 d4 = vx_setall_f32(delta); - if( symmetrical ) - { - v_float32 k0 = vx_setall_f32(ky[0]); - v_float32 k1 = vx_setall_f32(ky[1]); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); - v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), k0, d4); - v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), k0, d4); - s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0); - s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) + vx_load(src[-1] + i + v_float32::nlanes), k1, s1); - s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) + vx_load(src[-1] + i + 2*v_float32::nlanes), k1, s2); - s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) + vx_load(src[-1] + i + 3*v_float32::nlanes), k1, s3); - for( k = 2; k <= ksize2; k++ ) - { - v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k2, s1); - s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) + vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2); - s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) + vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3); - } - v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3))); - } - if( i <= width - v_int16::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); - s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0); - s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) + vx_load(src[-1] + i + v_float32::nlanes), k1, s1); - for( k = 2; k <= ksize2; k++ ) - { - v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k2, s1); - } - v_store(dst 
+ i, v_pack(v_round(s0), v_round(s1))); - i += v_int16::nlanes; - } - if( i <= width - v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0); - for( k = 2; k <= ksize2; k++ ) - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); - v_pack_store(dst + i, v_round(s0)); - i += v_float32::nlanes; - } - } - else - { - v_float32 k1 = vx_setall_f32(ky[1]); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); - v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); - v_float32 s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) - vx_load(src[-1] + i + 2*v_float32::nlanes), k1, d4); - v_float32 s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) - vx_load(src[-1] + i + 3*v_float32::nlanes), k1, d4); - for( k = 2; k <= ksize2; k++ ) - { - v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); - s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) - vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2); - s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) - vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3); - } - v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3))); - } - if( i <= width - v_int16::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); - v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); - for( k = 2; k <= ksize2; k++ ) - { - v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); - } - v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - i += v_int16::nlanes; - } - if( i <= width - v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); - for( k = 2; k <= ksize2; k++ ) - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); - v_pack_store(dst + i, v_round(s0)); - i += v_float32::nlanes; - } - } - - vx_cleanup(); - return i; - } - - int symmetryType; - float delta; - Mat kernel; -}; - - -/////////////////////////////////////// 32f ////////////////////////////////// - -struct RowVec_32f -{ - RowVec_32f() - { - haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; -#if defined USE_IPP_SEP_FILTERS - bufsz = -1; -#endif - } - - RowVec_32f( const Mat& _kernel ) - { - kernel = _kernel; - haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; -#if defined USE_IPP_SEP_FILTERS - bufsz = -1; -#endif - } - - int operator()(const uchar* _src, uchar* _dst, int width, int cn) const - { -#if defined USE_IPP_SEP_FILTERS - CV_IPP_CHECK() - { - int ret = ippiOperator(_src, _dst, width, cn); - if (ret > 0) - return ret; - } -#endif - int _ksize = kernel.rows + kernel.cols - 1; - CV_DbgAssert(_ksize > 0); - const float* src0 = (const float*)_src; - float* dst = (float*)_dst; - const float* _kx = kernel.ptr(); - - int i = 0, k; - width *= cn; - -#if CV_TRY_AVX2 - if (haveAVX2) - return RowVec_32f_AVX(src0, _kx, dst, width, cn, _ksize); -#endif - v_float32 k0 = vx_setall_f32(_kx[0]); - 
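
The generic 32f row loop that follows is a plain dot product with taps spaced cn floats apart (channels stay interleaved, so one loop serves any channel count), unrolled four vectors deep mainly to keep several independent FMA chains in flight. A scalar reference, again assuming pre-padded borders:

// Scalar reference for the unrolled 32f row convolution below: output i is
// the dot product of ksize taps read at stride cn from the padded row.
static void row_conv_32f(const float* src, float* dst, const float* kx,
                         int ksize, int width, int cn)
{
    for (int i = 0; i < width * cn; i++)
    {
        float s = src[i] * kx[0];
        for (int k = 1; k < ksize; k++)
            s += src[i + k * cn] * kx[k];
        dst[i] = s;
    }
}
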
for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) - { - const float* src = src0 + i; - v_float32 s0 = vx_load(src) * k0; - v_float32 s1 = vx_load(src + v_float32::nlanes) * k0; - v_float32 s2 = vx_load(src + 2*v_float32::nlanes) * k0; - v_float32 s3 = vx_load(src + 3*v_float32::nlanes) * k0; - src += cn; - for( k = 1; k < _ksize; k++, src += cn ) - { - v_float32 k1 = vx_setall_f32(_kx[k]); - s0 = v_muladd(vx_load(src), k1, s0); - s1 = v_muladd(vx_load(src + v_float32::nlanes), k1, s1); - s2 = v_muladd(vx_load(src + 2*v_float32::nlanes), k1, s2); - s3 = v_muladd(vx_load(src + 3*v_float32::nlanes), k1, s3); - } - v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); - } - if( i <= width - 2*v_float32::nlanes ) - { - const float* src = src0 + i; - v_float32 s0 = vx_load(src) * k0; - v_float32 s1 = vx_load(src + v_float32::nlanes) * k0; - src += cn; - for( k = 1; k < _ksize; k++, src += cn ) - { - v_float32 k1 = vx_setall_f32(_kx[k]); - s0 = v_muladd(vx_load(src), k1, s0); - s1 = v_muladd(vx_load(src + v_float32::nlanes), k1, s1); - } - v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - i += 2*v_float32::nlanes; - } - if( i <= width - v_float32::nlanes ) - { - const float* src = src0 + i; - v_float32 s0 = vx_load(src) * k0; - src += cn; - for( k = 1; k < _ksize; k++, src += cn ) - s0 = v_muladd(vx_load(src), vx_setall_f32(_kx[k]), s0); - v_store(dst + i, s0); - i += v_float32::nlanes; - } - vx_cleanup(); - return i; - } - - Mat kernel; - bool haveAVX2; -#if defined USE_IPP_SEP_FILTERS -private: - mutable int bufsz; - int ippiOperator(const uchar* _src, uchar* _dst, int width, int cn) const - { - CV_INSTRUMENT_REGION_IPP(); - - int _ksize = kernel.rows + kernel.cols - 1; - if ((1 != cn && 3 != cn) || width < _ksize*8) - return 0; - - const float* src = (const float*)_src; - float* dst = (float*)_dst; - const float* _kx = (const float*)kernel.data; - - IppiSize roisz = { width, 1 }; - if( bufsz < 0 ) - { - if( (cn == 1 && ippiFilterRowBorderPipelineGetBufferSize_32f_C1R(roisz, _ksize, &bufsz) < 0) || - (cn == 3 && ippiFilterRowBorderPipelineGetBufferSize_32f_C3R(roisz, _ksize, &bufsz) < 0)) - return 0; - } - AutoBuffer buf(bufsz + 64); - uchar* bufptr = alignPtr(buf.data(), 32); - int step = (int)(width*sizeof(dst[0])*cn); - float borderValue[] = {0.f, 0.f, 0.f}; - // here is the trick. IPP needs border type and extrapolates the row. We did it already. - // So we pass anchor=0 and ignore the right tail of results since they are incorrect there. 
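// A small sketch of the arithmetic behind the comment above (illustrative,
// not part of the patch): with the row already border-extrapolated to
// `width` pixels and a kernel of `ksize` taps applied at anchor 0, only the
// leading width - ksize + 1 outputs are fully covered -- exactly the count
// the IPP branch returns so the generic path recomputes the ignored tail.
inline int validRowOutputs(int width, int ksize)
{
    return width - ksize + 1; // e.g. width = 16, ksize = 5 -> 12 valid outputs
}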
- if( (cn == 1 && CV_INSTRUMENT_FUN_IPP(ippiFilterRowBorderPipeline_32f_C1R, src, step, &dst, roisz, _kx, _ksize, 0, - ippBorderRepl, borderValue[0], bufptr) < 0) || - (cn == 3 && CV_INSTRUMENT_FUN_IPP(ippiFilterRowBorderPipeline_32f_C3R, src, step, &dst, roisz, _kx, _ksize, 0, - ippBorderRepl, borderValue, bufptr) < 0)) - { - setIppErrorStatus(); - return 0; - } - CV_IMPL_ADD(CV_IMPL_IPP); - return width - _ksize + 1; - } -#endif -}; - - -struct SymmRowSmallVec_32f -{ - SymmRowSmallVec_32f() { symmetryType = 0; } - SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType ) - { - kernel = _kernel; - symmetryType = _symmetryType; - } - - int operator()(const uchar* _src, uchar* _dst, int width, int cn) const - { - int i = 0, _ksize = kernel.rows + kernel.cols - 1; - if( _ksize == 1 ) - return 0; - float* dst = (float*)_dst; - const float* src = (const float*)_src + (_ksize/2)*cn; - bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - const float* kx = kernel.ptr() + _ksize/2; - width *= cn; - - if( symmetrical ) - { - if( _ksize == 3 ) - { - if( fabs(kx[0]) == 2 && kx[1] == 1 ) - { -#if CV_FMA3 || CV_AVX2 - v_float32 k0 = vx_setall_f32(kx[0]); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - cn) + vx_load(src + cn))); -#else - if( kx[0] > 0 ) - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - { - v_float32 x = vx_load(src); - v_store(dst + i, vx_load(src - cn) + vx_load(src + cn) + (x + x)); - } - else - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - { - v_float32 x = vx_load(src); - v_store(dst + i, vx_load(src - cn) + vx_load(src + cn) - (x + x)); - } -#endif - } - else - { - v_float32 k0 = vx_setall_f32(kx[0]), k1 = vx_setall_f32(kx[1]); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(src), k0, (vx_load(src - cn) + vx_load(src + cn)) * k1)); - } - } - else if( _ksize == 5 ) - { - if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 ) - { -#if CV_FMA3 || CV_AVX2 - v_float32 k0 = vx_setall_f32(-2); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - 2*cn) + vx_load(src + 2*cn))); -#else - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - { - v_float32 x = vx_load(src); - v_store(dst + i, vx_load(src - 2*cn) + vx_load(src + 2*cn) - (x + x)); - } -#endif - } - else - { - v_float32 k0 = vx_setall_f32(kx[0]), k1 = vx_setall_f32(kx[1]), k2 = vx_setall_f32(kx[2]); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(src + 2*cn) + vx_load(src - 2*cn), k2, v_muladd(vx_load(src), k0, (vx_load(src - cn) + vx_load(src + cn)) * k1))); - } - } - } - else - { - if( _ksize == 3 ) - { - if( kx[0] == 0 && kx[1] == 1 ) - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, vx_load(src + cn) - vx_load(src - cn)); - else - { - v_float32 k1 = vx_setall_f32(kx[1]); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, (vx_load(src + cn) - vx_load(src - cn)) * k1); - } - } - else if( _ksize == 5 ) - { - v_float32 k1 = vx_setall_f32(kx[1]), k2 = vx_setall_f32(kx[2]); - for ( ; i <= width - v_float32::nlanes; i += 
v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(src + 2*cn) - vx_load(src - 2*cn), k2, (vx_load(src + cn) - vx_load(src - cn)) * k1)); - } - } - - vx_cleanup(); - return i; - } - - Mat kernel; - int symmetryType; -}; - - -struct SymmColumnVec_32f -{ - SymmColumnVec_32f() { - symmetryType=0; - haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; - delta = 0; - } - SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta) - { - symmetryType = _symmetryType; - kernel = _kernel; - delta = (float)_delta; - haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; - CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); - } - - int operator()(const uchar** _src, uchar* _dst, int width) const - { - int ksize2 = (kernel.rows + kernel.cols - 1)/2; - const float* ky = kernel.ptr() + ksize2; - int i = 0, k; - bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - const float** src = (const float**)_src; - float* dst = (float*)_dst; - - if( symmetrical ) - { - -#if CV_TRY_AVX2 - if (haveAVX2) - return SymmColumnVec_32f_Symm_AVX(src, ky, dst, delta, width, ksize2); -#endif - const v_float32 d4 = vx_setall_f32(delta); - const v_float32 k0 = vx_setall_f32(ky[0]); - for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); - v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), k0, d4); - v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), k0, d4); - for( k = 1; k <= ksize2; k++ ) - { - v_float32 k1 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1); - s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) + vx_load(src[-k] + i + 2*v_float32::nlanes), k1, s2); - s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) + vx_load(src[-k] + i + 3*v_float32::nlanes), k1, s3); - } - v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); - } - if( i <= width - 2*v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); - for( k = 1; k <= ksize2; k++ ) - { - v_float32 k1 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1); - } - v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - i += 2*v_float32::nlanes; - } - if( i <= width - v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - for( k = 1; k <= ksize2; k++ ) - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); - v_store(dst + i, s0); - i += v_float32::nlanes; - } - } - else - { -#if CV_TRY_AVX2 - if (haveAVX2) - return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2); -#endif - CV_DbgAssert(ksize2 > 0); - const v_float32 d4 = vx_setall_f32(delta); - const v_float32 k1 = vx_setall_f32(ky[1]); - for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); - v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); - 
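// Scalar sketch of the symmetry folding used by SymmColumnVec_32f above
// (illustrative only): mirrored rows are combined before the single multiply,
// so a (2*ksize2+1)-tap column pass costs ksize2+1 multiplies per output
// instead of 2*ksize2+1. `rows` points at the center line, so negative
// indices reach the mirrored rows, exactly as in the vector code.
float symmColumnRef(const float* const* rows, int i, const float* ky,
                    int ksize2, float delta, bool symmetrical)
{
    float s = delta;
    if (symmetrical)
        s += ky[0]*rows[0][i]; // an antisymmetric kernel has ky[0] == 0
    for (int k = 1; k <= ksize2; k++)
        s += ky[k]*(symmetrical ? rows[k][i] + rows[-k][i]
                                : rows[k][i] - rows[-k][i]);
    return s;
}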
v_float32 s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) - vx_load(src[-1] + i + 2*v_float32::nlanes), k1, d4); - v_float32 s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) - vx_load(src[-1] + i + 3*v_float32::nlanes), k1, d4); - for( k = 2; k <= ksize2; k++ ) - { - v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); - s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) - vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2); - s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) - vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3); - } - v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); - } - if( i <= width - 2*v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); - v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); - for( k = 2; k <= ksize2; k++ ) - { - v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); - } - v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - i += 2*v_float32::nlanes; - } - if( i <= width - v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); - for( k = 2; k <= ksize2; k++ ) - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); - v_store(dst + i, s0); - i += v_float32::nlanes; - } - } - - vx_cleanup(); - return i; - } - - int symmetryType; - float delta; - Mat kernel; - bool haveAVX2; -}; - - -struct SymmColumnSmallVec_32f -{ - SymmColumnSmallVec_32f() { symmetryType=0; delta = 0; } - SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta) - { - symmetryType = _symmetryType; - kernel = _kernel; - delta = (float)_delta; - CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); - } - - int operator()(const uchar** _src, uchar* _dst, int width) const - { - int ksize2 = (kernel.rows + kernel.cols - 1)/2; - const float* ky = kernel.ptr() + ksize2; - int i = 0; - bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - const float** src = (const float**)_src; - const float *S0 = src[-1], *S1 = src[0], *S2 = src[1]; - float* dst = (float*)_dst; - - v_float32 d4 = vx_setall_f32(delta); - if( symmetrical ) - { - if( fabs(ky[0]) == 2 && ky[1] == 1 ) - { -#if CV_FMA3 || CV_AVX2 - v_float32 k0 = vx_setall_f32(ky[0]); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(S1 + i), k0, vx_load(S0 + i) + vx_load(S2 + i) + d4)); -#else - if(ky[0] > 0) - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) - { - v_float32 x = vx_load(S1 + i); - v_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 + (x + x)); - } - else - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) - { - v_float32 x = vx_load(S1 + i); - v_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 - (x + x)); - } -#endif - } - else - { - v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]); - for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, 
v_muladd(vx_load(S1 + i), k0, d4))); - } - } - else - { - if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] ) - { - if( ky[1] < 0 ) - std::swap(S0, S2); - for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) - v_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + d4); - } - else - { - v_float32 k1 = vx_setall_f32(ky[1]); - for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(S2 + i) - vx_load(S0 + i), k1, d4)); - } - } - - vx_cleanup(); - return i; - } - - int symmetryType; - float delta; - Mat kernel; -}; - - -/////////////////////////////// non-separable filters /////////////////////////////// - -///////////////////////////////// 8u<->8u, 8u<->16s ///////////////////////////////// - -struct FilterVec_8u -{ - FilterVec_8u() { delta = 0; _nz = 0; } - FilterVec_8u(const Mat& _kernel, int _bits, double _delta) - { - Mat kernel; - _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); - delta = (float)(_delta/(1 << _bits)); - std::vector coords; - preprocess2DKernel(kernel, coords, coeffs); - _nz = (int)coords.size(); - } - - int operator()(const uchar** src, uchar* dst, int width) const - { - CV_DbgAssert(_nz > 0); - const float* kf = (const float*)&coeffs[0]; - int i = 0, k, nz = _nz; - - v_float32 d4 = vx_setall_f32(delta); - v_float32 f0 = vx_setall_f32(kf[0]); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) - { - v_uint16 xl, xh; - v_expand(vx_load(src[0] + i), xl, xh); - v_uint32 x0, x1, x2, x3; - v_expand(xl, x0, x1); - v_expand(xh, x2, x3); - v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f0, d4); - v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f0, d4); - v_float32 s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x2)), f0, d4); - v_float32 s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x3)), f0, d4); - for( k = 1; k < nz; k++ ) - { - v_float32 f = vx_setall_f32(kf[k]); - v_expand(vx_load(src[k] + i), xl, xh); - v_expand(xl, x0, x1); - v_expand(xh, x2, x3); - s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0); - s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1); - s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x2)), f, s2); - s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x3)), f, s3); - } - v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3)))); - } - if( i <= width - v_uint16::nlanes ) - { - v_uint32 x0, x1; - v_expand(vx_load_expand(src[0] + i), x0, x1); - v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f0, d4); - v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f0, d4); - for( k = 1; k < nz; k++ ) - { - v_float32 f = vx_setall_f32(kf[k]); - v_expand(vx_load_expand(src[k] + i), x0, x1); - s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0); - s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1); - } - v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1))); - i += v_uint16::nlanes; - } -#if CV_SIMD_WIDTH > 16 - while( i <= width - v_int32x4::nlanes ) -#else - if( i <= width - v_int32x4::nlanes ) -#endif - { - v_float32x4 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[0] + i))), v_setall_f32(kf[0]), v_setall_f32(delta)); - for( k = 1; k < nz; k++ ) - s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[k] + i))), v_setall_f32(kf[k]), s0); - v_int32x4 s32 = v_round(s0); - v_int16x8 s16 = v_pack(s32, s32); - *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0(); - i += v_int32x4::nlanes; - } - - vx_cleanup(); - return i; - } - - int _nz; - std::vector coeffs; - float 
delta; -}; - - -struct FilterVec_8u16s -{ - FilterVec_8u16s() { delta = 0; _nz = 0; } - FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta) - { - Mat kernel; - _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); - delta = (float)(_delta/(1 << _bits)); - std::vector coords; - preprocess2DKernel(kernel, coords, coeffs); - _nz = (int)coords.size(); - } - - int operator()(const uchar** src, uchar* _dst, int width) const - { - CV_DbgAssert(_nz > 0); - const float* kf = (const float*)&coeffs[0]; - short* dst = (short*)_dst; - int i = 0, k, nz = _nz; - - v_float32 d4 = vx_setall_f32(delta); - v_float32 f0 = vx_setall_f32(kf[0]); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) - { - v_uint16 xl, xh; - v_expand(vx_load(src[0] + i), xl, xh); - v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xl))), f0, d4); - v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xl))), f0, d4); - v_float32 s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xh))), f0, d4); - v_float32 s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xh))), f0, d4); - for( k = 1; k < nz; k++ ) - { - v_float32 f = vx_setall_f32(kf[k]); - v_expand(vx_load(src[k] + i), xl, xh); - s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xl))), f, s0); - s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xl))), f, s1); - s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xh))), f, s2); - s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xh))), f, s3); - } - v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3))); - } - if( i <= width - v_uint16::nlanes ) - { - v_uint16 x = vx_load_expand(src[0] + i); - v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f0, d4); - v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f0, d4); - for( k = 1; k < nz; k++ ) - { - v_float32 f = vx_setall_f32(kf[k]); - x = vx_load_expand(src[k] + i); - s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f, s0); - s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f, s1); - } - v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - i += v_uint16::nlanes; - } - if( i <= width - v_int32::nlanes ) - { - v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[0] + i))), f0, d4); - for( k = 1; k < nz; k++ ) - s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[k] + i))), vx_setall_f32(kf[k]), s0); - v_pack_store(dst + i, v_round(s0)); - i += v_int32::nlanes; - } - - vx_cleanup(); - return i; - } - - int _nz; - std::vector coeffs; - float delta; -}; - - -struct FilterVec_32f -{ - FilterVec_32f() { delta = 0; _nz = 0; } - FilterVec_32f(const Mat& _kernel, int, double _delta) - { - delta = (float)_delta; - std::vector coords; - preprocess2DKernel(_kernel, coords, coeffs); - _nz = (int)coords.size(); - } - - int operator()(const uchar** _src, uchar* _dst, int width) const - { - const float* kf = (const float*)&coeffs[0]; - const float** src = (const float**)_src; - float* dst = (float*)_dst; - int i = 0, k, nz = _nz; - - v_float32 d4 = vx_setall_f32(delta); - v_float32 f0 = vx_setall_f32(kf[0]); - for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), f0, d4); - v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), f0, d4); - v_float32 s3 = 
v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), f0, d4); - for( k = 1; k < nz; k++ ) - { - v_float32 f1 = vx_setall_f32(kf[k]); - s0 = v_muladd(vx_load(src[k] + i), f1, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes), f1, s1); - s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes), f1, s2); - s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes), f1, s3); - } - v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); - } - if( i <= width - 2*v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), f0, d4); - for( k = 1; k < nz; k++ ) - { - v_float32 f1 = vx_setall_f32(kf[k]); - s0 = v_muladd(vx_load(src[k] + i), f1, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes), f1, s1); - } - v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - i += 2*v_float32::nlanes; - } - if( i <= width - v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4); - for( k = 1; k < nz; k++ ) - s0 = v_muladd(vx_load(src[k] + i), vx_setall_f32(kf[k]), s0); - v_store(dst + i, s0); - i += v_float32::nlanes; - } - - vx_cleanup(); - return i; - } - - int _nz; - std::vector coeffs; - float delta; -}; - -#else - -typedef RowNoVec RowVec_8u32s; -typedef RowNoVec RowVec_16s32f; -typedef RowNoVec RowVec_32f; -typedef SymmRowSmallNoVec SymmRowSmallVec_8u32s; -typedef SymmRowSmallNoVec SymmRowSmallVec_32f; -typedef ColumnNoVec SymmColumnVec_32s8u; -typedef ColumnNoVec SymmColumnVec_32f16s; -typedef ColumnNoVec SymmColumnVec_32f; -typedef SymmColumnSmallNoVec SymmColumnSmallVec_32s16s; -typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f; -typedef FilterNoVec FilterVec_8u; -typedef FilterNoVec FilterVec_8u16s; -typedef FilterNoVec FilterVec_32f; - -#endif - - -template struct RowFilter : public BaseRowFilter -{ - RowFilter( const Mat& _kernel, int _anchor, const VecOp& _vecOp=VecOp() ) - { - if( _kernel.isContinuous() ) - kernel = _kernel; - else - _kernel.copyTo(kernel); - anchor = _anchor; - ksize = kernel.rows + kernel.cols - 1; - CV_Assert( kernel.type() == DataType
::type && - (kernel.rows == 1 || kernel.cols == 1)); - vecOp = _vecOp; - } - - void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE - { - int _ksize = ksize; - const DT* kx = kernel.ptr
(); - const ST* S; - DT* D = (DT*)dst; - int i, k; - - i = vecOp(src, dst, width, cn); - width *= cn; - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - S = (const ST*)src + i; - DT f = kx[0]; - DT s0 = f*S[0], s1 = f*S[1], s2 = f*S[2], s3 = f*S[3]; - - for( k = 1; k < _ksize; k++ ) - { - S += cn; - f = kx[k]; - s0 += f*S[0]; s1 += f*S[1]; - s2 += f*S[2]; s3 += f*S[3]; - } - - D[i] = s0; D[i+1] = s1; - D[i+2] = s2; D[i+3] = s3; - } - #endif - for( ; i < width; i++ ) - { - S = (const ST*)src + i; - DT s0 = kx[0]*S[0]; - for( k = 1; k < _ksize; k++ ) - { - S += cn; - s0 += kx[k]*S[0]; - } - D[i] = s0; - } - } - - Mat kernel; - VecOp vecOp; -}; - - -template struct SymmRowSmallFilter : - public RowFilter -{ - SymmRowSmallFilter( const Mat& _kernel, int _anchor, int _symmetryType, - const VecOp& _vecOp = VecOp()) - : RowFilter( _kernel, _anchor, _vecOp ) - { - symmetryType = _symmetryType; - CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 && this->ksize <= 5 ); - } - - void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE - { - int ksize2 = this->ksize/2, ksize2n = ksize2*cn; - const DT* kx = this->kernel.template ptr
() + ksize2; - bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0; - DT* D = (DT*)dst; - int i = this->vecOp(src, dst, width, cn), j, k; - const ST* S = (const ST*)src + i + ksize2n; - width *= cn; - - if( symmetrical ) - { - if( this->ksize == 1 && kx[0] == 1 ) - { - for( ; i <= width - 2; i += 2 ) - { - DT s0 = S[i], s1 = S[i+1]; - D[i] = s0; D[i+1] = s1; - } - S += i; - } - else if( this->ksize == 3 ) - { - if( kx[0] == 2 && kx[1] == 1 ) - for( ; i <= width - 2; i += 2, S += 2 ) - { - DT s0 = S[-cn] + S[0]*2 + S[cn], s1 = S[1-cn] + S[1]*2 + S[1+cn]; - D[i] = s0; D[i+1] = s1; - } - else if( kx[0] == -2 && kx[1] == 1 ) - for( ; i <= width - 2; i += 2, S += 2 ) - { - DT s0 = S[-cn] - S[0]*2 + S[cn], s1 = S[1-cn] - S[1]*2 + S[1+cn]; - D[i] = s0; D[i+1] = s1; - } - else - { - DT k0 = kx[0], k1 = kx[1]; - for( ; i <= width - 2; i += 2, S += 2 ) - { - DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1, s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1; - D[i] = s0; D[i+1] = s1; - } - } - } - else if( this->ksize == 5 ) - { - DT k0 = kx[0], k1 = kx[1], k2 = kx[2]; - if( k0 == -2 && k1 == 0 && k2 == 1 ) - for( ; i <= width - 2; i += 2, S += 2 ) - { - DT s0 = -2*S[0] + S[-cn*2] + S[cn*2]; - DT s1 = -2*S[1] + S[1-cn*2] + S[1+cn*2]; - D[i] = s0; D[i+1] = s1; - } - else - for( ; i <= width - 2; i += 2, S += 2 ) - { - DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1 + (S[-cn*2] + S[cn*2])*k2; - DT s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1 + (S[1-cn*2] + S[1+cn*2])*k2; - D[i] = s0; D[i+1] = s1; - } - } - - for( ; i < width; i++, S++ ) - { - DT s0 = kx[0]*S[0]; - for( k = 1, j = cn; k <= ksize2; k++, j += cn ) - s0 += kx[k]*(S[j] + S[-j]); - D[i] = s0; - } - } - else - { - if( this->ksize == 3 ) - { - if( kx[0] == 0 && kx[1] == 1 ) - for( ; i <= width - 2; i += 2, S += 2 ) - { - DT s0 = S[cn] - S[-cn], s1 = S[1+cn] - S[1-cn]; - D[i] = s0; D[i+1] = s1; - } - else - { - DT k1 = kx[1]; - for( ; i <= width - 2; i += 2, S += 2 ) - { - DT s0 = (S[cn] - S[-cn])*k1, s1 = (S[1+cn] - S[1-cn])*k1; - D[i] = s0; D[i+1] = s1; - } - } - } - else if( this->ksize == 5 ) - { - DT k1 = kx[1], k2 = kx[2]; - for( ; i <= width - 2; i += 2, S += 2 ) - { - DT s0 = (S[cn] - S[-cn])*k1 + (S[cn*2] - S[-cn*2])*k2; - DT s1 = (S[1+cn] - S[1-cn])*k1 + (S[1+cn*2] - S[1-cn*2])*k2; - D[i] = s0; D[i+1] = s1; - } - } - - for( ; i < width; i++, S++ ) - { - DT s0 = kx[0]*S[0]; - for( k = 1, j = cn; k <= ksize2; k++, j += cn ) - s0 += kx[k]*(S[j] - S[-j]); - D[i] = s0; - } - } - } - - int symmetryType; -}; - - -template struct ColumnFilter : public BaseColumnFilter -{ - typedef typename CastOp::type1 ST; - typedef typename CastOp::rtype DT; - - ColumnFilter( const Mat& _kernel, int _anchor, - double _delta, const CastOp& _castOp=CastOp(), - const VecOp& _vecOp=VecOp() ) - { - if( _kernel.isContinuous() ) - kernel = _kernel; - else - _kernel.copyTo(kernel); - anchor = _anchor; - ksize = kernel.rows + kernel.cols - 1; - delta = saturate_cast(_delta); - castOp0 = _castOp; - vecOp = _vecOp; - CV_Assert( kernel.type() == DataType::type && - (kernel.rows == 1 || kernel.cols == 1)); - } - - void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - const ST* ky = kernel.template ptr(); - ST _delta = delta; - int _ksize = ksize; - int i, k; - CastOp castOp = castOp0; - - for( ; count--; dst += dststep, src++ ) - { - DT* D = (DT*)dst; - i = vecOp(src, dst, width); - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - ST f = ky[0]; - const ST* S = (const ST*)src[0] + i; - ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta, - s2 
= f*S[2] + _delta, s3 = f*S[3] + _delta; - - for( k = 1; k < _ksize; k++ ) - { - S = (const ST*)src[k] + i; f = ky[k]; - s0 += f*S[0]; s1 += f*S[1]; - s2 += f*S[2]; s3 += f*S[3]; - } - - D[i] = castOp(s0); D[i+1] = castOp(s1); - D[i+2] = castOp(s2); D[i+3] = castOp(s3); - } - #endif - for( ; i < width; i++ ) - { - ST s0 = ky[0]*((const ST*)src[0])[i] + _delta; - for( k = 1; k < _ksize; k++ ) - s0 += ky[k]*((const ST*)src[k])[i]; - D[i] = castOp(s0); - } - } - } - - Mat kernel; - CastOp castOp0; - VecOp vecOp; - ST delta; -}; - - -template struct SymmColumnFilter : public ColumnFilter -{ - typedef typename CastOp::type1 ST; - typedef typename CastOp::rtype DT; - - SymmColumnFilter( const Mat& _kernel, int _anchor, - double _delta, int _symmetryType, - const CastOp& _castOp=CastOp(), - const VecOp& _vecOp=VecOp()) - : ColumnFilter( _kernel, _anchor, _delta, _castOp, _vecOp ) - { - symmetryType = _symmetryType; - CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); - } - - void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int ksize2 = this->ksize/2; - const ST* ky = this->kernel.template ptr() + ksize2; - int i, k; - bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - ST _delta = this->delta; - CastOp castOp = this->castOp0; - src += ksize2; - - if( symmetrical ) - { - for( ; count--; dst += dststep, src++ ) - { - DT* D = (DT*)dst; - i = (this->vecOp)(src, dst, width); - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - ST f = ky[0]; - const ST* S = (const ST*)src[0] + i, *S2; - ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta, - s2 = f*S[2] + _delta, s3 = f*S[3] + _delta; - - for( k = 1; k <= ksize2; k++ ) - { - S = (const ST*)src[k] + i; - S2 = (const ST*)src[-k] + i; - f = ky[k]; - s0 += f*(S[0] + S2[0]); - s1 += f*(S[1] + S2[1]); - s2 += f*(S[2] + S2[2]); - s3 += f*(S[3] + S2[3]); - } - - D[i] = castOp(s0); D[i+1] = castOp(s1); - D[i+2] = castOp(s2); D[i+3] = castOp(s3); - } - #endif - for( ; i < width; i++ ) - { - ST s0 = ky[0]*((const ST*)src[0])[i] + _delta; - for( k = 1; k <= ksize2; k++ ) - s0 += ky[k]*(((const ST*)src[k])[i] + ((const ST*)src[-k])[i]); - D[i] = castOp(s0); - } - } - } - else - { - for( ; count--; dst += dststep, src++ ) - { - DT* D = (DT*)dst; - i = this->vecOp(src, dst, width); - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - ST f = ky[0]; - const ST *S, *S2; - ST s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta; - - for( k = 1; k <= ksize2; k++ ) - { - S = (const ST*)src[k] + i; - S2 = (const ST*)src[-k] + i; - f = ky[k]; - s0 += f*(S[0] - S2[0]); - s1 += f*(S[1] - S2[1]); - s2 += f*(S[2] - S2[2]); - s3 += f*(S[3] - S2[3]); - } - - D[i] = castOp(s0); D[i+1] = castOp(s1); - D[i+2] = castOp(s2); D[i+3] = castOp(s3); - } - #endif - for( ; i < width; i++ ) - { - ST s0 = _delta; - for( k = 1; k <= ksize2; k++ ) - s0 += ky[k]*(((const ST*)src[k])[i] - ((const ST*)src[-k])[i]); - D[i] = castOp(s0); - } - } - } - } - - int symmetryType; -}; - - -template -struct SymmColumnSmallFilter : public SymmColumnFilter -{ - typedef typename CastOp::type1 ST; - typedef typename CastOp::rtype DT; - - SymmColumnSmallFilter( const Mat& _kernel, int _anchor, - double _delta, int _symmetryType, - const CastOp& _castOp=CastOp(), - const VecOp& _vecOp=VecOp()) - : SymmColumnFilter( _kernel, _anchor, _delta, _symmetryType, _castOp, _vecOp ) - { - CV_Assert( this->ksize == 3 ); - } - - void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE 
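// Sketch of the fixed-point round trip that the Cast/FixedPtCast helpers
// below implement (illustrative; bits = 8 is just an example). For 8-bit
// filtering the kernel is pre-scaled by 2^bits and the rows accumulate in
// int32; the cast restores the scale with a rounding add and a shift, then
// saturates to the destination range.
#include <opencv2/core/saturate.hpp>

inline unsigned char fixedPtToU8(int val, int bits)
{
    const int delta = 1 << (bits - 1);  // rounding term, as in FixedPtCast
    return cv::saturate_cast<unsigned char>((val + delta) >> bits);
}
// e.g. bits = 8: 128*256 + 127 -> 128, while 256*256 saturates to 255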
- { - int ksize2 = this->ksize/2; - const ST* ky = this->kernel.template ptr() + ksize2; - int i; - bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0; - bool is_1_2_1 = ky[0] == 2 && ky[1] == 1; - bool is_1_m2_1 = ky[0] == -2 && ky[1] == 1; - bool is_m1_0_1 = ky[0] == 0 && (ky[1] == 1 || ky[1] == -1); - ST f0 = ky[0], f1 = ky[1]; - ST _delta = this->delta; - CastOp castOp = this->castOp0; - src += ksize2; - - for( ; count--; dst += dststep, src++ ) - { - DT* D = (DT*)dst; - i = (this->vecOp)(src, dst, width); - const ST* S0 = (const ST*)src[-1]; - const ST* S1 = (const ST*)src[0]; - const ST* S2 = (const ST*)src[1]; - - if( symmetrical ) - { - if( is_1_2_1 ) - { - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta; - ST s1 = S0[i+1] + S1[i+1]*2 + S2[i+1] + _delta; - D[i] = castOp(s0); - D[i+1] = castOp(s1); - - s0 = S0[i+2] + S1[i+2]*2 + S2[i+2] + _delta; - s1 = S0[i+3] + S1[i+3]*2 + S2[i+3] + _delta; - D[i+2] = castOp(s0); - D[i+3] = castOp(s1); - } - #endif - for( ; i < width; i ++ ) - { - ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta; - D[i] = castOp(s0); - } - } - else if( is_1_m2_1 ) - { - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta; - ST s1 = S0[i+1] - S1[i+1]*2 + S2[i+1] + _delta; - D[i] = castOp(s0); - D[i+1] = castOp(s1); - - s0 = S0[i+2] - S1[i+2]*2 + S2[i+2] + _delta; - s1 = S0[i+3] - S1[i+3]*2 + S2[i+3] + _delta; - D[i+2] = castOp(s0); - D[i+3] = castOp(s1); - } - #endif - for( ; i < width; i ++ ) - { - ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta; - D[i] = castOp(s0); - } - } - else - { - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta; - ST s1 = (S0[i+1] + S2[i+1])*f1 + S1[i+1]*f0 + _delta; - D[i] = castOp(s0); - D[i+1] = castOp(s1); - - s0 = (S0[i+2] + S2[i+2])*f1 + S1[i+2]*f0 + _delta; - s1 = (S0[i+3] + S2[i+3])*f1 + S1[i+3]*f0 + _delta; - D[i+2] = castOp(s0); - D[i+3] = castOp(s1); - } - #endif - for( ; i < width; i ++ ) - { - ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta; - D[i] = castOp(s0); - } - } - } - else - { - if( is_m1_0_1 ) - { - if( f1 < 0 ) - std::swap(S0, S2); - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - ST s0 = S2[i] - S0[i] + _delta; - ST s1 = S2[i+1] - S0[i+1] + _delta; - D[i] = castOp(s0); - D[i+1] = castOp(s1); - - s0 = S2[i+2] - S0[i+2] + _delta; - s1 = S2[i+3] - S0[i+3] + _delta; - D[i+2] = castOp(s0); - D[i+3] = castOp(s1); - } - #endif - for( ; i < width; i ++ ) - { - ST s0 = S2[i] - S0[i] + _delta; - D[i] = castOp(s0); - } - if( f1 < 0 ) - std::swap(S0, S2); - } - else - { - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - ST s0 = (S2[i] - S0[i])*f1 + _delta; - ST s1 = (S2[i+1] - S0[i+1])*f1 + _delta; - D[i] = castOp(s0); - D[i+1] = castOp(s1); - - s0 = (S2[i+2] - S0[i+2])*f1 + _delta; - s1 = (S2[i+3] - S0[i+3])*f1 + _delta; - D[i+2] = castOp(s0); - D[i+3] = castOp(s1); - } - #endif - for( ; i < width; i++ ) - D[i] = castOp((S2[i] - S0[i])*f1 + _delta); - } - } - } - } -}; - -template struct Cast -{ - typedef ST type1; - typedef DT rtype; - - DT operator()(ST val) const { return saturate_cast
<DT>(val); } -}; - -template<typename ST, typename DT, int bits> struct FixedPtCast -{ - typedef ST type1; - typedef DT rtype; - enum { SHIFT = bits, DELTA = 1 << (bits-1) }; - - DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); } -}; - -template<typename ST, typename DT> struct FixedPtCastEx -{ - typedef ST type1; - typedef DT rtype; - - FixedPtCastEx() : SHIFT(0), DELTA(0) {} - FixedPtCastEx(int bits) : SHIFT(bits), DELTA(bits ? 1 << (bits-1) : 0) {} - DT operator()(ST val) const { return saturate_cast<DT>
((val + DELTA)>>SHIFT); } - int SHIFT, DELTA; -}; - -} - -cv::Ptr cv::getLinearRowFilter( int srcType, int bufType, - InputArray _kernel, int anchor, - int symmetryType ) -{ - Mat kernel = _kernel.getMat(); - int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(bufType); - int cn = CV_MAT_CN(srcType); - CV_Assert( cn == CV_MAT_CN(bufType) && - ddepth >= std::max(sdepth, CV_32S) && - kernel.type() == ddepth ); - int ksize = kernel.rows + kernel.cols - 1; - - if( (symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) != 0 && ksize <= 5 ) - { - if( sdepth == CV_8U && ddepth == CV_32S ) - return makePtr > - (kernel, anchor, symmetryType, SymmRowSmallVec_8u32s(kernel, symmetryType)); - if( sdepth == CV_32F && ddepth == CV_32F ) - return makePtr > - (kernel, anchor, symmetryType, SymmRowSmallVec_32f(kernel, symmetryType)); - } - - if( sdepth == CV_8U && ddepth == CV_32S ) - return makePtr > - (kernel, anchor, RowVec_8u32s(kernel)); - if( sdepth == CV_8U && ddepth == CV_32F ) - return makePtr >(kernel, anchor); - if( sdepth == CV_8U && ddepth == CV_64F ) - return makePtr >(kernel, anchor); - if( sdepth == CV_16U && ddepth == CV_32F ) - return makePtr >(kernel, anchor); - if( sdepth == CV_16U && ddepth == CV_64F ) - return makePtr >(kernel, anchor); - if( sdepth == CV_16S && ddepth == CV_32F ) - return makePtr > - (kernel, anchor, RowVec_16s32f(kernel)); - if( sdepth == CV_16S && ddepth == CV_64F ) - return makePtr >(kernel, anchor); - if( sdepth == CV_32F && ddepth == CV_32F ) - return makePtr > - (kernel, anchor, RowVec_32f(kernel)); - if( sdepth == CV_32F && ddepth == CV_64F ) - return makePtr >(kernel, anchor); - if( sdepth == CV_64F && ddepth == CV_64F ) - return makePtr >(kernel, anchor); - - CV_Error_( CV_StsNotImplemented, - ("Unsupported combination of source format (=%d), and buffer format (=%d)", - srcType, bufType)); + Mat kernelMat = _kernel.getMat(); + CV_CPU_DISPATCH(getLinearRowFilter, (srcType, bufType, kernelMat, anchor, symmetryType), + CV_CPU_DISPATCH_MODES_ALL); } -cv::Ptr cv::getLinearColumnFilter( int bufType, int dstType, - InputArray _kernel, int anchor, - int symmetryType, double delta, - int bits ) +Ptr getLinearColumnFilter( + int bufType, int dstType, + InputArray kernel, int anchor, + int symmetryType, double delta, + int bits) { - Mat kernel = _kernel.getMat(); - int sdepth = CV_MAT_DEPTH(bufType), ddepth = CV_MAT_DEPTH(dstType); - int cn = CV_MAT_CN(dstType); - CV_Assert( cn == CV_MAT_CN(bufType) && - sdepth >= std::max(ddepth, CV_32S) && - kernel.type() == sdepth ); + CV_INSTRUMENT_REGION(); - if( !(symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) ) - { - if( ddepth == CV_8U && sdepth == CV_32S ) - return makePtr, ColumnNoVec> > - (kernel, anchor, delta, FixedPtCastEx(bits)); - if( ddepth == CV_8U && sdepth == CV_32F ) - return makePtr, ColumnNoVec> >(kernel, anchor, delta); - if( ddepth == CV_8U && sdepth == CV_64F ) - return makePtr, ColumnNoVec> >(kernel, anchor, delta); - if( ddepth == CV_16U && sdepth == CV_32F ) - return makePtr, ColumnNoVec> >(kernel, anchor, delta); - if( ddepth == CV_16U && sdepth == CV_64F ) - return makePtr, ColumnNoVec> >(kernel, anchor, delta); - if( ddepth == CV_16S && sdepth == CV_32F ) - return makePtr, ColumnNoVec> >(kernel, anchor, delta); - if( ddepth == CV_16S && sdepth == CV_64F ) - return makePtr, ColumnNoVec> >(kernel, anchor, delta); - if( ddepth == CV_32F && sdepth == CV_32F ) - return makePtr, ColumnNoVec> >(kernel, anchor, delta); - if( ddepth == CV_64F && sdepth == CV_64F ) - return makePtr, ColumnNoVec> 
>(kernel, anchor, delta); - } - else - { - int ksize = kernel.rows + kernel.cols - 1; - if( ksize == 3 ) - { - if( ddepth == CV_8U && sdepth == CV_32S ) - return makePtr, SymmColumnVec_32s8u> > - (kernel, anchor, delta, symmetryType, FixedPtCastEx(bits), - SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)); - if( ddepth == CV_16S && sdepth == CV_32S && bits == 0 ) - return makePtr, - SymmColumnSmallVec_32s16s> >(kernel, anchor, delta, symmetryType, - Cast(), SymmColumnSmallVec_32s16s(kernel, symmetryType, bits, delta)); - if( ddepth == CV_32F && sdepth == CV_32F ) - return makePtr,SymmColumnSmallVec_32f> > - (kernel, anchor, delta, symmetryType, Cast(), - SymmColumnSmallVec_32f(kernel, symmetryType, 0, delta)); - } - if( ddepth == CV_8U && sdepth == CV_32S ) - return makePtr, SymmColumnVec_32s8u> > - (kernel, anchor, delta, symmetryType, FixedPtCastEx(bits), - SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)); - if( ddepth == CV_8U && sdepth == CV_32F ) - return makePtr, ColumnNoVec> > - (kernel, anchor, delta, symmetryType); - if( ddepth == CV_8U && sdepth == CV_64F ) - return makePtr, ColumnNoVec> > - (kernel, anchor, delta, symmetryType); - if( ddepth == CV_16U && sdepth == CV_32F ) - return makePtr, ColumnNoVec> > - (kernel, anchor, delta, symmetryType); - if( ddepth == CV_16U && sdepth == CV_64F ) - return makePtr, ColumnNoVec> > - (kernel, anchor, delta, symmetryType); - if( ddepth == CV_16S && sdepth == CV_32S ) - return makePtr, ColumnNoVec> > - (kernel, anchor, delta, symmetryType); - if( ddepth == CV_16S && sdepth == CV_32F ) - return makePtr, SymmColumnVec_32f16s> > - (kernel, anchor, delta, symmetryType, Cast(), - SymmColumnVec_32f16s(kernel, symmetryType, 0, delta)); - if( ddepth == CV_16S && sdepth == CV_64F ) - return makePtr, ColumnNoVec> > - (kernel, anchor, delta, symmetryType); - if( ddepth == CV_32F && sdepth == CV_32F ) - return makePtr, SymmColumnVec_32f> > - (kernel, anchor, delta, symmetryType, Cast(), - SymmColumnVec_32f(kernel, symmetryType, 0, delta)); - if( ddepth == CV_64F && sdepth == CV_64F ) - return makePtr, ColumnNoVec> > - (kernel, anchor, delta, symmetryType); - } - - CV_Error_( CV_StsNotImplemented, - ("Unsupported combination of buffer format (=%d), and destination format (=%d)", - bufType, dstType)); + Mat kernelMat = kernel.getMat(); + CV_CPU_DISPATCH(getLinearColumnFilter, (bufType, dstType, kernelMat, anchor, symmetryType, delta, bits), + CV_CPU_DISPATCH_MODES_ALL); } -cv::Ptr cv::createSeparableLinearFilter( - int _srcType, int _dstType, - InputArray __rowKernel, InputArray __columnKernel, - Point _anchor, double _delta, - int _rowBorderType, int _columnBorderType, - const Scalar& _borderValue ) +Ptr createSeparableLinearFilter( + int _srcType, int _dstType, + InputArray __rowKernel, InputArray __columnKernel, + Point _anchor, double _delta, + int _rowBorderType, int _columnBorderType, + const Scalar& _borderValue) { Mat _rowKernel = __rowKernel.getMat(), _columnKernel = __columnKernel.getMat(); _srcType = CV_MAT_TYPE(_srcType); @@ -3124,9 +345,6 @@ cv::Ptr cv::createSeparableLinearFilter( * Non-separable linear filter * \****************************************************************************************/ -namespace cv -{ - void preprocess2DKernel( const Mat& kernel, std::vector& coords, std::vector& coeffs ) { int i, j, k, nz = countNonZero(kernel), ktype = kernel.type(); @@ -3729,89 +947,25 @@ bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, #endif -} - -cv::Ptr cv::getLinearFilter(int srcType, int dstType, - 
InputArray filter_kernel, Point anchor, - double delta, int bits) +Ptr getLinearFilter( + int srcType, int dstType, + InputArray filter_kernel, Point anchor, + double delta, int bits) { - Mat _kernel = filter_kernel.getMat(); - int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType); - int cn = CV_MAT_CN(srcType), kdepth = _kernel.depth(); - CV_Assert( cn == CV_MAT_CN(dstType) && ddepth >= sdepth ); + CV_INSTRUMENT_REGION(); - anchor = normalizeAnchor(anchor, _kernel.size()); - - /*if( sdepth == CV_8U && ddepth == CV_8U && kdepth == CV_32S ) - return makePtr, FilterVec_8u> > - (_kernel, anchor, delta, FixedPtCastEx(bits), - FilterVec_8u(_kernel, bits, delta)); - if( sdepth == CV_8U && ddepth == CV_16S && kdepth == CV_32S ) - return makePtr, FilterVec_8u16s> > - (_kernel, anchor, delta, FixedPtCastEx(bits), - FilterVec_8u16s(_kernel, bits, delta));*/ - - kdepth = sdepth == CV_64F || ddepth == CV_64F ? CV_64F : CV_32F; - Mat kernel; - if( _kernel.type() == kdepth ) - kernel = _kernel; - else - _kernel.convertTo(kernel, kdepth, _kernel.type() == CV_32S ? 1./(1 << bits) : 1.); - - if( sdepth == CV_8U && ddepth == CV_8U ) - return makePtr, FilterVec_8u> > - (kernel, anchor, delta, Cast(), FilterVec_8u(kernel, 0, delta)); - if( sdepth == CV_8U && ddepth == CV_16U ) - return makePtr, FilterNoVec> >(kernel, anchor, delta); - if( sdepth == CV_8U && ddepth == CV_16S ) - return makePtr, FilterVec_8u16s> > - (kernel, anchor, delta, Cast(), FilterVec_8u16s(kernel, 0, delta)); - if( sdepth == CV_8U && ddepth == CV_32F ) - return makePtr, FilterNoVec> >(kernel, anchor, delta); - if( sdepth == CV_8U && ddepth == CV_64F ) - return makePtr, FilterNoVec> >(kernel, anchor, delta); - - if( sdepth == CV_16U && ddepth == CV_16U ) - return makePtr, FilterNoVec> >(kernel, anchor, delta); - if( sdepth == CV_16U && ddepth == CV_32F ) - return makePtr, FilterNoVec> >(kernel, anchor, delta); - if( sdepth == CV_16U && ddepth == CV_64F ) - return makePtr, FilterNoVec> >(kernel, anchor, delta); - - if( sdepth == CV_16S && ddepth == CV_16S ) - return makePtr, FilterNoVec> >(kernel, anchor, delta); - if( sdepth == CV_16S && ddepth == CV_32F ) - return makePtr, FilterNoVec> >(kernel, anchor, delta); - if( sdepth == CV_16S && ddepth == CV_64F ) - return makePtr, FilterNoVec> >(kernel, anchor, delta); - - if( sdepth == CV_32F && ddepth == CV_32F ) - return makePtr, FilterVec_32f> > - (kernel, anchor, delta, Cast(), FilterVec_32f(kernel, 0, delta)); - if( sdepth == CV_64F && ddepth == CV_64F ) - return makePtr, FilterNoVec> >(kernel, anchor, delta); - - CV_Error_( CV_StsNotImplemented, - ("Unsupported combination of source format (=%d), and destination format (=%d)", - srcType, dstType)); + Mat kernelMat = filter_kernel.getMat(); + CV_CPU_DISPATCH(getLinearFilter, (srcType, dstType, kernelMat, anchor, delta, bits), + CV_CPU_DISPATCH_MODES_ALL); } -cv::Ptr cv::createLinearFilter( int _srcType, int _dstType, - InputArray filter_kernel, - Point _anchor, double _delta, - int _rowBorderType, int _columnBorderType, - const Scalar& _borderValue ) +Ptr createLinearFilter( + int _srcType, int _dstType, + InputArray filter_kernel, + Point _anchor, double _delta, + int _rowBorderType, int _columnBorderType, + const Scalar& _borderValue) { Mat _kernel = filter_kernel.getMat(); _srcType = CV_MAT_TYPE(_srcType); @@ -3844,8 +998,6 @@ cv::Ptr cv::createLinearFilter( int _srcType, int _dstType, // HAL interface //================================================================ -using namespace cv; - static bool 
replacementFilter2D(int stype, int dtype, int kernel_type, uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, @@ -4083,7 +1235,6 @@ static void ocvSepFilter(int stype, int dtype, int ktype, // HAL functions //=================================================================== -namespace cv { namespace hal { @@ -4191,16 +1342,15 @@ void sepFilter2D(int stype, int dtype, int ktype, anchor_x, anchor_y, delta, borderType); } -} // cv::hal:: -} // cv:: +} // namespace cv::hal:: //================================================================ // Main interface //================================================================ -void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth, - InputArray _kernel, Point anchor0, - double delta, int borderType ) +void filter2D(InputArray _src, OutputArray _dst, int ddepth, + InputArray _kernel, Point anchor0, + double delta, int borderType) { CV_INSTRUMENT_REGION(); @@ -4229,9 +1379,9 @@ void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth, delta, borderType, src.isSubmatrix()); } -void cv::sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, - InputArray _kernelX, InputArray _kernelY, Point anchor, - double delta, int borderType ) +void sepFilter2D(InputArray _src, OutputArray _dst, int ddepth, + InputArray _kernelX, InputArray _kernelY, Point anchor, + double delta, int borderType) { CV_INSTRUMENT_REGION(); @@ -4266,6 +1416,7 @@ void cv::sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, anchor.x, anchor.y, delta, borderType & ~BORDER_ISOLATED); } +} // namespace CV_IMPL void cvFilter2D( const CvArr* srcarr, CvArr* dstarr, const CvMat* _kernel, CvPoint anchor ) diff --git a/modules/imgproc/src/filter.hpp b/modules/imgproc/src/filter.hpp index 93f3f177e6..198c8c336c 100644 --- a/modules/imgproc/src/filter.hpp +++ b/modules/imgproc/src/filter.hpp @@ -56,6 +56,8 @@ namespace cv InputArray _kernelX, InputArray _kernelY, Point anchor, double delta, int borderType ); #endif + + void preprocess2DKernel(const Mat& kernel, std::vector& coords, std::vector& coeffs); } #endif diff --git a/modules/imgproc/src/filter.simd.hpp b/modules/imgproc/src/filter.simd.hpp index 43200218dc..48675152fa 100644 --- a/modules/imgproc/src/filter.simd.hpp +++ b/modules/imgproc/src/filter.simd.hpp @@ -41,160 +41,85 @@ //M*/ #include "precomp.hpp" -#include "opencv2/core/opencl/ocl_defs.hpp" -#include "opencl_kernels_imgproc.hpp" -#include "hal_replacement.hpp" #include "opencv2/core/hal/intrin.hpp" #include "filter.hpp" +#if defined(CV_CPU_BASELINE_MODE) +#if IPP_VERSION_X100 >= 710 +#define USE_IPP_SEP_FILTERS 1 +#else +#undef USE_IPP_SEP_FILTERS +#endif +#endif + /****************************************************************************************\ Base Image Filter \****************************************************************************************/ -#if IPP_VERSION_X100 >= 710 -#define USE_IPP_SEP_FILTERS 1 -#else -#undef USE_IPP_SEP_FILTERS -#endif +namespace cv { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations +int FilterEngine__start(FilterEngine& this_, const Size &_wholeSize, const Size &sz, const Point &ofs); +int FilterEngine__proceed(FilterEngine& this_, const uchar* src, int srcstep, int count, + uchar* dst, int dststep); +void FilterEngine__apply(FilterEngine& this_, const Mat& src, Mat& dst, const Size& wsz, const Point& ofs); -namespace cv -{ +Ptr getLinearRowFilter( + int srcType, int bufType, + const Mat& kernel, int anchor, + int symmetryType); -BaseRowFilter::BaseRowFilter() { ksize = 
anchor = -1; } -BaseRowFilter::~BaseRowFilter() {} +Ptr getLinearColumnFilter( + int bufType, int dstType, + const Mat& kernel, int anchor, + int symmetryType, double delta, + int bits); -BaseColumnFilter::BaseColumnFilter() { ksize = anchor = -1; } -BaseColumnFilter::~BaseColumnFilter() {} -void BaseColumnFilter::reset() {} - -BaseFilter::BaseFilter() { ksize = Size(-1,-1); anchor = Point(-1,-1); } -BaseFilter::~BaseFilter() {} -void BaseFilter::reset() {} - -FilterEngine::FilterEngine() - : srcType(-1), dstType(-1), bufType(-1), maxWidth(0), wholeSize(-1, -1), dx1(0), dx2(0), - rowBorderType(BORDER_REPLICATE), columnBorderType(BORDER_REPLICATE), - borderElemSize(0), bufStep(0), startY(0), startY0(0), endY(0), rowCount(0), dstY(0) -{ -} +Ptr getLinearFilter( + int srcType, int dstType, + const Mat& filter_kernel, Point anchor, + double delta, int bits); -FilterEngine::FilterEngine( const Ptr& _filter2D, - const Ptr& _rowFilter, - const Ptr& _columnFilter, - int _srcType, int _dstType, int _bufType, - int _rowBorderType, int _columnBorderType, - const Scalar& _borderValue ) - : srcType(-1), dstType(-1), bufType(-1), maxWidth(0), wholeSize(-1, -1), dx1(0), dx2(0), - rowBorderType(BORDER_REPLICATE), columnBorderType(BORDER_REPLICATE), - borderElemSize(0), bufStep(0), startY(0), startY0(0), endY(0), rowCount(0), dstY(0) -{ - init(_filter2D, _rowFilter, _columnFilter, _srcType, _dstType, _bufType, - _rowBorderType, _columnBorderType, _borderValue); -} - -FilterEngine::~FilterEngine() -{ -} - - -void FilterEngine::init( const Ptr& _filter2D, - const Ptr& _rowFilter, - const Ptr& _columnFilter, - int _srcType, int _dstType, int _bufType, - int _rowBorderType, int _columnBorderType, - const Scalar& _borderValue ) -{ - _srcType = CV_MAT_TYPE(_srcType); - _bufType = CV_MAT_TYPE(_bufType); - _dstType = CV_MAT_TYPE(_dstType); - - srcType = _srcType; - int srcElemSize = (int)getElemSize(srcType); - dstType = _dstType; - bufType = _bufType; - - filter2D = _filter2D; - rowFilter = _rowFilter; - columnFilter = _columnFilter; - - if( _columnBorderType < 0 ) - _columnBorderType = _rowBorderType; - - rowBorderType = _rowBorderType; - columnBorderType = _columnBorderType; - - CV_Assert( columnBorderType != BORDER_WRAP ); - - if( isSeparable() ) - { - CV_Assert( rowFilter && columnFilter ); - ksize = Size(rowFilter->ksize, columnFilter->ksize); - anchor = Point(rowFilter->anchor, columnFilter->anchor); - } - else - { - CV_Assert( bufType == srcType ); - ksize = filter2D->ksize; - anchor = filter2D->anchor; - } - - CV_Assert( 0 <= anchor.x && anchor.x < ksize.width && - 0 <= anchor.y && anchor.y < ksize.height ); - - borderElemSize = srcElemSize/(CV_MAT_DEPTH(srcType) >= CV_32S ? 
sizeof(int) : 1); - int borderLength = std::max(ksize.width - 1, 1); - borderTab.resize(borderLength*borderElemSize); - - maxWidth = bufStep = 0; - constBorderRow.clear(); - - if( rowBorderType == BORDER_CONSTANT || columnBorderType == BORDER_CONSTANT ) - { - constBorderValue.resize(srcElemSize*borderLength); - int srcType1 = CV_MAKETYPE(CV_MAT_DEPTH(srcType), MIN(CV_MAT_CN(srcType), 4)); - scalarToRawData(_borderValue, &constBorderValue[0], srcType1, - borderLength*CV_MAT_CN(srcType)); - } - - wholeSize = Size(-1,-1); -} +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY #define VEC_ALIGN CV_MALLOC_ALIGN -int FilterEngine::start(const Size &_wholeSize, const Size &sz, const Point &ofs) +int FilterEngine__start(FilterEngine& this_, const Size &_wholeSize, const Size &sz, const Point &ofs) { + CV_INSTRUMENT_REGION(); + int i, j; - wholeSize = _wholeSize; - roi = Rect(ofs, sz); - CV_Assert( roi.x >= 0 && roi.y >= 0 && roi.width >= 0 && roi.height >= 0 && - roi.x + roi.width <= wholeSize.width && - roi.y + roi.height <= wholeSize.height ); + this_.wholeSize = _wholeSize; + this_.roi = Rect(ofs, sz); + CV_Assert( this_.roi.x >= 0 && this_.roi.y >= 0 && this_.roi.width >= 0 && this_.roi.height >= 0 && + this_.roi.x + this_.roi.width <= this_.wholeSize.width && + this_.roi.y + this_.roi.height <= this_.wholeSize.height ); - int esz = (int)getElemSize(srcType); - int bufElemSize = (int)getElemSize(bufType); - const uchar* constVal = !constBorderValue.empty() ? &constBorderValue[0] : 0; + int esz = (int)getElemSize(this_.srcType); + int bufElemSize = (int)getElemSize(this_.bufType); + const uchar* constVal = !this_.constBorderValue.empty() ? &this_.constBorderValue[0] : 0; - int _maxBufRows = std::max(ksize.height + 3, - std::max(anchor.y, - ksize.height-anchor.y-1)*2+1); + int _maxBufRows = std::max(this_.ksize.height + 3, + std::max(this_.anchor.y, + this_.ksize.height-this_.anchor.y-1)*2+1); - if( maxWidth < roi.width || _maxBufRows != (int)rows.size() ) + if (this_.maxWidth < this_.roi.width || _maxBufRows != (int)this_.rows.size() ) { - rows.resize(_maxBufRows); - maxWidth = std::max(maxWidth, roi.width); - int cn = CV_MAT_CN(srcType); - srcRow.resize(esz*(maxWidth + ksize.width - 1)); - if( columnBorderType == BORDER_CONSTANT ) + this_.rows.resize(_maxBufRows); + this_.maxWidth = std::max(this_.maxWidth, this_.roi.width); + int cn = CV_MAT_CN(this_.srcType); + this_.srcRow.resize(esz*(this_.maxWidth + this_.ksize.width - 1)); + if (this_.columnBorderType == BORDER_CONSTANT) { CV_Assert(constVal != NULL); - constBorderRow.resize(getElemSize(bufType)*(maxWidth + ksize.width - 1 + VEC_ALIGN)); - uchar *dst = alignPtr(&constBorderRow[0], VEC_ALIGN), *tdst; - int n = (int)constBorderValue.size(), N; - N = (maxWidth + ksize.width - 1)*esz; - tdst = isSeparable() ? &srcRow[0] : dst; + this_.constBorderRow.resize(getElemSize(this_.bufType)*(this_.maxWidth + this_.ksize.width - 1 + VEC_ALIGN)); + uchar *dst = alignPtr(&this_.constBorderRow[0], VEC_ALIGN); + int n = (int)this_.constBorderValue.size(); + int N = (this_.maxWidth + this_.ksize.width - 1)*esz; + uchar *tdst = this_.isSeparable() ? 
&this_.srcRow[0] : dst; for( i = 0; i < N; i += n ) { @@ -203,126 +128,113 @@ int FilterEngine::start(const Size &_wholeSize, const Size &sz, const Point &ofs tdst[i+j] = constVal[j]; } - if( isSeparable() ) - (*rowFilter)(&srcRow[0], dst, maxWidth, cn); + if (this_.isSeparable()) + (*this_.rowFilter)(&this_.srcRow[0], dst, this_.maxWidth, cn); } - int maxBufStep = bufElemSize*(int)alignSize(maxWidth + - (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN); - ringBuf.resize(maxBufStep*rows.size()+VEC_ALIGN); + int maxBufStep = bufElemSize*(int)alignSize(this_.maxWidth + + (!this_.isSeparable() ? this_.ksize.width - 1 : 0), VEC_ALIGN); + this_.ringBuf.resize(maxBufStep*this_.rows.size()+VEC_ALIGN); } // adjust bufstep so that the used part of the ring buffer stays compact in memory - bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN); + this_.bufStep = bufElemSize*(int)alignSize(this_.roi.width + (!this_.isSeparable() ? this_.ksize.width - 1 : 0), VEC_ALIGN); - dx1 = std::max(anchor.x - roi.x, 0); - dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0); + this_.dx1 = std::max(this_.anchor.x - this_.roi.x, 0); + this_.dx2 = std::max(this_.ksize.width - this_.anchor.x - 1 + this_.roi.x + this_.roi.width - this_.wholeSize.width, 0); // recompute border tables - if( dx1 > 0 || dx2 > 0 ) + if (this_.dx1 > 0 || this_.dx2 > 0) { - if( rowBorderType == BORDER_CONSTANT ) + if (this_.rowBorderType == BORDER_CONSTANT ) { CV_Assert(constVal != NULL); - int nr = isSeparable() ? 1 : (int)rows.size(); + int nr = this_.isSeparable() ? 1 : (int)this_.rows.size(); for( i = 0; i < nr; i++ ) { - uchar* dst = isSeparable() ? &srcRow[0] : alignPtr(&ringBuf[0],VEC_ALIGN) + bufStep*i; - memcpy( dst, constVal, dx1*esz ); - memcpy( dst + (roi.width + ksize.width - 1 - dx2)*esz, constVal, dx2*esz ); + uchar* dst = this_.isSeparable() ? 
&this_.srcRow[0] : alignPtr(&this_.ringBuf[0], VEC_ALIGN) + this_.bufStep*i; + memcpy(dst, constVal, this_.dx1*esz); + memcpy(dst + (this_.roi.width + this_.ksize.width - 1 - this_.dx2)*esz, constVal, this_.dx2*esz); } } else { - int xofs1 = std::min(roi.x, anchor.x) - roi.x; + int xofs1 = std::min(this_.roi.x, this_.anchor.x) - this_.roi.x; - int btab_esz = borderElemSize, wholeWidth = wholeSize.width; - int* btab = (int*)&borderTab[0]; + int btab_esz = this_.borderElemSize, wholeWidth = this_.wholeSize.width; + int* btab = (int*)&this_.borderTab[0]; - for( i = 0; i < dx1; i++ ) + for( i = 0; i < this_.dx1; i++ ) { - int p0 = (borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*btab_esz; + int p0 = (borderInterpolate(i-this_.dx1, wholeWidth, this_.rowBorderType) + xofs1)*btab_esz; for( j = 0; j < btab_esz; j++ ) btab[i*btab_esz + j] = p0 + j; } - for( i = 0; i < dx2; i++ ) + for( i = 0; i < this_.dx2; i++ ) { - int p0 = (borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*btab_esz; + int p0 = (borderInterpolate(wholeWidth + i, wholeWidth, this_.rowBorderType) + xofs1)*btab_esz; for( j = 0; j < btab_esz; j++ ) - btab[(i + dx1)*btab_esz + j] = p0 + j; + btab[(i + this_.dx1)*btab_esz + j] = p0 + j; } } } - rowCount = dstY = 0; - startY = startY0 = std::max(roi.y - anchor.y, 0); - endY = std::min(roi.y + roi.height + ksize.height - anchor.y - 1, wholeSize.height); - if( columnFilter ) - columnFilter->reset(); - if( filter2D ) - filter2D->reset(); + this_.rowCount = this_.dstY = 0; + this_.startY = this_.startY0 = std::max(this_.roi.y - this_.anchor.y, 0); + this_.endY = std::min(this_.roi.y + this_.roi.height + this_.ksize.height - this_.anchor.y - 1, this_.wholeSize.height); - return startY; + if (this_.columnFilter) + this_.columnFilter->reset(); + if (this_.filter2D) + this_.filter2D->reset(); + + return this_.startY; } -int FilterEngine::start(const Mat& src, const Size &wsz, const Point &ofs) +int FilterEngine__proceed(FilterEngine& this_, const uchar* src, int srcstep, int count, + uchar* dst, int dststep) { - start( wsz, src.size(), ofs); - return startY - ofs.y; -} + CV_INSTRUMENT_REGION(); -int FilterEngine::remainingInputRows() const -{ - return endY - startY - rowCount; -} + CV_DbgAssert(this_.wholeSize.width > 0 && this_.wholeSize.height > 0 ); -int FilterEngine::remainingOutputRows() const -{ - return roi.height - dstY; -} - -int FilterEngine::proceed( const uchar* src, int srcstep, int count, - uchar* dst, int dststep ) -{ - CV_Assert( wholeSize.width > 0 && wholeSize.height > 0 ); - - const int *btab = &borderTab[0]; - int esz = (int)getElemSize(srcType), btab_esz = borderElemSize; - uchar** brows = &rows[0]; - int bufRows = (int)rows.size(); - int cn = CV_MAT_CN(bufType); - int width = roi.width, kwidth = ksize.width; - int kheight = ksize.height, ay = anchor.y; - int _dx1 = dx1, _dx2 = dx2; - int width1 = roi.width + kwidth - 1; - int xofs1 = std::min(roi.x, anchor.x); - bool isSep = isSeparable(); - bool makeBorder = (_dx1 > 0 || _dx2 > 0) && rowBorderType != BORDER_CONSTANT; + const int *btab = &this_.borderTab[0]; + int esz = (int)getElemSize(this_.srcType), btab_esz = this_.borderElemSize; + uchar** brows = &this_.rows[0]; + int bufRows = (int)this_.rows.size(); + int cn = CV_MAT_CN(this_.bufType); + int width = this_.roi.width, kwidth = this_.ksize.width; + int kheight = this_.ksize.height, ay = this_.anchor.y; + int _dx1 = this_.dx1, _dx2 = this_.dx2; + int width1 = this_.roi.width + kwidth - 1; + int xofs1 = std::min(this_.roi.x, 
this_.anchor.x); + bool isSep = this_.isSeparable(); + bool makeBorder = (_dx1 > 0 || _dx2 > 0) && this_.rowBorderType != BORDER_CONSTANT; int dy = 0, i = 0; src -= xofs1*esz; - count = std::min(count, remainingInputRows()); + count = std::min(count, this_.remainingInputRows()); - CV_Assert( src && dst && count > 0 ); + CV_Assert(src && dst && count > 0); for(;; dst += dststep*i, dy += i) { - int dcount = bufRows - ay - startY - rowCount + roi.y; + int dcount = bufRows - ay - this_.startY - this_.rowCount + this_.roi.y; dcount = dcount > 0 ? dcount : bufRows - kheight + 1; dcount = std::min(dcount, count); count -= dcount; for( ; dcount-- > 0; src += srcstep ) { - int bi = (startY - startY0 + rowCount) % bufRows; - uchar* brow = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep; - uchar* row = isSep ? &srcRow[0] : brow; + int bi = (this_.startY - this_.startY0 + this_.rowCount) % bufRows; + uchar* brow = alignPtr(&this_.ringBuf[0], VEC_ALIGN) + bi*this_.bufStep; + uchar* row = isSep ? &this_.srcRow[0] : brow; - if( ++rowCount > bufRows ) + if (++this_.rowCount > bufRows) { - --rowCount; - ++startY; + --this_.rowCount; + ++this_.startY; } memcpy( row + _dx1*esz, src, (width1 - _dx2 - _dx1)*esz ); @@ -349,99 +261,55 @@ int FilterEngine::proceed( const uchar* src, int srcstep, int count, } if( isSep ) - (*rowFilter)(row, brow, width, CV_MAT_CN(srcType)); + (*this_.rowFilter)(row, brow, width, CV_MAT_CN(this_.srcType)); } - int max_i = std::min(bufRows, roi.height - (dstY + dy) + (kheight - 1)); + int max_i = std::min(bufRows, this_.roi.height - (this_.dstY + dy) + (kheight - 1)); for( i = 0; i < max_i; i++ ) { - int srcY = borderInterpolate(dstY + dy + i + roi.y - ay, - wholeSize.height, columnBorderType); + int srcY = borderInterpolate(this_.dstY + dy + i + this_.roi.y - ay, + this_.wholeSize.height, this_.columnBorderType); if( srcY < 0 ) // can happen only with constant border type - brows[i] = alignPtr(&constBorderRow[0], VEC_ALIGN); + brows[i] = alignPtr(&this_.constBorderRow[0], VEC_ALIGN); else { - CV_Assert( srcY >= startY ); - if( srcY >= startY + rowCount ) + CV_Assert(srcY >= this_.startY); + if( srcY >= this_.startY + this_.rowCount) break; - int bi = (srcY - startY0) % bufRows; - brows[i] = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep; + int bi = (srcY - this_.startY0) % bufRows; + brows[i] = alignPtr(&this_.ringBuf[0], VEC_ALIGN) + bi*this_.bufStep; } } if( i < kheight ) break; i -= kheight - 1; - if( isSeparable() ) - (*columnFilter)((const uchar**)brows, dst, dststep, i, roi.width*cn); + if (isSep) + (*this_.columnFilter)((const uchar**)brows, dst, dststep, i, this_.roi.width*cn); else - (*filter2D)((const uchar**)brows, dst, dststep, i, roi.width, cn); + (*this_.filter2D)((const uchar**)brows, dst, dststep, i, this_.roi.width, cn); } - dstY += dy; - CV_Assert( dstY <= roi.height ); + this_.dstY += dy; + CV_Assert(this_.dstY <= this_.roi.height); return dy; } -void FilterEngine::apply(const Mat& src, Mat& dst, const Size & wsz, const Point & ofs) +void FilterEngine__apply(FilterEngine& this_, const Mat& src, Mat& dst, const Size& wsz, const Point& ofs) { CV_INSTRUMENT_REGION(); - CV_Assert( src.type() == srcType && dst.type() == dstType ); + CV_DbgAssert(src.type() == this_.srcType && dst.type() == this_.dstType); - int y = start(src, wsz, ofs); - proceed(src.ptr() + y*src.step, + FilterEngine__start(this_, wsz, src.size(), ofs); + int y = this_.startY - ofs.y; + FilterEngine__proceed(this_, + src.ptr() + y*src.step, (int)src.step, - endY - startY, + this_.endY - this_.startY, 
dst.ptr(), (int)dst.step ); } -} - -/****************************************************************************************\ -* Separable linear filter * -\****************************************************************************************/ - -int cv::getKernelType(InputArray filter_kernel, Point anchor) -{ - Mat _kernel = filter_kernel.getMat(); - CV_Assert( _kernel.channels() == 1 ); - int i, sz = _kernel.rows*_kernel.cols; - - Mat kernel; - _kernel.convertTo(kernel, CV_64F); - - const double* coeffs = kernel.ptr(); - double sum = 0; - int type = KERNEL_SMOOTH + KERNEL_INTEGER; - if( (_kernel.rows == 1 || _kernel.cols == 1) && - anchor.x*2 + 1 == _kernel.cols && - anchor.y*2 + 1 == _kernel.rows ) - type |= (KERNEL_SYMMETRICAL + KERNEL_ASYMMETRICAL); - - for( i = 0; i < sz; i++ ) - { - double a = coeffs[i], b = coeffs[sz - i - 1]; - if( a != b ) - type &= ~KERNEL_SYMMETRICAL; - if( a != -b ) - type &= ~KERNEL_ASYMMETRICAL; - if( a < 0 ) - type &= ~KERNEL_SMOOTH; - if( a != saturate_cast(a) ) - type &= ~KERNEL_INTEGER; - sum += a; - } - - if( fabs(sum - 1) > FLT_EPSILON*(fabs(sum) + 1) ) - type &= ~KERNEL_SMOOTH; - return type; -} - - -namespace cv -{ - struct RowNoVec { RowNoVec() {} @@ -503,6 +371,8 @@ struct RowVec_8u32s int operator()(const uchar* _src, uchar* _dst, int width, int cn) const { + CV_INSTRUMENT_REGION(); + int i = 0, k, _ksize = kernel.rows + kernel.cols - 1; int* dst = (int*)_dst; const int* _kx = kernel.ptr(); @@ -587,7 +457,6 @@ struct RowVec_8u32s i += v_uint32::nlanes; } } - vx_cleanup(); return i; } @@ -618,6 +487,8 @@ struct SymmRowSmallVec_8u32s int operator()(const uchar* src, uchar* _dst, int width, int cn) const { + CV_INSTRUMENT_REGION(); + int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1; int* dst = (int*)_dst; bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; @@ -1083,8 +954,6 @@ struct SymmRowSmallVec_8u32s } } } - - vx_cleanup(); return i; } @@ -1107,6 +976,8 @@ struct SymmColumnVec_32s8u int operator()(const uchar** _src, uchar* dst, int width) const { + CV_INSTRUMENT_REGION(); + int _ksize = kernel.rows + kernel.cols - 1; if( _ksize == 1 ) return 0; @@ -1237,8 +1108,6 @@ struct SymmColumnVec_32s8u i += v_int32x4::nlanes; } } - - vx_cleanup(); return i; } @@ -1261,6 +1130,8 @@ struct SymmColumnSmallVec_32s16s int operator()(const uchar** _src, uchar* _dst, int width) const { + CV_INSTRUMENT_REGION(); + int ksize2 = (kernel.rows + kernel.cols - 1)/2; const float* ky = kernel.ptr() + ksize2; int i = 0; @@ -1420,8 +1291,6 @@ struct SymmColumnSmallVec_32s16s } } } - - vx_cleanup(); return i; } @@ -1443,6 +1312,8 @@ struct RowVec_16s32f int operator()(const uchar* _src, uchar* _dst, int width, int cn) const { + CV_INSTRUMENT_REGION(); + int i = 0, k, _ksize = kernel.rows + kernel.cols - 1; float* dst = (float*)_dst; const float* _kx = kernel.ptr(); @@ -1495,7 +1366,6 @@ struct RowVec_16s32f v_store(dst + i, s0); i += v_float32::nlanes; } - vx_cleanup(); return i; } @@ -1516,6 +1386,8 @@ struct SymmColumnVec_32f16s int operator()(const uchar** _src, uchar* _dst, int width) const { + CV_INSTRUMENT_REGION(); + int _ksize = kernel.rows + kernel.cols - 1; if( _ksize == 1 ) return 0; @@ -1620,7 +1492,6 @@ struct SymmColumnVec_32f16s } } - vx_cleanup(); return i; } @@ -1653,6 +1524,8 @@ struct RowVec_32f int operator()(const uchar* _src, uchar* _dst, int width, int cn) const { + CV_INSTRUMENT_REGION(); + #if defined USE_IPP_SEP_FILTERS CV_IPP_CHECK() { @@ -1722,7 +1595,6 @@ struct RowVec_32f v_store(dst + i, s0); i += v_float32::nlanes; } - 
vx_cleanup(); return i; } @@ -1782,6 +1654,8 @@ struct SymmRowSmallVec_32f int operator()(const uchar* _src, uchar* _dst, int width, int cn) const { + CV_INSTRUMENT_REGION(); + int i = 0, _ksize = kernel.rows + kernel.cols - 1; if( _ksize == 1 ) return 0; @@ -1868,8 +1742,6 @@ struct SymmRowSmallVec_32f v_store(dst + i, v_muladd(vx_load(src + 2*cn) - vx_load(src - 2*cn), k2, (vx_load(src + cn) - vx_load(src - cn)) * k1)); } } - - vx_cleanup(); return i; } @@ -1896,6 +1768,8 @@ struct SymmColumnVec_32f int operator()(const uchar** _src, uchar* _dst, int width) const { + CV_INSTRUMENT_REGION(); + int ksize2 = (kernel.rows + kernel.cols - 1)/2; const float* ky = kernel.ptr() + ksize2; int i = 0, k; @@ -2005,8 +1879,6 @@ struct SymmColumnVec_32f i += v_float32::nlanes; } } - - vx_cleanup(); return i; } @@ -2030,6 +1902,8 @@ struct SymmColumnSmallVec_32f int operator()(const uchar** _src, uchar* _dst, int width) const { + CV_INSTRUMENT_REGION(); + int ksize2 = (kernel.rows + kernel.cols - 1)/2; const float* ky = kernel.ptr() + ksize2; int i = 0; @@ -2085,8 +1959,6 @@ struct SymmColumnSmallVec_32f v_store(dst + i, v_muladd(vx_load(S2 + i) - vx_load(S0 + i), k1, d4)); } } - - vx_cleanup(); return i; } @@ -2115,6 +1987,8 @@ struct FilterVec_8u int operator()(const uchar** src, uchar* dst, int width) const { + CV_INSTRUMENT_REGION(); + CV_DbgAssert(_nz > 0); const float* kf = (const float*)&coeffs[0]; int i = 0, k, nz = _nz; @@ -2175,8 +2049,6 @@ struct FilterVec_8u *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0(); i += v_int32x4::nlanes; } - - vx_cleanup(); return i; } @@ -2201,6 +2073,8 @@ struct FilterVec_8u16s int operator()(const uchar** src, uchar* _dst, int width) const { + CV_INSTRUMENT_REGION(); + CV_DbgAssert(_nz > 0); const float* kf = (const float*)&coeffs[0]; short* dst = (short*)_dst; @@ -2251,8 +2125,6 @@ struct FilterVec_8u16s v_pack_store(dst + i, v_round(s0)); i += v_int32::nlanes; } - - vx_cleanup(); return i; } @@ -2275,6 +2147,8 @@ struct FilterVec_32f int operator()(const uchar** _src, uchar* _dst, int width) const { + CV_INSTRUMENT_REGION(); + const float* kf = (const float*)&coeffs[0]; const float** src = (const float**)_src; float* dst = (float*)_dst; @@ -2323,8 +2197,6 @@ struct FilterVec_32f v_store(dst + i, s0); i += v_float32::nlanes; } - - vx_cleanup(); return i; } @@ -2369,6 +2241,8 @@ template struct RowFilter : public BaseRo void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int _ksize = ksize; const DT* kx = kernel.ptr
(); const ST* S; @@ -2427,6 +2301,8 @@ template struct SymmRowSmallFilter : void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int ksize2 = this->ksize/2, ksize2n = ksize2*cn; const DT* kx = this->kernel.template ptr
() + ksize2; bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0; @@ -2566,6 +2442,8 @@ template struct ColumnFilter : public BaseColumnFilte void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + const ST* ky = kernel.template ptr(); ST _delta = delta; int _ksize = ksize; @@ -2629,6 +2507,8 @@ template struct SymmColumnFilter : public ColumnFilte void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int ksize2 = this->ksize/2; const ST* ky = this->kernel.template ptr() + ksize2; int i, k; @@ -2735,6 +2615,8 @@ struct SymmColumnSmallFilter : public SymmColumnFilter void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int ksize2 = this->ksize/2; const ST* ky = this->kernel.template ptr() + ksize2; int i; @@ -2904,13 +2786,14 @@ template struct FixedPtCastEx int SHIFT, DELTA; }; -} -cv::Ptr cv::getLinearRowFilter( int srcType, int bufType, - InputArray _kernel, int anchor, - int symmetryType ) +Ptr getLinearRowFilter( + int srcType, int bufType, + const Mat& kernel, int anchor, + int symmetryType) { - Mat kernel = _kernel.getMat(); + CV_INSTRUMENT_REGION(); + int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(bufType); int cn = CV_MAT_CN(srcType); CV_Assert( cn == CV_MAT_CN(bufType) && @@ -2958,12 +2841,14 @@ cv::Ptr cv::getLinearRowFilter( int srcType, int bufType, } -cv::Ptr cv::getLinearColumnFilter( int bufType, int dstType, - InputArray _kernel, int anchor, - int symmetryType, double delta, - int bits ) +Ptr getLinearColumnFilter( + int bufType, int dstType, + const Mat& kernel, int anchor, + int symmetryType, double delta, + int bits) { - Mat kernel = _kernel.getMat(); + CV_INSTRUMENT_REGION(); + int sdepth = CV_MAT_DEPTH(bufType), ddepth = CV_MAT_DEPTH(dstType); int cn = CV_MAT_CN(dstType); CV_Assert( cn == CV_MAT_CN(bufType) && @@ -3053,131 +2938,6 @@ cv::Ptr cv::getLinearColumnFilter( int bufType, int dstTyp } -cv::Ptr cv::createSeparableLinearFilter( - int _srcType, int _dstType, - InputArray __rowKernel, InputArray __columnKernel, - Point _anchor, double _delta, - int _rowBorderType, int _columnBorderType, - const Scalar& _borderValue ) -{ - Mat _rowKernel = __rowKernel.getMat(), _columnKernel = __columnKernel.getMat(); - _srcType = CV_MAT_TYPE(_srcType); - _dstType = CV_MAT_TYPE(_dstType); - int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType); - int cn = CV_MAT_CN(_srcType); - CV_Assert( cn == CV_MAT_CN(_dstType) ); - int rsize = _rowKernel.rows + _rowKernel.cols - 1; - int csize = _columnKernel.rows + _columnKernel.cols - 1; - if( _anchor.x < 0 ) - _anchor.x = rsize/2; - if( _anchor.y < 0 ) - _anchor.y = csize/2; - int rtype = getKernelType(_rowKernel, - _rowKernel.rows == 1 ? Point(_anchor.x, 0) : Point(0, _anchor.x)); - int ctype = getKernelType(_columnKernel, - _columnKernel.rows == 1 ? Point(_anchor.y, 0) : Point(0, _anchor.y)); - Mat rowKernel, columnKernel; - - int bdepth = std::max(CV_32F,std::max(sdepth, ddepth)); - int bits = 0; - - if( sdepth == CV_8U && - ((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && - ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && - ddepth == CV_8U) || - ((rtype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) && - (ctype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) && - (rtype & ctype & KERNEL_INTEGER) && - ddepth == CV_16S)) ) - { - bdepth = CV_32S; - bits = ddepth == CV_8U ? 
8 : 0; - _rowKernel.convertTo( rowKernel, CV_32S, 1 << bits ); - _columnKernel.convertTo( columnKernel, CV_32S, 1 << bits ); - bits *= 2; - _delta *= (1 << bits); - } - else - { - if( _rowKernel.type() != bdepth ) - _rowKernel.convertTo( rowKernel, bdepth ); - else - rowKernel = _rowKernel; - if( _columnKernel.type() != bdepth ) - _columnKernel.convertTo( columnKernel, bdepth ); - else - columnKernel = _columnKernel; - } - - int _bufType = CV_MAKETYPE(bdepth, cn); - Ptr _rowFilter = getLinearRowFilter( - _srcType, _bufType, rowKernel, _anchor.x, rtype); - Ptr _columnFilter = getLinearColumnFilter( - _bufType, _dstType, columnKernel, _anchor.y, ctype, _delta, bits ); - - return Ptr( new FilterEngine(Ptr(), _rowFilter, _columnFilter, - _srcType, _dstType, _bufType, _rowBorderType, _columnBorderType, _borderValue )); -} - - -/****************************************************************************************\ -* Non-separable linear filter * -\****************************************************************************************/ - -namespace cv -{ - -void preprocess2DKernel( const Mat& kernel, std::vector& coords, std::vector& coeffs ) -{ - int i, j, k, nz = countNonZero(kernel), ktype = kernel.type(); - if(nz == 0) - nz = 1; - CV_Assert( ktype == CV_8U || ktype == CV_32S || ktype == CV_32F || ktype == CV_64F ); - coords.resize(nz); - coeffs.resize(nz*getElemSize(ktype)); - uchar* _coeffs = &coeffs[0]; - - for( i = k = 0; i < kernel.rows; i++ ) - { - const uchar* krow = kernel.ptr(i); - for( j = 0; j < kernel.cols; j++ ) - { - if( ktype == CV_8U ) - { - uchar val = krow[j]; - if( val == 0 ) - continue; - coords[k] = Point(j,i); - _coeffs[k++] = val; - } - else if( ktype == CV_32S ) - { - int val = ((const int*)krow)[j]; - if( val == 0 ) - continue; - coords[k] = Point(j,i); - ((int*)_coeffs)[k++] = val; - } - else if( ktype == CV_32F ) - { - float val = ((const float*)krow)[j]; - if( val == 0 ) - continue; - coords[k] = Point(j,i); - ((float*)_coeffs)[k++] = val; - } - else - { - double val = ((const double*)krow)[j]; - if( val == 0 ) - continue; - coords[k] = Point(j,i); - ((double*)_coeffs)[k++] = val; - } - } - } -} - template struct Filter2D : public BaseFilter { @@ -3253,489 +3013,14 @@ template struct Filter2D : public BaseFi VecOp vecOp; }; -#ifdef HAVE_OPENCL -#define DIVUP(total, grain) (((total) + (grain) - 1) / (grain)) -#define ROUNDUP(sz, n) ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n))) - -// prepare kernel: transpose and make double rows (+align). 
Returns size of aligned row -// Samples: -// a b c -// Input: d e f -// g h i -// Output, last two zeros is the alignment: -// a d g a d g 0 0 -// b e h b e h 0 0 -// c f i c f i 0 0 -template -static int _prepareKernelFilter2D(std::vector & data, const Mat & kernel) +Ptr getLinearFilter( + int srcType, int dstType, + const Mat& _kernel, Point anchor, + double delta, int bits) { - Mat _kernel; kernel.convertTo(_kernel, DataDepth::value); - int size_y_aligned = ROUNDUP(kernel.rows * 2, 4); - data.clear(); data.resize(size_y_aligned * kernel.cols, 0); - for (int x = 0; x < kernel.cols; x++) - { - for (int y = 0; y < kernel.rows; y++) - { - data[x * size_y_aligned + y] = _kernel.at(y, x); - data[x * size_y_aligned + y + kernel.rows] = _kernel.at(y, x); - } - } - return size_y_aligned; -} + CV_INSTRUMENT_REGION(); -static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth, - InputArray _kernel, Point anchor, - double delta, int borderType ) -{ - int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - ddepth = ddepth < 0 ? sdepth : ddepth; - int dtype = CV_MAKE_TYPE(ddepth, cn), wdepth = std::max(std::max(sdepth, ddepth), CV_32F), - wtype = CV_MAKE_TYPE(wdepth, cn); - if (cn > 4) - return false; - - Size ksize = _kernel.size(); - if (anchor.x < 0) - anchor.x = ksize.width / 2; - if (anchor.y < 0) - anchor.y = ksize.height / 2; - - bool isolated = (borderType & BORDER_ISOLATED) != 0; - borderType &= ~BORDER_ISOLATED; - const cv::ocl::Device &device = cv::ocl::Device::getDefault(); - bool doubleSupport = device.doubleFPConfig() > 0; - if (wdepth == CV_64F && !doubleSupport) - return false; - - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", - "BORDER_WRAP", "BORDER_REFLECT_101" }; - - cv::Mat kernelMat = _kernel.getMat(); - cv::Size sz = _src.size(), wholeSize; - size_t globalsize[2] = { (size_t)sz.width, (size_t)sz.height }; - size_t localsize_general[2] = {0, 1}; - size_t* localsize = NULL; - - ocl::Kernel k; - UMat src = _src.getUMat(); - if (!isolated) - { - Point ofs; - src.locateROI(wholeSize, ofs); - } - - size_t tryWorkItems = device.maxWorkGroupSize(); - if (device.isIntel() && 128 < tryWorkItems) - tryWorkItems = 128; - char cvt[2][40]; - - // For smaller filter kernels, there is a special kernel that is more - // efficient than the general one. - UMat kernalDataUMat; - if (device.isIntel() && (device.type() & ocl::Device::TYPE_GPU) && - ((ksize.width < 5 && ksize.height < 5) || - (ksize.width == 5 && ksize.height == 5 && cn == 1))) - { - kernelMat = kernelMat.reshape(0, 1); - String kerStr = ocl::kernelToStr(kernelMat, CV_32F); - int h = isolated ? sz.height : wholeSize.height; - int w = isolated ? sz.width : wholeSize.width; - - if (w < ksize.width || h < ksize.height) - return false; - - // Figure out what vector size to use for loading the pixels. - int pxLoadNumPixels = cn != 1 || sz.width % 4 ? 1 : 4; - int pxLoadVecSize = cn * pxLoadNumPixels; - - // Figure out how many pixels per work item to compute in X and Y - // directions. Too many and we run out of registers. - int pxPerWorkItemX = 1; - int pxPerWorkItemY = 1; - if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4) - { - pxPerWorkItemX = sz.width % 8 ? sz.width % 4 ? sz.width % 2 ? 1 : 2 : 4 : 8; - pxPerWorkItemY = sz.height % 2 ? 1 : 2; - } - else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4)) - { - pxPerWorkItemX = sz.width % 2 ? 1 : 2; - pxPerWorkItemY = sz.height % 2 ? 
1 : 2; - } - globalsize[0] = sz.width / pxPerWorkItemX; - globalsize[1] = sz.height / pxPerWorkItemY; - - // Need some padding in the private array for pixels - int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels); - - // Make the global size a nice round number so the runtime can pick - // from reasonable choices for the workgroup size - const int wgRound = 256; - globalsize[0] = ROUNDUP(globalsize[0], wgRound); - - char build_options[1024]; - sprintf(build_options, "-D cn=%d " - "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " - "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d " - "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s " - "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d " - "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s " - "-D convertToWT=%s -D convertToDstT=%s %s", - cn, anchor.x, anchor.y, ksize.width, ksize.height, - pxLoadVecSize, pxLoadNumPixels, - pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType], - isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", - privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1, - ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), - ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), - ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), kerStr.c_str()); - - if (!k.create("filter2DSmall", cv::ocl::imgproc::filter2DSmall_oclsrc, build_options)) - return false; - } - else - { - localsize = localsize_general; - std::vector kernelMatDataFloat; - int kernel_size_y2_aligned = _prepareKernelFilter2D(kernelMatDataFloat, kernelMat); - String kerStr = ocl::kernelToStr(kernelMatDataFloat, CV_32F); - - for ( ; ; ) - { - size_t BLOCK_SIZE = tryWorkItems; - while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)sz.width * 2) - BLOCK_SIZE /= 2; - - if ((size_t)ksize.width > BLOCK_SIZE) - return false; - - int requiredTop = anchor.y; - int requiredLeft = (int)BLOCK_SIZE; // not this: anchor.x; - int requiredBottom = ksize.height - 1 - anchor.y; - int requiredRight = (int)BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x; - int h = isolated ? sz.height : wholeSize.height; - int w = isolated ? sz.width : wholeSize.width; - bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight; - - if ((w < ksize.width) || (h < ksize.height)) - return false; - - String opts = format("-D LOCAL_SIZE=%d -D cn=%d " - "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " - "-D KERNEL_SIZE_Y2_ALIGNED=%d -D %s -D %s -D %s%s%s " - "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s " - "-D convertToWT=%s -D convertToDstT=%s", - (int)BLOCK_SIZE, cn, anchor.x, anchor.y, - ksize.width, ksize.height, kernel_size_y2_aligned, borderMap[borderType], - extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", - isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", - doubleSupport ? 
" -D DOUBLE_SUPPORT" : "", kerStr.c_str(), - ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), - ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), - ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1])); - - localsize[0] = BLOCK_SIZE; - globalsize[0] = DIVUP(sz.width, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE; - globalsize[1] = sz.height; - - if (!k.create("filter2D", cv::ocl::imgproc::filter2D_oclsrc, opts)) - return false; - - size_t kernelWorkGroupSize = k.workGroupSize(); - if (localsize[0] <= kernelWorkGroupSize) - break; - if (BLOCK_SIZE < kernelWorkGroupSize) - return false; - tryWorkItems = kernelWorkGroupSize; - } - } - - _dst.create(sz, dtype); - UMat dst = _dst.getUMat(); - - int srcOffsetX = (int)((src.offset % src.step) / src.elemSize()); - int srcOffsetY = (int)(src.offset / src.step); - int srcEndX = (isolated ? (srcOffsetX + sz.width) : wholeSize.width); - int srcEndY = (isolated ? (srcOffsetY + sz.height) : wholeSize.height); - - k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffsetX, srcOffsetY, - srcEndX, srcEndY, ocl::KernelArg::WriteOnly(dst), (float)delta); - - return k.run(2, globalsize, localsize, false); -} - -const int shift_bits = 8; - -static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor, - int borderType, int ddepth, bool fast8uc1, bool int_arithm) -{ - int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - Size bufSize = buf.size(); - int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type); - - if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) - return false; - -#ifdef __ANDROID__ - size_t localsize[2] = {16, 10}; -#else - size_t localsize[2] = {16, 16}; -#endif - - size_t globalsize[2] = {DIVUP(bufSize.width, localsize[0]) * localsize[0], DIVUP(bufSize.height, localsize[1]) * localsize[1]}; - if (fast8uc1) - globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0]; - - int radiusX = anchor, radiusY = (buf.rows - src.rows) >> 1; - - bool isolated = (borderType & BORDER_ISOLATED) != 0; - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" }, - * const btype = borderMap[borderType & ~BORDER_ISOLATED]; - - bool extra_extrapolation = src.rows < (int)((-radiusY + globalsize[1]) >> 1) + 1; - extra_extrapolation |= src.rows < radiusY; - extra_extrapolation |= src.cols < (int)((-radiusX + globalsize[0] + 8 * localsize[0] + 3) >> 1) + 1; - extra_extrapolation |= src.cols < radiusX; - - char cvt[40]; - cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s" - " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s%s", - radiusX, (int)localsize[0], (int)localsize[1], cn, btype, - extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", - isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", - ocl::typeToStr(type), ocl::typeToStr(buf_type), - ocl::convertTypeStr(sdepth, bdepth, cn, cvt), - ocl::typeToStr(sdepth), ocl::typeToStr(bdepth), - doubleSupport ? " -D DOUBLE_SUPPORT" : "", - int_arithm ? 
" -D INTEGER_ARITHMETIC" : ""); - build_options += ocl::kernelToStr(kernelX, bdepth); - - Size srcWholeSize; Point srcOffset; - src.locateROI(srcWholeSize, srcOffset); - - String kernelName("row_filter"); - if (fast8uc1) - kernelName += "_C1_D0"; - - ocl::Kernel k(kernelName.c_str(), cv::ocl::imgproc::filterSepRow_oclsrc, - build_options); - if (k.empty()) - return false; - - if (fast8uc1) - k.args(ocl::KernelArg::PtrReadOnly(src), (int)(src.step / src.elemSize()), srcOffset.x, - srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height, - ocl::KernelArg::PtrWriteOnly(buf), (int)(buf.step / buf.elemSize()), - buf.cols, buf.rows, radiusY); - else - k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffset.x, - srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height, - ocl::KernelArg::PtrWriteOnly(buf), (int)buf.step, buf.cols, buf.rows, radiusY); - - return k.run(2, globalsize, localsize, false); -} - -static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, bool int_arithm) -{ - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - if (dst.depth() == CV_64F && !doubleSupport) - return false; - -#ifdef __ANDROID__ - size_t localsize[2] = { 16, 10 }; -#else - size_t localsize[2] = { 16, 16 }; -#endif - size_t globalsize[2] = { 0, 0 }; - - int dtype = dst.type(), cn = CV_MAT_CN(dtype), ddepth = CV_MAT_DEPTH(dtype); - Size sz = dst.size(); - int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type); - - globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1]; - globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0]; - - char cvt[40]; - cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d" - " -D srcT=%s -D dstT=%s -D convertToDstT=%s" - " -D srcT1=%s -D dstT1=%s -D SHIFT_BITS=%d%s%s", - anchor, (int)localsize[0], (int)localsize[1], cn, - ocl::typeToStr(buf_type), ocl::typeToStr(dtype), - ocl::convertTypeStr(bdepth, ddepth, cn, cvt), - ocl::typeToStr(bdepth), ocl::typeToStr(ddepth), - 2*shift_bits, doubleSupport ? " -D DOUBLE_SUPPORT" : "", - int_arithm ? 
" -D INTEGER_ARITHMETIC" : ""); - build_options += ocl::kernelToStr(kernelY, bdepth); - - ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, - build_options); - if (k.empty()) - return false; - - k.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnly(dst), - static_cast(delta)); - - return k.run(2, globalsize, localsize, false); -} - -const int optimizedSepFilterLocalWidth = 16; -const int optimizedSepFilterLocalHeight = 8; - -static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, - Mat row_kernel, Mat col_kernel, - double delta, int borderType, int ddepth, int bdepth, bool int_arithm) -{ - Size size = _src.size(), wholeSize; - Point origin; - int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), - esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), bdepth), - dtype = CV_MAKE_TYPE(ddepth, cn); - size_t src_step = _src.step(), src_offset = _src.offset(); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - - if (esz == 0 || src_step == 0 - || (src_offset % src_step) % esz != 0 - || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) - || !(borderType == BORDER_CONSTANT - || borderType == BORDER_REPLICATE - || borderType == BORDER_REFLECT - || borderType == BORDER_WRAP - || borderType == BORDER_REFLECT_101)) - return false; - - size_t lt2[2] = { optimizedSepFilterLocalWidth, optimizedSepFilterLocalHeight }; - size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1]}; - - char cvt[2][40]; - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", - "BORDER_REFLECT_101" }; - - String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s" - " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s" - " -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s -D CN=%d -D SHIFT_BITS=%d%s", - (int)lt2[0], (int)lt2[1], row_kernel.cols / 2, col_kernel.cols / 2, - ocl::kernelToStr(row_kernel, wdepth, "KERNEL_MATRIX_X").c_str(), - ocl::kernelToStr(col_kernel, wdepth, "KERNEL_MATRIX_Y").c_str(), - ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), - ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType], - ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), ocl::typeToStr(wdepth), - cn, 2*shift_bits, int_arithm ? 
" -D INTEGER_ARITHMETIC" : ""); - - ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts); - if (k.empty()) - return false; - - UMat src = _src.getUMat(); - _dst.create(size, dtype); - UMat dst = _dst.getUMat(); - - int src_offset_x = static_cast((src_offset % src_step) / esz); - int src_offset_y = static_cast(src_offset / src_step); - - src.locateROI(wholeSize, origin); - - k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y, - wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst), - static_cast(delta)); - - return k.run(2, gt2, lt2, false); -} - -bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, - InputArray _kernelX, InputArray _kernelY, Point anchor, - double delta, int borderType ) -{ - const ocl::Device & d = ocl::Device::getDefault(); - Size imgSize = _src.size(); - - int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - if (cn > 4) - return false; - - Mat kernelX = _kernelX.getMat().reshape(1, 1); - if (kernelX.cols % 2 != 1) - return false; - Mat kernelY = _kernelY.getMat().reshape(1, 1); - if (kernelY.cols % 2 != 1) - return false; - - if (ddepth < 0) - ddepth = sdepth; - - if (anchor.x < 0) - anchor.x = kernelX.cols >> 1; - if (anchor.y < 0) - anchor.y = kernelY.cols >> 1; - - int rtype = getKernelType(kernelX, - kernelX.rows == 1 ? Point(anchor.x, 0) : Point(0, anchor.x)); - int ctype = getKernelType(kernelY, - kernelY.rows == 1 ? Point(anchor.y, 0) : Point(0, anchor.y)); - - int bdepth = CV_32F; - bool int_arithm = false; - if( sdepth == CV_8U && ddepth == CV_8U && - rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && - ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL) - { - if (ocl::Device::getDefault().isIntel()) - { - for (int i=0; i(0, i) = (float) cvRound(kernelX.at(0, i) * (1 << shift_bits)); - if (kernelX.data != kernelY.data) - for (int i=0; i(0, i) = (float) cvRound(kernelY.at(0, i) * (1 << shift_bits)); - } else - { - bdepth = CV_32S; - kernelX.convertTo( kernelX, bdepth, 1 << shift_bits ); - kernelY.convertTo( kernelY, bdepth, 1 << shift_bits ); - } - int_arithm = true; - } - - CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && - imgSize.width > optimizedSepFilterLocalWidth + anchor.x && - imgSize.height > optimizedSepFilterLocalHeight + anchor.y && - (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && - anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) && - OCL_PERFORMANCE_CHECK(d.isIntel()), // TODO FIXIT - ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta, - borderType & ~BORDER_ISOLATED, ddepth, bdepth, int_arithm), true) - - UMat src = _src.getUMat(); - Size srcWholeSize; Point srcOffset; - src.locateROI(srcWholeSize, srcOffset); - - bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 && - src.cols % 4 == 0 && src.step % 4 == 0; - - Size srcSize = src.size(); - Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1); - UMat buf(bufSize, CV_MAKETYPE(bdepth, cn)); - if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm)) - return false; - - _dst.create(srcSize, CV_MAKETYPE(ddepth, cn)); - UMat dst = _dst.getUMat(); - - return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm); -} - -#endif - -} - -cv::Ptr cv::getLinearFilter(int srcType, int dstType, - InputArray filter_kernel, Point anchor, - double delta, int bits) -{ - Mat _kernel = filter_kernel.getMat(); int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType); int cn = CV_MAT_CN(srcType), kdepth = 
_kernel.depth(); CV_Assert( cn == CV_MAT_CN(dstType) && ddepth >= sdepth ); @@ -3806,476 +3091,6 @@ cv::Ptr cv::getLinearFilter(int srcType, int dstType, srcType, dstType)); } - -cv::Ptr cv::createLinearFilter( int _srcType, int _dstType, - InputArray filter_kernel, - Point _anchor, double _delta, - int _rowBorderType, int _columnBorderType, - const Scalar& _borderValue ) -{ - Mat _kernel = filter_kernel.getMat(); - _srcType = CV_MAT_TYPE(_srcType); - _dstType = CV_MAT_TYPE(_dstType); - int cn = CV_MAT_CN(_srcType); - CV_Assert( cn == CV_MAT_CN(_dstType) ); - - Mat kernel = _kernel; - int bits = 0; - - /*int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType); - int ktype = _kernel.depth() == CV_32S ? KERNEL_INTEGER : getKernelType(_kernel, _anchor); - if( sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S) && - _kernel.rows*_kernel.cols <= (1 << 10) ) - { - bits = (ktype & KERNEL_INTEGER) ? 0 : 11; - _kernel.convertTo(kernel, CV_32S, 1 << bits); - }*/ - - Ptr _filter2D = getLinearFilter(_srcType, _dstType, - kernel, _anchor, _delta, bits); - - return makePtr(_filter2D, Ptr(), - Ptr(), _srcType, _dstType, _srcType, - _rowBorderType, _columnBorderType, _borderValue ); -} - - -//================================================================ -// HAL interface -//================================================================ - -using namespace cv; - -static bool replacementFilter2D(int stype, int dtype, int kernel_type, - uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernel_data, size_t kernel_step, - int kernel_width, int kernel_height, - int anchor_x, int anchor_y, - double delta, int borderType, bool isSubmatrix) -{ - cvhalFilter2D* ctx; - int res = cv_hal_filterInit(&ctx, kernel_data, kernel_step, kernel_type, kernel_width, kernel_height, width, height, - stype, dtype, borderType, delta, anchor_x, anchor_y, isSubmatrix, src_data == dst_data); - if (res != CV_HAL_ERROR_OK) - return false; - res = cv_hal_filter(ctx, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - bool success = (res == CV_HAL_ERROR_OK); - res = cv_hal_filterFree(ctx); - if (res != CV_HAL_ERROR_OK) - return false; - return success; -} - -#ifdef HAVE_IPP -static bool ippFilter2D(int stype, int dtype, int kernel_type, - uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernel_data, size_t kernel_step, - int kernel_width, int kernel_height, - int anchor_x, int anchor_y, - double delta, int borderType, - bool isSubmatrix) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP(); - - ::ipp::IwiSize iwSize(width, height); - ::ipp::IwiSize kernelSize(kernel_width, kernel_height); - IppDataType type = ippiGetDataType(CV_MAT_DEPTH(stype)); - int channels = CV_MAT_CN(stype); - - CV_UNUSED(isSubmatrix); - -#if IPP_VERSION_X100 >= 201700 && IPP_VERSION_X100 <= 201702 // IPP bug with 1x1 kernel - if(kernel_width == 1 && kernel_height == 1) - return false; #endif - -#if IPP_DISABLE_FILTER2D_BIG_MASK - // Too big difference compared to OpenCV FFT-based convolution - if(kernel_type == CV_32FC1 && (type == ipp16s || type == ipp16u) && (kernel_width > 7 || kernel_height > 7)) - return false; - - // Poor optimization for big kernels - if(kernel_width > 7 || kernel_height > 7) - return false; -#endif - - if(src_data == dst_data) - 
return false; - - if(stype != dtype) - return false; - - if(kernel_type != CV_16SC1 && kernel_type != CV_32FC1) - return false; - - // TODO: Implement offset for 8u, 16u - if(std::fabs(delta) >= DBL_EPSILON) - return false; - - if(!ippiCheckAnchor(anchor_x, anchor_y, kernel_width, kernel_height)) - return false; - - try - { - ::ipp::IwiBorderSize iwBorderSize; - ::ipp::IwiBorderType iwBorderType; - ::ipp::IwiImage iwKernel(ippiSize(kernel_width, kernel_height), ippiGetDataType(CV_MAT_DEPTH(kernel_type)), CV_MAT_CN(kernel_type), 0, (void*)kernel_data, kernel_step); - ::ipp::IwiImage iwSrc(iwSize, type, channels, ::ipp::IwiBorderSize(offset_x, offset_y, full_width-offset_x-width, full_height-offset_y-height), (void*)src_data, src_step); - ::ipp::IwiImage iwDst(iwSize, type, channels, ::ipp::IwiBorderSize(offset_x, offset_y, full_width-offset_x-width, full_height-offset_y-height), (void*)dst_data, dst_step); - - iwBorderSize = ::ipp::iwiSizeToBorderSize(kernelSize); - iwBorderType = ippiGetBorder(iwSrc, borderType, iwBorderSize); - if(!iwBorderType) - return false; - - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilter, iwSrc, iwDst, iwKernel, ::ipp::IwiFilterParams(1, 0, ippAlgHintNone, ippRndFinancial), iwBorderType); - } - catch(const ::ipp::IwException& ex) - { - CV_UNUSED(ex); - return false; - } - - return true; -#else - CV_UNUSED(stype); CV_UNUSED(dtype); CV_UNUSED(kernel_type); CV_UNUSED(src_data); CV_UNUSED(src_step); - CV_UNUSED(dst_data); CV_UNUSED(dst_step); CV_UNUSED(width); CV_UNUSED(height); CV_UNUSED(full_width); - CV_UNUSED(full_height); CV_UNUSED(offset_x); CV_UNUSED(offset_y); CV_UNUSED(kernel_data); CV_UNUSED(kernel_step); - CV_UNUSED(kernel_width); CV_UNUSED(kernel_height); CV_UNUSED(anchor_x); CV_UNUSED(anchor_y); CV_UNUSED(delta); - CV_UNUSED(borderType); CV_UNUSED(isSubmatrix); - return false; -#endif -} -#endif - -static bool dftFilter2D(int stype, int dtype, int kernel_type, - uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernel_data, size_t kernel_step, - int kernel_width, int kernel_height, - int anchor_x, int anchor_y, - double delta, int borderType) -{ - { - int sdepth = CV_MAT_DEPTH(stype); - int ddepth = CV_MAT_DEPTH(dtype); - int dft_filter_size = checkHardwareSupport(CV_CPU_SSE3) && ((sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) || (sdepth == CV_32F && ddepth == CV_32F)) ? 130 : 50; - if (kernel_width * kernel_height < dft_filter_size) - return false; - } - - Point anchor = Point(anchor_x, anchor_y); - Mat kernel = Mat(Size(kernel_width, kernel_height), kernel_type, kernel_data, kernel_step); - - Mat src(Size(full_width-offset_x, full_height-offset_y), stype, src_data, src_step); - Mat dst(Size(full_width, full_height), dtype, dst_data, dst_step); - Mat temp; - int src_channels = CV_MAT_CN(stype); - int dst_channels = CV_MAT_CN(dtype); - int ddepth = CV_MAT_DEPTH(dtype); - // crossCorr doesn't accept non-zero delta with multiple channels - if (src_channels != 1 && delta != 0) { - // The semantics of filter2D require that the delta be applied - // as floating-point math. So wee need an intermediate Mat - // with a float datatype. If the dest is already floats, - // we just use that. - int corrDepth = ddepth; - if ((ddepth == CV_32F || ddepth == CV_64F) && src_data != dst_data) { - temp = Mat(Size(full_width, full_height), dtype, dst_data, dst_step); - } else { - corrDepth = ddepth == CV_64F ? 
CV_64F : CV_32F; - temp.create(Size(full_width, full_height), CV_MAKETYPE(corrDepth, dst_channels)); - } - crossCorr(src, kernel, temp, src.size(), - CV_MAKETYPE(corrDepth, src_channels), - anchor, 0, borderType); - add(temp, delta, temp); - if (temp.data != dst_data) { - temp.convertTo(dst, dst.type()); - } - } else { - if (src_data != dst_data) - temp = Mat(Size(full_width, full_height), dtype, dst_data, dst_step); - else - temp.create(Size(full_width, full_height), dtype); - crossCorr(src, kernel, temp, src.size(), - CV_MAKETYPE(ddepth, src_channels), - anchor, delta, borderType); - if (temp.data != dst_data) - temp.copyTo(dst); - } - return true; -} - -static void ocvFilter2D(int stype, int dtype, int kernel_type, - uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernel_data, size_t kernel_step, - int kernel_width, int kernel_height, - int anchor_x, int anchor_y, - double delta, int borderType) -{ - int borderTypeValue = borderType & ~BORDER_ISOLATED; - Mat kernel = Mat(Size(kernel_width, kernel_height), kernel_type, kernel_data, kernel_step); - Ptr f = createLinearFilter(stype, dtype, kernel, Point(anchor_x, anchor_y), delta, - borderTypeValue); - Mat src(Size(width, height), stype, src_data, src_step); - Mat dst(Size(width, height), dtype, dst_data, dst_step); - f->apply(src, dst, Size(full_width, full_height), Point(offset_x, offset_y)); -} - -static bool replacementSepFilter(int stype, int dtype, int ktype, - uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, - int width, int height, int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernelx_data, int kernelx_len, - uchar * kernely_data, int kernely_len, - int anchor_x, int anchor_y, double delta, int borderType) -{ - cvhalFilter2D *ctx; - int res = cv_hal_sepFilterInit(&ctx, stype, dtype, ktype, - kernelx_data, kernelx_len, - kernely_data, kernely_len, - anchor_x, anchor_y, delta, borderType); - if (res != CV_HAL_ERROR_OK) - return false; - res = cv_hal_sepFilter(ctx, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - bool success = (res == CV_HAL_ERROR_OK); - res = cv_hal_sepFilterFree(ctx); - if (res != CV_HAL_ERROR_OK) - return false; - return success; -} - -static void ocvSepFilter(int stype, int dtype, int ktype, - uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, - int width, int height, int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernelx_data, int kernelx_len, - uchar * kernely_data, int kernely_len, - int anchor_x, int anchor_y, double delta, int borderType) -{ - Mat kernelX(Size(kernelx_len, 1), ktype, kernelx_data); - Mat kernelY(Size(kernely_len, 1), ktype, kernely_data); - Ptr f = createSeparableLinearFilter(stype, dtype, kernelX, kernelY, - Point(anchor_x, anchor_y), - delta, borderType & ~BORDER_ISOLATED); - Mat src(Size(width, height), stype, src_data, src_step); - Mat dst(Size(width, height), dtype, dst_data, dst_step); - f->apply(src, dst, Size(full_width, full_height), Point(offset_x, offset_y)); -}; - -//=================================================================== -// HAL functions -//=================================================================== - -namespace cv { -namespace hal { - - -CV_DEPRECATED Ptr Filter2D::create(uchar * , size_t , int , - int , int , - int , int , - int , int , - int , double , - int , int , - bool , bool ) { return 
Ptr(); } - -CV_DEPRECATED Ptr SepFilter2D::create(int , int , int , - uchar * , int , - uchar * , int , - int , int , - double , int ) { return Ptr(); } - - -void filter2D(int stype, int dtype, int kernel_type, - uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernel_data, size_t kernel_step, - int kernel_width, int kernel_height, - int anchor_x, int anchor_y, - double delta, int borderType, - bool isSubmatrix) -{ - bool res; - res = replacementFilter2D(stype, dtype, kernel_type, - src_data, src_step, - dst_data, dst_step, - width, height, - full_width, full_height, - offset_x, offset_y, - kernel_data, kernel_step, - kernel_width, kernel_height, - anchor_x, anchor_y, - delta, borderType, isSubmatrix); - if (res) - return; - - CV_IPP_RUN_FAST(ippFilter2D(stype, dtype, kernel_type, - src_data, src_step, - dst_data, dst_step, - width, height, - full_width, full_height, - offset_x, offset_y, - kernel_data, kernel_step, - kernel_width, kernel_height, - anchor_x, anchor_y, - delta, borderType, isSubmatrix)) - - res = dftFilter2D(stype, dtype, kernel_type, - src_data, src_step, - dst_data, dst_step, - full_width, full_height, - offset_x, offset_y, - kernel_data, kernel_step, - kernel_width, kernel_height, - anchor_x, anchor_y, - delta, borderType); - if (res) - return; - ocvFilter2D(stype, dtype, kernel_type, - src_data, src_step, - dst_data, dst_step, - width, height, - full_width, full_height, - offset_x, offset_y, - kernel_data, kernel_step, - kernel_width, kernel_height, - anchor_x, anchor_y, - delta, borderType); -} - -//--------------------------------------------------------------- - -void sepFilter2D(int stype, int dtype, int ktype, - uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, - int width, int height, int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernelx_data, int kernelx_len, - uchar * kernely_data, int kernely_len, - int anchor_x, int anchor_y, double delta, int borderType) -{ - - bool res = replacementSepFilter(stype, dtype, ktype, - src_data, src_step, dst_data, dst_step, - width, height, full_width, full_height, - offset_x, offset_y, - kernelx_data, kernelx_len, - kernely_data, kernely_len, - anchor_x, anchor_y, delta, borderType); - if (res) - return; - ocvSepFilter(stype, dtype, ktype, - src_data, src_step, dst_data, dst_step, - width, height, full_width, full_height, - offset_x, offset_y, - kernelx_data, kernelx_len, - kernely_data, kernely_len, - anchor_x, anchor_y, delta, borderType); -} - -} // cv::hal:: -} // cv:: - -//================================================================ -// Main interface -//================================================================ - -void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth, - InputArray _kernel, Point anchor0, - double delta, int borderType ) -{ - CV_INSTRUMENT_REGION(); - - CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2, - ocl_filter2D(_src, _dst, ddepth, _kernel, anchor0, delta, borderType)) - - Mat src = _src.getMat(), kernel = _kernel.getMat(); - - if( ddepth < 0 ) - ddepth = src.depth(); - - _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) ); - Mat dst = _dst.getMat(); - Point anchor = normalizeAnchor(anchor0, kernel.size()); - - Point ofs; - Size wsz(src.cols, src.rows); - if( (borderType & BORDER_ISOLATED) == 0 ) - src.locateROI( wsz, ofs ); - - hal::filter2D(src.type(), dst.type(), kernel.type(), - src.data, src.step, 
dst.data, dst.step, - dst.cols, dst.rows, wsz.width, wsz.height, ofs.x, ofs.y, - kernel.data, kernel.step, kernel.cols, kernel.rows, - anchor.x, anchor.y, - delta, borderType, src.isSubmatrix()); -} - -void cv::sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, - InputArray _kernelX, InputArray _kernelY, Point anchor, - double delta, int borderType ) -{ - CV_INSTRUMENT_REGION(); - - CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() > _kernelY.total() && (size_t)_src.cols() > _kernelX.total(), - ocl_sepFilter2D(_src, _dst, ddepth, _kernelX, _kernelY, anchor, delta, borderType)) - - Mat src = _src.getMat(), kernelX = _kernelX.getMat(), kernelY = _kernelY.getMat(); - - if( ddepth < 0 ) - ddepth = src.depth(); - - _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) ); - Mat dst = _dst.getMat(); - - Point ofs; - Size wsz(src.cols, src.rows); - if( (borderType & BORDER_ISOLATED) == 0 ) - src.locateROI( wsz, ofs ); - - CV_Assert( kernelX.type() == kernelY.type() && - (kernelX.cols == 1 || kernelX.rows == 1) && - (kernelY.cols == 1 || kernelY.rows == 1) ); - - Mat contKernelX = kernelX.isContinuous() ? kernelX : kernelX.clone(); - Mat contKernelY = kernelY.isContinuous() ? kernelY : kernelY.clone(); - - hal::sepFilter2D(src.type(), dst.type(), kernelX.type(), - src.data, src.step, dst.data, dst.step, - dst.cols, dst.rows, wsz.width, wsz.height, ofs.x, ofs.y, - contKernelX.data, kernelX.cols + kernelX.rows - 1, - contKernelY.data, kernelY.cols + kernelY.rows - 1, - anchor.x, anchor.y, delta, borderType & ~BORDER_ISOLATED); -} - - -CV_IMPL void -cvFilter2D( const CvArr* srcarr, CvArr* dstarr, const CvMat* _kernel, CvPoint anchor ) -{ - cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); - cv::Mat kernel = cv::cvarrToMat(_kernel); - - CV_Assert( src.size() == dst.size() && src.channels() == dst.channels() ); - - cv::filter2D( src, dst, dst.depth(), kernel, anchor, 0, cv::BORDER_REPLICATE ); -} - -/* End of file. */ +CV_CPU_OPTIMIZATION_NAMESPACE_END +} // namespace From 8546ac3ce6bd955de51c743f9ca87b0f27a15f12 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sat, 9 Mar 2019 12:57:48 +0000 Subject: [PATCH 4/9] imgproc: get rid of filter.avx2.cpp --- modules/imgproc/src/filter.avx2.cpp | 197 ---------------------------- modules/imgproc/src/filter.hpp | 6 - modules/imgproc/src/filter.simd.hpp | 115 +++++++++++++--- 3 files changed, 99 insertions(+), 219 deletions(-) delete mode 100644 modules/imgproc/src/filter.avx2.cpp diff --git a/modules/imgproc/src/filter.avx2.cpp b/modules/imgproc/src/filter.avx2.cpp deleted file mode 100644 index e9ced20e36..0000000000 --- a/modules/imgproc/src/filter.avx2.cpp +++ /dev/null @@ -1,197 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. 
-// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#include "precomp.hpp" -#include "filter.hpp" - -namespace cv -{ - -int RowVec_32f_AVX(const float* src0, const float* _kx, float* dst, int width, int cn, int _ksize) -{ - int i = 0, k; - for (; i <= width - 8; i += 8) - { - const float* src = src0 + i; - __m256 f, x0; - __m256 s0 = _mm256_set1_ps(0.0f); - for (k = 0; k < _ksize; k++, src += cn) - { - f = _mm256_set1_ps(_kx[k]); - x0 = _mm256_loadu_ps(src); -#if CV_FMA3 - s0 = _mm256_fmadd_ps(x0, f, s0); -#else - s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); -#endif - } - _mm256_storeu_ps(dst + i, s0); - } - _mm256_zeroupper(); - return i; -} - -int SymmColumnVec_32f_Symm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2) -{ - int i = 0, k; - const float *S, *S2; - const __m128 d4 = _mm_set1_ps(delta); - const __m256 d8 = _mm256_set1_ps(delta); - - for( ; i <= width - 16; i += 16 ) - { - __m256 f = _mm256_set1_ps(ky[0]); - __m256 s0, s1; - __m256 x0; - S = src[0] + i; - s0 = _mm256_loadu_ps(S); -#if CV_FMA3 - s0 = _mm256_fmadd_ps(s0, f, d8); -#else - s0 = _mm256_add_ps(_mm256_mul_ps(s0, f), d8); -#endif - s1 = _mm256_loadu_ps(S+8); -#if CV_FMA3 - s1 = _mm256_fmadd_ps(s1, f, d8); -#else - s1 = _mm256_add_ps(_mm256_mul_ps(s1, f), d8); -#endif - - for( k = 1; k <= ksize2; k++ ) - { - S = src[k] + i; - S2 = src[-k] + i; - f = _mm256_set1_ps(ky[k]); - x0 = _mm256_add_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2)); -#if CV_FMA3 - s0 = _mm256_fmadd_ps(x0, f, s0); -#else - s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); -#endif - x0 = _mm256_add_ps(_mm256_loadu_ps(S+8), _mm256_loadu_ps(S2+8)); -#if CV_FMA3 - s1 = _mm256_fmadd_ps(x0, f, s1); -#else - s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f)); -#endif - } - - _mm256_storeu_ps(dst + i, s0); - _mm256_storeu_ps(dst + i + 8, s1); - } - - for( ; i <= width - 4; i += 4 ) - { - __m128 f = _mm_set1_ps(ky[0]); - __m128 x0, s0 = _mm_load_ps(src[0] + i); - s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4); - - for( k = 1; k <= ksize2; k++ ) - { - f = _mm_set1_ps(ky[k]); - x0 = _mm_add_ps(_mm_load_ps(src[k]+i), 
_mm_load_ps(src[-k] + i)); - s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); - } - - _mm_storeu_ps(dst + i, s0); - } - - _mm256_zeroupper(); - return i; -} - -int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2) -{ - int i = 0, k; - const float *S2; - const __m128 d4 = _mm_set1_ps(delta); - const __m256 d8 = _mm256_set1_ps(delta); - - for (; i <= width - 16; i += 16) - { - __m256 f, s0 = d8, s1 = d8; - __m256 x0; - - for (k = 1; k <= ksize2; k++) - { - const float *S = src[k] + i; - S2 = src[-k] + i; - f = _mm256_set1_ps(ky[k]); - x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2)); -#if CV_FMA3 - s0 = _mm256_fmadd_ps(x0, f, s0); -#else - s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); -#endif - x0 = _mm256_sub_ps(_mm256_loadu_ps(S + 8), _mm256_loadu_ps(S2 + 8)); -#if CV_FMA3 - s1 = _mm256_fmadd_ps(x0, f, s1); -#else - s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f)); -#endif - } - - _mm256_storeu_ps(dst + i, s0); - _mm256_storeu_ps(dst + i + 8, s1); - } - - for (; i <= width - 4; i += 4) - { - __m128 f, x0, s0 = d4; - - for (k = 1; k <= ksize2; k++) - { - f = _mm_set1_ps(ky[k]); - x0 = _mm_sub_ps(_mm_load_ps(src[k] + i), _mm_load_ps(src[-k] + i)); - s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); - } - - _mm_storeu_ps(dst + i, s0); - } - - _mm256_zeroupper(); - return i; -} - -} - -/* End of file. */ diff --git a/modules/imgproc/src/filter.hpp b/modules/imgproc/src/filter.hpp index 198c8c336c..7b792d1935 100644 --- a/modules/imgproc/src/filter.hpp +++ b/modules/imgproc/src/filter.hpp @@ -45,12 +45,6 @@ namespace cv { -#if CV_TRY_AVX2 - int RowVec_32f_AVX(const float* src0, const float* _kx, float* dst, int width, int cn, int _ksize); - int SymmColumnVec_32f_Symm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2); - int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2); -#endif - #ifdef HAVE_OPENCL bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, InputArray _kernelX, InputArray _kernelY, Point anchor, diff --git a/modules/imgproc/src/filter.simd.hpp b/modules/imgproc/src/filter.simd.hpp index 48675152fa..f09cd1ec1d 100644 --- a/modules/imgproc/src/filter.simd.hpp +++ b/modules/imgproc/src/filter.simd.hpp @@ -1507,7 +1507,6 @@ struct RowVec_32f { RowVec_32f() { - haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; #if defined USE_IPP_SEP_FILTERS bufsz = -1; #endif @@ -1516,7 +1515,6 @@ struct RowVec_32f RowVec_32f( const Mat& _kernel ) { kernel = _kernel; - haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; #if defined USE_IPP_SEP_FILTERS bufsz = -1; #endif @@ -1543,9 +1541,24 @@ struct RowVec_32f int i = 0, k; width *= cn; -#if CV_TRY_AVX2 - if (haveAVX2) - return RowVec_32f_AVX(src0, _kx, dst, width, cn, _ksize); +#if CV_AVX + for (; i <= width - 8; i += 8) + { + const float* src = src0 + i; + __m256 f, x0; + __m256 s0 = _mm256_set1_ps(0.0f); + for (k = 0; k < _ksize; k++, src += cn) + { + f = _mm256_set1_ps(_kx[k]); + x0 = _mm256_loadu_ps(src); +#if CV_FMA3 + s0 = _mm256_fmadd_ps(x0, f, s0); +#else + s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); +#endif + } + _mm256_storeu_ps(dst + i, s0); + } #endif v_float32 k0 = vx_setall_f32(_kx[0]); for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) @@ -1599,7 +1612,6 @@ struct RowVec_32f } Mat kernel; - bool haveAVX2; #if defined USE_IPP_SEP_FILTERS private: mutable int bufsz; @@ -1754,7 +1766,6 @@ struct SymmColumnVec_32f { SymmColumnVec_32f() { symmetryType=0; - haveAVX2 = 
CV_CPU_HAS_SUPPORT_AVX2; delta = 0; } SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta) @@ -1762,7 +1773,6 @@ struct SymmColumnVec_32f symmetryType = _symmetryType; kernel = _kernel; delta = (float)_delta; - haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); } @@ -1780,9 +1790,53 @@ struct SymmColumnVec_32f if( symmetrical ) { -#if CV_TRY_AVX2 - if (haveAVX2) - return SymmColumnVec_32f_Symm_AVX(src, ky, dst, delta, width, ksize2); +#if CV_AVX + { + const float *S, *S2; + const __m256 d8 = _mm256_set1_ps(delta); + + for( ; i <= width - 16; i += 16 ) + { + __m256 f = _mm256_set1_ps(ky[0]); + __m256 s0, s1; + __m256 x0; + S = src[0] + i; + s0 = _mm256_loadu_ps(S); +#if CV_FMA3 + s0 = _mm256_fmadd_ps(s0, f, d8); +#else + s0 = _mm256_add_ps(_mm256_mul_ps(s0, f), d8); +#endif + s1 = _mm256_loadu_ps(S+8); +#if CV_FMA3 + s1 = _mm256_fmadd_ps(s1, f, d8); +#else + s1 = _mm256_add_ps(_mm256_mul_ps(s1, f), d8); +#endif + + for( k = 1; k <= ksize2; k++ ) + { + S = src[k] + i; + S2 = src[-k] + i; + f = _mm256_set1_ps(ky[k]); + x0 = _mm256_add_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2)); +#if CV_FMA3 + s0 = _mm256_fmadd_ps(x0, f, s0); +#else + s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); +#endif + x0 = _mm256_add_ps(_mm256_loadu_ps(S+8), _mm256_loadu_ps(S2+8)); +#if CV_FMA3 + s1 = _mm256_fmadd_ps(x0, f, s1); +#else + s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f)); +#endif + } + + _mm256_storeu_ps(dst + i, s0); + _mm256_storeu_ps(dst + i + 8, s1); + } + } #endif const v_float32 d4 = vx_setall_f32(delta); const v_float32 k0 = vx_setall_f32(ky[0]); @@ -1830,11 +1884,41 @@ struct SymmColumnVec_32f } else { -#if CV_TRY_AVX2 - if (haveAVX2) - return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2); -#endif CV_DbgAssert(ksize2 > 0); +#if CV_AVX + { + const float *S2; + const __m256 d8 = _mm256_set1_ps(delta); + + for (; i <= width - 16; i += 16) + { + __m256 f, s0 = d8, s1 = d8; + __m256 x0; + + for (k = 1; k <= ksize2; k++) + { + const float *S = src[k] + i; + S2 = src[-k] + i; + f = _mm256_set1_ps(ky[k]); + x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2)); +#if CV_FMA3 + s0 = _mm256_fmadd_ps(x0, f, s0); +#else + s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); +#endif + x0 = _mm256_sub_ps(_mm256_loadu_ps(S + 8), _mm256_loadu_ps(S2 + 8)); +#if CV_FMA3 + s1 = _mm256_fmadd_ps(x0, f, s1); +#else + s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f)); +#endif + } + + _mm256_storeu_ps(dst + i, s0); + _mm256_storeu_ps(dst + i + 8, s1); + } + } +#endif const v_float32 d4 = vx_setall_f32(delta); const v_float32 k1 = vx_setall_f32(ky[1]); for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) @@ -1885,7 +1969,6 @@ struct SymmColumnVec_32f int symmetryType; float delta; Mat kernel; - bool haveAVX2; }; From 6ec08f268f90c39747d9ee0126821761e9b9ad31 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sat, 9 Mar 2019 18:55:42 +0000 Subject: [PATCH 5/9] imgproc: dispatch medianBlur --- modules/imgproc/CMakeLists.txt | 1 + modules/imgproc/src/median_blur.dispatch.cpp | 937 +------------------ modules/imgproc/src/median_blur.simd.hpp | 288 +----- 3 files changed, 29 insertions(+), 1197 deletions(-) diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt index d3afe151bd..c149edb9b3 100644 --- a/modules/imgproc/CMakeLists.txt +++ b/modules/imgproc/CMakeLists.txt @@ -4,4 +4,5 @@ ocv_add_dispatched_file(filter SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_hsv SSE2 SSE4_1 AVX2) 
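# (Each ocv_add_dispatched_file(<name> <ISA list>) entry in this list tells the
# OpenCV build to compile <name>.simd.hpp once per listed ISA plus the baseline,
# and to generate <name>.simd_declarations.hpp so that the matching
# <name>.dispatch.cpp can select the fastest compiled variant at runtime
# through CV_CPU_DISPATCH.)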
ocv_add_dispatched_file(color_rgb SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_yuv SSE2 SSE4_1 AVX2) +ocv_add_dispatched_file(median_blur SSE2 SSE4_1 AVX2) ocv_define_module(imgproc opencv_core WRAP java python js) diff --git a/modules/imgproc/src/median_blur.dispatch.cpp b/modules/imgproc/src/median_blur.dispatch.cpp index c98cd9215a..d993fbad5b 100644 --- a/modules/imgproc/src/median_blur.dispatch.cpp +++ b/modules/imgproc/src/median_blur.dispatch.cpp @@ -50,895 +50,10 @@ #include "opencv2/core/openvx/ovx_defs.hpp" -/* - * This file includes the code, contributed by Simon Perreault - * (the function icvMedianBlur_8u_O1) - * - * Constant-time median filtering -- http://nomis80.org/ctmf.html - * Copyright (C) 2006 Simon Perreault - * - * Contact: - * Laboratoire de vision et systemes numeriques - * Pavillon Adrien-Pouliot - * Universite Laval - * Sainte-Foy, Quebec, Canada - * G1K 7P4 - * - * perreaul@gel.ulaval.ca - */ +#include "median_blur.simd.hpp" +#include "median_blur.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content -/****************************************************************************************\ - Median Filter -\****************************************************************************************/ - -namespace cv -{ - -static void -medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) -{ - typedef ushort HT; - - /** - * This structure represents a two-tier histogram. The first tier (known as the - * "coarse" level) is 4 bit wide and the second tier (known as the "fine" level) - * is 8 bit wide. Pixels inserted in the fine level also get inserted into the - * coarse bucket designated by the 4 MSBs of the fine bucket value. - * - * The structure is aligned on 16 bits, which is a prerequisite for SIMD - * instructions. Each bucket is 16 bit wide, which means that extra care must be - * taken to prevent overflow. - */ - typedef struct - { - HT coarse[16]; - HT fine[16][16]; - } Histogram; - -/** - * HOP is short for Histogram OPeration. This macro makes an operation \a op on - * histogram \a h for pixel value \a x. It takes care of handling both levels. 
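- * Both tiers are updated in lock step: bucket x>>4 at the coarse level and
- * flat bucket x at the fine level (i.e. row x>>4, column x&15), which is what
- * lets the median search below first narrow the range over 16 coarse counts
- * and then finish inside a single 16-entry fine row.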
- */
-#define HOP(h,x,op) \
-    h.coarse[x>>4] op, \
-    *((HT*)h.fine + x) op
-
-#define COP(c,j,x,op) \
-    h_coarse[ 16*(n*c+j) + (x>>4) ] op, \
-    h_fine[ 16 * (n*(16*c+(x>>4)) + j) + (x & 0xF) ] op
-
-    int cn = _dst.channels(), m = _dst.rows, r = (ksize-1)/2;
-    CV_Assert(cn > 0 && cn <= 4);
-    size_t sstep = _src.step, dstep = _dst.step;
-
-    int STRIPE_SIZE = std::min( _dst.cols, 512/cn );
-
-#if defined(CV_SIMD_WIDTH) && CV_SIMD_WIDTH >= 16
-# define CV_ALIGNMENT CV_SIMD_WIDTH
-#else
-# define CV_ALIGNMENT 16
-#endif
-
-    std::vector<HT> _h_coarse(1 * 16 * (STRIPE_SIZE + 2*r) * cn + CV_ALIGNMENT);
-    std::vector<HT> _h_fine(16 * 16 * (STRIPE_SIZE + 2*r) * cn + CV_ALIGNMENT);
-    HT* h_coarse = alignPtr(&_h_coarse[0], CV_ALIGNMENT);
-    HT* h_fine = alignPtr(&_h_fine[0], CV_ALIGNMENT);
-
-    for( int x = 0; x < _dst.cols; x += STRIPE_SIZE )
-    {
-        int i, j, k, c, n = std::min(_dst.cols - x, STRIPE_SIZE) + r*2;
-        const uchar* src = _src.ptr() + x*cn;
-        uchar* dst = _dst.ptr() + (x - r)*cn;
-
-        memset( h_coarse, 0, 16*n*cn*sizeof(h_coarse[0]) );
-        memset( h_fine, 0, 16*16*n*cn*sizeof(h_fine[0]) );
-
-        // First row initialization
-        for( c = 0; c < cn; c++ )
-        {
-            for( j = 0; j < n; j++ )
-                COP( c, j, src[cn*j+c], += (HT)(r+2) );
-
-            for( i = 1; i < r; i++ )
-            {
-                const uchar* p = src + sstep*std::min(i, m-1);
-                for ( j = 0; j < n; j++ )
-                    COP( c, j, p[cn*j+c], ++ );
-            }
-        }
-
-        for( i = 0; i < m; i++ )
-        {
-            const uchar* p0 = src + sstep * std::max( 0, i-r-1 );
-            const uchar* p1 = src + sstep * std::min( m-1, i+r );
-
-            for( c = 0; c < cn; c++ )
-            {
-                Histogram CV_DECL_ALIGNED(CV_ALIGNMENT) H;
-                HT CV_DECL_ALIGNED(CV_ALIGNMENT) luc[16];
-
-                memset(&H, 0, sizeof(H));
-                memset(luc, 0, sizeof(luc));
-
-                // Update column histograms for the entire row.
-                for( j = 0; j < n; j++ )
-                {
-                    COP( c, j, p0[j*cn + c], -- );
-                    COP( c, j, p1[j*cn + c], ++ );
-                }
-
-                // First column initialization
-                for (k = 0; k < 16; ++k)
-                {
-#if CV_SIMD256
-                    v_store(H.fine[k], v_mul_wrap(v256_load(h_fine + 16 * n*(16 * c + k)), v256_setall_u16(2 * r + 1)) + v256_load(H.fine[k]));
-#elif CV_SIMD128
-                    v_store(H.fine[k], v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k)), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k]));
-                    v_store(H.fine[k] + 8, v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k] + 8));
-#else
-                    for (int ind = 0; ind < 16; ++ind)
-                        H.fine[k][ind] = (HT)(H.fine[k][ind] + (2 * r + 1) * h_fine[16 * n*(16 * c + k) + ind]);
-#endif
-                }
-
-#if CV_SIMD256
-                v_uint16x16 v_coarse = v256_load(H.coarse);
-#elif CV_SIMD128
-                v_uint16x8 v_coarsel = v_load(H.coarse);
-                v_uint16x8 v_coarseh = v_load(H.coarse + 8);
-#endif
-                HT* px = h_coarse + 16 * n*c;
-                for( j = 0; j < 2*r; ++j, px += 16 )
-                {
-#if CV_SIMD256
-                    v_coarse += v256_load(px);
-#elif CV_SIMD128
-                    v_coarsel += v_load(px);
-                    v_coarseh += v_load(px + 8);
-#else
-                    for (int ind = 0; ind < 16; ++ind)
-                        H.coarse[ind] += px[ind];
-#endif
-                }
-
-                for( j = r; j < n-r; j++ )
-                {
-                    int t = 2*r*r + 2*r, b, sum = 0;
-                    HT* segment;
-
-                    px = h_coarse + 16 * (n*c + std::min(j + r, n - 1));
-#if CV_SIMD256
-                    v_coarse += v256_load(px);
-                    v_store(H.coarse, v_coarse);
-#elif CV_SIMD128
-                    v_coarsel += v_load(px);
-                    v_coarseh += v_load(px + 8);
-                    v_store(H.coarse, v_coarsel);
-                    v_store(H.coarse + 8, v_coarseh);
-#else
-                    for (int ind = 0; ind < 16; ++ind)
-                        H.coarse[ind] += px[ind];
-#endif
-
-                    // Find median at coarse level
-                    for ( k = 0; k < 16 ; ++k )
-                    {
-                        sum += H.coarse[k];
-                        if ( sum > t )
-                        {
-                            sum -= H.coarse[k];
-                            break;
-                        }
-                    }
-
CV_Assert( k < 16 ); - - /* Update corresponding histogram segment */ -#if CV_SIMD256 - v_uint16x16 v_fine; -#elif CV_SIMD128 - v_uint16x8 v_finel; - v_uint16x8 v_fineh; -#endif - if ( luc[k] <= j-r ) - { -#if CV_SIMD256 - v_fine = v256_setzero_u16(); -#elif CV_SIMD128 - v_finel = v_setzero_u16(); - v_fineh = v_setzero_u16(); -#else - memset(&H.fine[k], 0, 16 * sizeof(HT)); -#endif - px = h_fine + 16 * (n*(16 * c + k) + j - r); - for (luc[k] = HT(j - r); luc[k] < MIN(j + r + 1, n); ++luc[k], px += 16) - { -#if CV_SIMD256 - v_fine += v256_load(px); -#elif CV_SIMD128 - v_finel += v_load(px); - v_fineh += v_load(px + 8); -#else - for (int ind = 0; ind < 16; ++ind) - H.fine[k][ind] += px[ind]; -#endif - } - - if ( luc[k] < j+r+1 ) - { - px = h_fine + 16 * (n*(16 * c + k) + (n - 1)); -#if CV_SIMD256 - v_fine += v_mul_wrap(v256_load(px), v256_setall_u16(j + r + 1 - n)); -#elif CV_SIMD128 - v_finel += v_mul_wrap(v_load(px), v_setall_u16((ushort)(j + r + 1 - n))); - v_fineh += v_mul_wrap(v_load(px + 8), v_setall_u16((ushort)(j + r + 1 - n))); -#else - for (int ind = 0; ind < 16; ++ind) - H.fine[k][ind] = (HT)(H.fine[k][ind] + (j + r + 1 - n) * px[ind]); -#endif - luc[k] = (HT)(j+r+1); - } - } - else - { -#if CV_SIMD256 - v_fine = v256_load(H.fine[k]); -#elif CV_SIMD128 - v_finel = v_load(H.fine[k]); - v_fineh = v_load(H.fine[k] + 8); -#endif - px = h_fine + 16*n*(16 * c + k); - for ( ; luc[k] < j+r+1; ++luc[k] ) - { -#if CV_SIMD256 - v_fine = v_fine + v256_load(px + 16 * MIN(luc[k], n - 1)) - v256_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0)); -#elif CV_SIMD128 - v_finel = v_finel + v_load(px + 16 * MIN(luc[k], n - 1) ) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0)); - v_fineh = v_fineh + v_load(px + 16 * MIN(luc[k], n - 1) + 8) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0) + 8); -#else - for (int ind = 0; ind < 16; ++ind) - H.fine[k][ind] += px[16 * MIN(luc[k], n - 1) + ind] - px[16 * MAX(luc[k] - 2 * r - 1, 0) + ind]; -#endif - } - } - - px = h_coarse + 16 * (n*c + MAX(j - r, 0)); -#if CV_SIMD256 - v_store(H.fine[k], v_fine); - v_coarse -= v256_load(px); -#elif CV_SIMD128 - v_store(H.fine[k], v_finel); - v_store(H.fine[k] + 8, v_fineh); - v_coarsel -= v_load(px); - v_coarseh -= v_load(px + 8); -#else - for (int ind = 0; ind < 16; ++ind) - H.coarse[ind] -= px[ind]; -#endif - - /* Find median in segment */ - segment = H.fine[k]; - for ( b = 0; b < 16 ; b++ ) - { - sum += segment[b]; - if ( sum > t ) - { - dst[dstep*i+cn*j+c] = (uchar)(16*k + b); - break; - } - } - CV_Assert( b < 16 ); - } - } - } -#if CV_SIMD - vx_cleanup(); -#endif - } - -#undef HOP -#undef COP -} - -static void -medianBlur_8u_Om( const Mat& _src, Mat& _dst, int m ) -{ - #define N 16 - int zone0[4][N]; - int zone1[4][N*N]; - int x, y; - int n2 = m*m/2; - Size size = _dst.size(); - const uchar* src = _src.ptr(); - uchar* dst = _dst.ptr(); - int src_step = (int)_src.step, dst_step = (int)_dst.step; - int cn = _src.channels(); - const uchar* src_max = src + size.height*src_step; - CV_Assert(cn > 0 && cn <= 4); - - #define UPDATE_ACC01( pix, cn, op ) \ - { \ - int p = (pix); \ - zone1[cn][p] op; \ - zone0[cn][p >> 4] op; \ - } - - //CV_Assert( size.height >= nx && size.width >= nx ); - for( x = 0; x < size.width; x++, src += cn, dst += cn ) - { - uchar* dst_cur = dst; - const uchar* src_top = src; - const uchar* src_bottom = src; - int k, c; - int src_step1 = src_step, dst_step1 = dst_step; - - if( x % 2 != 0 ) - { - src_bottom = src_top += src_step*(size.height-1); - dst_cur += dst_step*(size.height-1); - src_step1 = -src_step1; - 
dst_step1 = -dst_step1; - } - - // init accumulator - memset( zone0, 0, sizeof(zone0[0])*cn ); - memset( zone1, 0, sizeof(zone1[0])*cn ); - - for( y = 0; y <= m/2; y++ ) - { - for( c = 0; c < cn; c++ ) - { - if( y > 0 ) - { - for( k = 0; k < m*cn; k += cn ) - UPDATE_ACC01( src_bottom[k+c], c, ++ ); - } - else - { - for( k = 0; k < m*cn; k += cn ) - UPDATE_ACC01( src_bottom[k+c], c, += m/2+1 ); - } - } - - if( (src_step1 > 0 && y < size.height-1) || - (src_step1 < 0 && size.height-y-1 > 0) ) - src_bottom += src_step1; - } - - for( y = 0; y < size.height; y++, dst_cur += dst_step1 ) - { - // find median - for( c = 0; c < cn; c++ ) - { - int s = 0; - for( k = 0; ; k++ ) - { - int t = s + zone0[c][k]; - if( t > n2 ) break; - s = t; - } - - for( k *= N; ;k++ ) - { - s += zone1[c][k]; - if( s > n2 ) break; - } - - dst_cur[c] = (uchar)k; - } - - if( y+1 == size.height ) - break; - - if( cn == 1 ) - { - for( k = 0; k < m; k++ ) - { - int p = src_top[k]; - int q = src_bottom[k]; - zone1[0][p]--; - zone0[0][p>>4]--; - zone1[0][q]++; - zone0[0][q>>4]++; - } - } - else if( cn == 3 ) - { - for( k = 0; k < m*3; k += 3 ) - { - UPDATE_ACC01( src_top[k], 0, -- ); - UPDATE_ACC01( src_top[k+1], 1, -- ); - UPDATE_ACC01( src_top[k+2], 2, -- ); - - UPDATE_ACC01( src_bottom[k], 0, ++ ); - UPDATE_ACC01( src_bottom[k+1], 1, ++ ); - UPDATE_ACC01( src_bottom[k+2], 2, ++ ); - } - } - else - { - assert( cn == 4 ); - for( k = 0; k < m*4; k += 4 ) - { - UPDATE_ACC01( src_top[k], 0, -- ); - UPDATE_ACC01( src_top[k+1], 1, -- ); - UPDATE_ACC01( src_top[k+2], 2, -- ); - UPDATE_ACC01( src_top[k+3], 3, -- ); - - UPDATE_ACC01( src_bottom[k], 0, ++ ); - UPDATE_ACC01( src_bottom[k+1], 1, ++ ); - UPDATE_ACC01( src_bottom[k+2], 2, ++ ); - UPDATE_ACC01( src_bottom[k+3], 3, ++ ); - } - } - - if( (src_step1 > 0 && src_bottom + src_step1 < src_max) || - (src_step1 < 0 && src_bottom + src_step1 >= src) ) - src_bottom += src_step1; - - if( y >= m/2 ) - src_top += src_step1; - } - } -#undef N -#undef UPDATE_ACC -} - - -namespace { - -struct MinMax8u -{ - typedef uchar value_type; - typedef int arg_type; - enum { SIZE = 1 }; - arg_type load(const uchar* ptr) { return *ptr; } - void store(uchar* ptr, arg_type val) { *ptr = (uchar)val; } - void operator()(arg_type& a, arg_type& b) const - { - int t = CV_FAST_CAST_8U(a - b); - b += t; a -= t; - } -}; - -struct MinMax16u -{ - typedef ushort value_type; - typedef int arg_type; - enum { SIZE = 1 }; - arg_type load(const ushort* ptr) { return *ptr; } - void store(ushort* ptr, arg_type val) { *ptr = (ushort)val; } - void operator()(arg_type& a, arg_type& b) const - { - arg_type t = a; - a = std::min(a, b); - b = std::max(b, t); - } -}; - -struct MinMax16s -{ - typedef short value_type; - typedef int arg_type; - enum { SIZE = 1 }; - arg_type load(const short* ptr) { return *ptr; } - void store(short* ptr, arg_type val) { *ptr = (short)val; } - void operator()(arg_type& a, arg_type& b) const - { - arg_type t = a; - a = std::min(a, b); - b = std::max(b, t); - } -}; - -struct MinMax32f -{ - typedef float value_type; - typedef float arg_type; - enum { SIZE = 1 }; - arg_type load(const float* ptr) { return *ptr; } - void store(float* ptr, arg_type val) { *ptr = val; } - void operator()(arg_type& a, arg_type& b) const - { - arg_type t = a; - a = std::min(a, b); - b = std::max(b, t); - } -}; - -#if CV_SIMD - -struct MinMaxVec8u -{ - typedef uchar value_type; - typedef v_uint8x16 arg_type; - enum { SIZE = v_uint8x16::nlanes }; - arg_type load(const uchar* ptr) { return v_load(ptr); } - void store(uchar* 
ptr, const arg_type &val) { v_store(ptr, val); } - void operator()(arg_type& a, arg_type& b) const - { - arg_type t = a; - a = v_min(a, b); - b = v_max(b, t); - } -#if CV_SIMD_WIDTH > 16 - typedef v_uint8 warg_type; - enum { WSIZE = v_uint8::nlanes }; - warg_type wload(const uchar* ptr) { return vx_load(ptr); } - void store(uchar* ptr, const warg_type &val) { v_store(ptr, val); } - void operator()(warg_type& a, warg_type& b) const - { - warg_type t = a; - a = v_min(a, b); - b = v_max(b, t); - } -#endif -}; - - -struct MinMaxVec16u -{ - typedef ushort value_type; - typedef v_uint16x8 arg_type; - enum { SIZE = v_uint16x8::nlanes }; - arg_type load(const ushort* ptr) { return v_load(ptr); } - void store(ushort* ptr, const arg_type &val) { v_store(ptr, val); } - void operator()(arg_type& a, arg_type& b) const - { - arg_type t = a; - a = v_min(a, b); - b = v_max(b, t); - } -#if CV_SIMD_WIDTH > 16 - typedef v_uint16 warg_type; - enum { WSIZE = v_uint16::nlanes }; - warg_type wload(const ushort* ptr) { return vx_load(ptr); } - void store(ushort* ptr, const warg_type &val) { v_store(ptr, val); } - void operator()(warg_type& a, warg_type& b) const - { - warg_type t = a; - a = v_min(a, b); - b = v_max(b, t); - } -#endif -}; - - -struct MinMaxVec16s -{ - typedef short value_type; - typedef v_int16x8 arg_type; - enum { SIZE = v_int16x8::nlanes }; - arg_type load(const short* ptr) { return v_load(ptr); } - void store(short* ptr, const arg_type &val) { v_store(ptr, val); } - void operator()(arg_type& a, arg_type& b) const - { - arg_type t = a; - a = v_min(a, b); - b = v_max(b, t); - } -#if CV_SIMD_WIDTH > 16 - typedef v_int16 warg_type; - enum { WSIZE = v_int16::nlanes }; - warg_type wload(const short* ptr) { return vx_load(ptr); } - void store(short* ptr, const warg_type &val) { v_store(ptr, val); } - void operator()(warg_type& a, warg_type& b) const - { - warg_type t = a; - a = v_min(a, b); - b = v_max(b, t); - } -#endif -}; - - -struct MinMaxVec32f -{ - typedef float value_type; - typedef v_float32x4 arg_type; - enum { SIZE = v_float32x4::nlanes }; - arg_type load(const float* ptr) { return v_load(ptr); } - void store(float* ptr, const arg_type &val) { v_store(ptr, val); } - void operator()(arg_type& a, arg_type& b) const - { - arg_type t = a; - a = v_min(a, b); - b = v_max(b, t); - } -#if CV_SIMD_WIDTH > 16 - typedef v_float32 warg_type; - enum { WSIZE = v_float32::nlanes }; - warg_type wload(const float* ptr) { return vx_load(ptr); } - void store(float* ptr, const warg_type &val) { v_store(ptr, val); } - void operator()(warg_type& a, warg_type& b) const - { - warg_type t = a; - a = v_min(a, b); - b = v_max(b, t); - } -#endif -}; - -#else - -typedef MinMax8u MinMaxVec8u; -typedef MinMax16u MinMaxVec16u; -typedef MinMax16s MinMaxVec16s; -typedef MinMax32f MinMaxVec32f; - -#endif - -template -static void -medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) -{ - typedef typename Op::value_type T; - typedef typename Op::arg_type WT; - typedef typename VecOp::arg_type VT; -#if CV_SIMD_WIDTH > 16 - typedef typename VecOp::warg_type WVT; -#endif - - const T* src = _src.ptr(); - T* dst = _dst.ptr(); - int sstep = (int)(_src.step/sizeof(T)); - int dstep = (int)(_dst.step/sizeof(T)); - Size size = _dst.size(); - int i, j, k, cn = _src.channels(); - Op op; - VecOp vop; - - if( m == 3 ) - { - if( size.width == 1 || size.height == 1 ) - { - int len = size.width + size.height - 1; - int sdelta = size.height == 1 ? cn : sstep; - int sdelta0 = size.height == 1 ? 0 : sstep - cn; - int ddelta = size.height == 1 ? 
cn : dstep; - - for( i = 0; i < len; i++, src += sdelta0, dst += ddelta ) - for( j = 0; j < cn; j++, src++ ) - { - WT p0 = src[i > 0 ? -sdelta : 0]; - WT p1 = src[0]; - WT p2 = src[i < len - 1 ? sdelta : 0]; - - op(p0, p1); op(p1, p2); op(p0, p1); - dst[j] = (T)p1; - } - return; - } - - size.width *= cn; - for( i = 0; i < size.height; i++, dst += dstep ) - { - const T* row0 = src + std::max(i - 1, 0)*sstep; - const T* row1 = src + i*sstep; - const T* row2 = src + std::min(i + 1, size.height-1)*sstep; - int limit = cn; - - for(j = 0;; ) - { - for( ; j < limit; j++ ) - { - int j0 = j >= cn ? j - cn : j; - int j2 = j < size.width - cn ? j + cn : j; - WT p0 = row0[j0], p1 = row0[j], p2 = row0[j2]; - WT p3 = row1[j0], p4 = row1[j], p5 = row1[j2]; - WT p6 = row2[j0], p7 = row2[j], p8 = row2[j2]; - - op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1); - op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5); - op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7); - op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7); - op(p4, p2); op(p6, p4); op(p4, p2); - dst[j] = (T)p4; - } - - if( limit == size.width ) - break; - -#if CV_SIMD_WIDTH > 16 - for( ; j <= size.width - VecOp::WSIZE - cn; j += VecOp::WSIZE ) - { - WVT p0 = vop.wload(row0+j-cn), p1 = vop.wload(row0+j), p2 = vop.wload(row0+j+cn); - WVT p3 = vop.wload(row1+j-cn), p4 = vop.wload(row1+j), p5 = vop.wload(row1+j+cn); - WVT p6 = vop.wload(row2+j-cn), p7 = vop.wload(row2+j), p8 = vop.wload(row2+j+cn); - - vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1); - vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5); - vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7); - vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7); - vop(p4, p2); vop(p6, p4); vop(p4, p2); - vop.store(dst+j, p4); - } -#endif - for( ; j <= size.width - VecOp::SIZE - cn; j += VecOp::SIZE ) - { - VT p0 = vop.load(row0+j-cn), p1 = vop.load(row0+j), p2 = vop.load(row0+j+cn); - VT p3 = vop.load(row1+j-cn), p4 = vop.load(row1+j), p5 = vop.load(row1+j+cn); - VT p6 = vop.load(row2+j-cn), p7 = vop.load(row2+j), p8 = vop.load(row2+j+cn); - - vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1); - vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5); - vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7); - vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7); - vop(p4, p2); vop(p6, p4); vop(p4, p2); - vop.store(dst+j, p4); - } - - limit = size.width; - } - } -#if CV_SIMD - vx_cleanup(); -#endif - } - else if( m == 5 ) - { - if( size.width == 1 || size.height == 1 ) - { - int len = size.width + size.height - 1; - int sdelta = size.height == 1 ? cn : sstep; - int sdelta0 = size.height == 1 ? 0 : sstep - cn; - int ddelta = size.height == 1 ? cn : dstep; - - for( i = 0; i < len; i++, src += sdelta0, dst += ddelta ) - for( j = 0; j < cn; j++, src++ ) - { - int i1 = i > 0 ? -sdelta : 0; - int i0 = i > 1 ? -sdelta*2 : i1; - int i3 = i < len-1 ? sdelta : 0; - int i4 = i < len-2 ? sdelta*2 : i3; - WT p0 = src[i0], p1 = src[i1], p2 = src[0], p3 = src[i3], p4 = src[i4]; - - op(p0, p1); op(p3, p4); op(p2, p3); op(p3, p4); op(p0, p2); - op(p2, p4); op(p1, p3); op(p1, p2); - dst[j] = (T)p2; - } - return; - } - - size.width *= cn; - for( i = 0; i < size.height; i++, dst += dstep ) - { - const T* row[5]; - row[0] = src + std::max(i - 2, 0)*sstep; - row[1] = src + std::max(i - 1, 0)*sstep; - row[2] = src + i*sstep; - row[3] = src + std::min(i + 1, size.height-1)*sstep; - row[4] = src + std::min(i + 2, size.height-1)*sstep; - int limit = cn*2; - - for(j = 0;; ) - { - for( ; j < limit; j++ ) - { - WT p[25]; - int j1 = j >= cn ? 
j - cn : j; - int j0 = j >= cn*2 ? j - cn*2 : j1; - int j3 = j < size.width - cn ? j + cn : j; - int j4 = j < size.width - cn*2 ? j + cn*2 : j3; - for( k = 0; k < 5; k++ ) - { - const T* rowk = row[k]; - p[k*5] = rowk[j0]; p[k*5+1] = rowk[j1]; - p[k*5+2] = rowk[j]; p[k*5+3] = rowk[j3]; - p[k*5+4] = rowk[j4]; - } - - op(p[1], p[2]); op(p[0], p[1]); op(p[1], p[2]); op(p[4], p[5]); op(p[3], p[4]); - op(p[4], p[5]); op(p[0], p[3]); op(p[2], p[5]); op(p[2], p[3]); op(p[1], p[4]); - op(p[1], p[2]); op(p[3], p[4]); op(p[7], p[8]); op(p[6], p[7]); op(p[7], p[8]); - op(p[10], p[11]); op(p[9], p[10]); op(p[10], p[11]); op(p[6], p[9]); op(p[8], p[11]); - op(p[8], p[9]); op(p[7], p[10]); op(p[7], p[8]); op(p[9], p[10]); op(p[0], p[6]); - op(p[4], p[10]); op(p[4], p[6]); op(p[2], p[8]); op(p[2], p[4]); op(p[6], p[8]); - op(p[1], p[7]); op(p[5], p[11]); op(p[5], p[7]); op(p[3], p[9]); op(p[3], p[5]); - op(p[7], p[9]); op(p[1], p[2]); op(p[3], p[4]); op(p[5], p[6]); op(p[7], p[8]); - op(p[9], p[10]); op(p[13], p[14]); op(p[12], p[13]); op(p[13], p[14]); op(p[16], p[17]); - op(p[15], p[16]); op(p[16], p[17]); op(p[12], p[15]); op(p[14], p[17]); op(p[14], p[15]); - op(p[13], p[16]); op(p[13], p[14]); op(p[15], p[16]); op(p[19], p[20]); op(p[18], p[19]); - op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[21], p[23]); op(p[22], p[24]); - op(p[22], p[23]); op(p[18], p[21]); op(p[20], p[23]); op(p[20], p[21]); op(p[19], p[22]); - op(p[22], p[24]); op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[12], p[18]); - op(p[16], p[22]); op(p[16], p[18]); op(p[14], p[20]); op(p[20], p[24]); op(p[14], p[16]); - op(p[18], p[20]); op(p[22], p[24]); op(p[13], p[19]); op(p[17], p[23]); op(p[17], p[19]); - op(p[15], p[21]); op(p[15], p[17]); op(p[19], p[21]); op(p[13], p[14]); op(p[15], p[16]); - op(p[17], p[18]); op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[0], p[12]); - op(p[8], p[20]); op(p[8], p[12]); op(p[4], p[16]); op(p[16], p[24]); op(p[12], p[16]); - op(p[2], p[14]); op(p[10], p[22]); op(p[10], p[14]); op(p[6], p[18]); op(p[6], p[10]); - op(p[10], p[12]); op(p[1], p[13]); op(p[9], p[21]); op(p[9], p[13]); op(p[5], p[17]); - op(p[13], p[17]); op(p[3], p[15]); op(p[11], p[23]); op(p[11], p[15]); op(p[7], p[19]); - op(p[7], p[11]); op(p[11], p[13]); op(p[11], p[12]); - dst[j] = (T)p[12]; - } - - if( limit == size.width ) - break; - -#if CV_SIMD_WIDTH > 16 - for( ; j <= size.width - VecOp::WSIZE - cn*2; j += VecOp::WSIZE ) - { - WVT p[25]; - for( k = 0; k < 5; k++ ) - { - const T* rowk = row[k]; - p[k*5] = vop.wload(rowk+j-cn*2); p[k*5+1] = vop.wload(rowk+j-cn); - p[k*5+2] = vop.wload(rowk+j); p[k*5+3] = vop.wload(rowk+j+cn); - p[k*5+4] = vop.wload(rowk+j+cn*2); - } - - vop(p[1], p[2]); vop(p[0], p[1]); vop(p[1], p[2]); vop(p[4], p[5]); vop(p[3], p[4]); - vop(p[4], p[5]); vop(p[0], p[3]); vop(p[2], p[5]); vop(p[2], p[3]); vop(p[1], p[4]); - vop(p[1], p[2]); vop(p[3], p[4]); vop(p[7], p[8]); vop(p[6], p[7]); vop(p[7], p[8]); - vop(p[10], p[11]); vop(p[9], p[10]); vop(p[10], p[11]); vop(p[6], p[9]); vop(p[8], p[11]); - vop(p[8], p[9]); vop(p[7], p[10]); vop(p[7], p[8]); vop(p[9], p[10]); vop(p[0], p[6]); - vop(p[4], p[10]); vop(p[4], p[6]); vop(p[2], p[8]); vop(p[2], p[4]); vop(p[6], p[8]); - vop(p[1], p[7]); vop(p[5], p[11]); vop(p[5], p[7]); vop(p[3], p[9]); vop(p[3], p[5]); - vop(p[7], p[9]); vop(p[1], p[2]); vop(p[3], p[4]); vop(p[5], p[6]); vop(p[7], p[8]); - vop(p[9], p[10]); vop(p[13], p[14]); vop(p[12], p[13]); vop(p[13], p[14]); vop(p[16], p[17]); - vop(p[15], p[16]); vop(p[16], 
p[17]); vop(p[12], p[15]); vop(p[14], p[17]); vop(p[14], p[15]); - vop(p[13], p[16]); vop(p[13], p[14]); vop(p[15], p[16]); vop(p[19], p[20]); vop(p[18], p[19]); - vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[21], p[23]); vop(p[22], p[24]); - vop(p[22], p[23]); vop(p[18], p[21]); vop(p[20], p[23]); vop(p[20], p[21]); vop(p[19], p[22]); - vop(p[22], p[24]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[12], p[18]); - vop(p[16], p[22]); vop(p[16], p[18]); vop(p[14], p[20]); vop(p[20], p[24]); vop(p[14], p[16]); - vop(p[18], p[20]); vop(p[22], p[24]); vop(p[13], p[19]); vop(p[17], p[23]); vop(p[17], p[19]); - vop(p[15], p[21]); vop(p[15], p[17]); vop(p[19], p[21]); vop(p[13], p[14]); vop(p[15], p[16]); - vop(p[17], p[18]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[0], p[12]); - vop(p[8], p[20]); vop(p[8], p[12]); vop(p[4], p[16]); vop(p[16], p[24]); vop(p[12], p[16]); - vop(p[2], p[14]); vop(p[10], p[22]); vop(p[10], p[14]); vop(p[6], p[18]); vop(p[6], p[10]); - vop(p[10], p[12]); vop(p[1], p[13]); vop(p[9], p[21]); vop(p[9], p[13]); vop(p[5], p[17]); - vop(p[13], p[17]); vop(p[3], p[15]); vop(p[11], p[23]); vop(p[11], p[15]); vop(p[7], p[19]); - vop(p[7], p[11]); vop(p[11], p[13]); vop(p[11], p[12]); - vop.store(dst+j, p[12]); - } -#endif - for( ; j <= size.width - VecOp::SIZE - cn*2; j += VecOp::SIZE ) - { - VT p[25]; - for( k = 0; k < 5; k++ ) - { - const T* rowk = row[k]; - p[k*5] = vop.load(rowk+j-cn*2); p[k*5+1] = vop.load(rowk+j-cn); - p[k*5+2] = vop.load(rowk+j); p[k*5+3] = vop.load(rowk+j+cn); - p[k*5+4] = vop.load(rowk+j+cn*2); - } - - vop(p[1], p[2]); vop(p[0], p[1]); vop(p[1], p[2]); vop(p[4], p[5]); vop(p[3], p[4]); - vop(p[4], p[5]); vop(p[0], p[3]); vop(p[2], p[5]); vop(p[2], p[3]); vop(p[1], p[4]); - vop(p[1], p[2]); vop(p[3], p[4]); vop(p[7], p[8]); vop(p[6], p[7]); vop(p[7], p[8]); - vop(p[10], p[11]); vop(p[9], p[10]); vop(p[10], p[11]); vop(p[6], p[9]); vop(p[8], p[11]); - vop(p[8], p[9]); vop(p[7], p[10]); vop(p[7], p[8]); vop(p[9], p[10]); vop(p[0], p[6]); - vop(p[4], p[10]); vop(p[4], p[6]); vop(p[2], p[8]); vop(p[2], p[4]); vop(p[6], p[8]); - vop(p[1], p[7]); vop(p[5], p[11]); vop(p[5], p[7]); vop(p[3], p[9]); vop(p[3], p[5]); - vop(p[7], p[9]); vop(p[1], p[2]); vop(p[3], p[4]); vop(p[5], p[6]); vop(p[7], p[8]); - vop(p[9], p[10]); vop(p[13], p[14]); vop(p[12], p[13]); vop(p[13], p[14]); vop(p[16], p[17]); - vop(p[15], p[16]); vop(p[16], p[17]); vop(p[12], p[15]); vop(p[14], p[17]); vop(p[14], p[15]); - vop(p[13], p[16]); vop(p[13], p[14]); vop(p[15], p[16]); vop(p[19], p[20]); vop(p[18], p[19]); - vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[21], p[23]); vop(p[22], p[24]); - vop(p[22], p[23]); vop(p[18], p[21]); vop(p[20], p[23]); vop(p[20], p[21]); vop(p[19], p[22]); - vop(p[22], p[24]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[12], p[18]); - vop(p[16], p[22]); vop(p[16], p[18]); vop(p[14], p[20]); vop(p[20], p[24]); vop(p[14], p[16]); - vop(p[18], p[20]); vop(p[22], p[24]); vop(p[13], p[19]); vop(p[17], p[23]); vop(p[17], p[19]); - vop(p[15], p[21]); vop(p[15], p[17]); vop(p[19], p[21]); vop(p[13], p[14]); vop(p[15], p[16]); - vop(p[17], p[18]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[0], p[12]); - vop(p[8], p[20]); vop(p[8], p[12]); vop(p[4], p[16]); vop(p[16], p[24]); vop(p[12], p[16]); - vop(p[2], p[14]); vop(p[10], p[22]); vop(p[10], p[14]); vop(p[6], p[18]); vop(p[6], p[10]); - vop(p[10], p[12]); vop(p[1], p[13]); vop(p[9], p[21]); vop(p[9], 
p[13]); vop(p[5], p[17]);
-                vop(p[13], p[17]); vop(p[3], p[15]); vop(p[11], p[23]); vop(p[11], p[15]); vop(p[7], p[19]);
-                vop(p[7], p[11]); vop(p[11], p[13]); vop(p[11], p[12]);
-                vop.store(dst+j, p[12]);
-            }
-
-            limit = size.width;
-        }
-    }
-#if CV_SIMD
-    vx_cleanup();
-#endif
-}
-}
+namespace cv {

 #ifdef HAVE_OPENCL

@@ -1160,7 +275,6 @@ static bool ipp_medianFilter(Mat &src0, Mat &dst, int ksize)
     }
 }
 #endif
-}

 void medianBlur( InputArray _src0, OutputArray _dst, int ksize )
 {
@@ -1194,49 +308,10 @@ void medianBlur( InputArray _src0, OutputArray _dst, int ksize )
         return;
 #endif

-    bool useSortNet = ksize == 3 || (ksize == 5
-#if !(CV_SIMD)
-            && ( src0.depth() > CV_8U || src0.channels() == 2 || src0.channels() > 4 )
-#endif
-        );
-
-    Mat src;
-    if( useSortNet )
-    {
-        if( dst.data != src0.data )
-            src = src0;
-        else
-            src0.copyTo(src);
-
-        if( src.depth() == CV_8U )
-            medianBlur_SortNet<MinMax8u, MinMaxVec8u>( src, dst, ksize );
-        else if( src.depth() == CV_16U )
-            medianBlur_SortNet<MinMax16u, MinMaxVec16u>( src, dst, ksize );
-        else if( src.depth() == CV_16S )
-            medianBlur_SortNet<MinMax16s, MinMaxVec16s>( src, dst, ksize );
-        else if( src.depth() == CV_32F )
-            medianBlur_SortNet<MinMax32f, MinMaxVec32f>( src, dst, ksize );
-        else
-            CV_Error(CV_StsUnsupportedFormat, "");
-
-        return;
-    }
-    else
-    {
-        cv::copyMakeBorder( src0, src, 0, 0, ksize/2, ksize/2, BORDER_REPLICATE|BORDER_ISOLATED);
-
-        int cn = src0.channels();
-        CV_Assert( src.depth() == CV_8U && (cn == 1 || cn == 3 || cn == 4) );
-
-        double img_size_mp = (double)(src0.total())/(1 << 20);
-        if( ksize <= 3 + (img_size_mp < 1 ? 12 : img_size_mp < 4 ? 6 : 2)*
-            (CV_SIMD ? 1 : 3))
-            medianBlur_8u_Om( src, dst, ksize );
-        else
-            medianBlur_8u_O1( src, dst, ksize );
-    }
+    CV_CPU_DISPATCH(medianBlur, (src0, dst, ksize),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
-}

+} // namespace

 /* End of file. */
diff --git a/modules/imgproc/src/median_blur.simd.hpp b/modules/imgproc/src/median_blur.simd.hpp
index c98cd9215a..c3203f2a07 100644
--- a/modules/imgproc/src/median_blur.simd.hpp
+++ b/modules/imgproc/src/median_blur.simd.hpp
@@ -46,9 +46,11 @@
 #include <vector>
 #include "opencv2/core/hal/intrin.hpp"
-#include "opencl_kernels_imgproc.hpp"
-#include "opencv2/core/openvx/ovx_defs.hpp"
+#ifdef _MSC_VER
+#pragma warning(disable: 4244)  // warning C4244: 'argument': conversion from 'int' to 'ushort', possible loss of data
+                                // triggered on intrinsic code from medianBlur_8u_O1()
+#endif

 /*
  * This file includes the code, contributed by Simon Perreault
@@ -71,12 +73,18 @@
       Median Filter
 \****************************************************************************************/

-namespace cv
-{
+namespace cv {
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+// forward declarations
+void medianBlur(const Mat& src0, /*const*/ Mat& dst, int ksize);
+
+#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

 static void
 medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
 {
+    CV_INSTRUMENT_REGION();
+
     typedef ushort HT;

@@ -330,9 +338,6 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
             }
         }
     }
-#if CV_SIMD
-    vx_cleanup();
-#endif
 }

 #undef HOP
@@ -342,6 +347,8 @@ medianBlur_8u_Om( const Mat& _src, Mat& _dst, int m )
 {
+    CV_INSTRUMENT_REGION();
+
     #define N 16
     int zone0[4][N];
     int zone1[4][N*N];
@@ -671,6 +678,8 @@ template<class Op, class VecOp>
 static void
 medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
 {
+    CV_INSTRUMENT_REGION();
+
     typedef typename Op::value_type T;
     typedef typename Op::arg_type WT;
     typedef typename VecOp::arg_type VT;
@@ -770,9 +779,6 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
             limit = 
size.width; } } -#if CV_SIMD - vx_cleanup(); -#endif } else if( m == 5 ) { @@ -934,266 +940,15 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) limit = size.width; } } -#if CV_SIMD - vx_cleanup(); -#endif } } -#ifdef HAVE_OPENCL +} // namespace anon -#define DIVUP(total, grain) ((total + grain - 1) / (grain)) - -static bool ocl_medianFilter(InputArray _src, OutputArray _dst, int m) -{ - size_t localsize[2] = { 16, 16 }; - size_t globalsize[2]; - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - - if ( !((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && cn <= 4 && (m == 3 || m == 5)) ) - return false; - - Size imgSize = _src.size(); - bool useOptimized = (1 == cn) && - (size_t)imgSize.width >= localsize[0] * 8 && - (size_t)imgSize.height >= localsize[1] * 8 && - imgSize.width % 4 == 0 && - imgSize.height % 4 == 0 && - (ocl::Device::getDefault().isIntel()); - - cv::String kname = format( useOptimized ? "medianFilter%d_u" : "medianFilter%d", m) ; - cv::String kdefs = useOptimized ? - format("-D T=%s -D T1=%s -D T4=%s%d -D cn=%d -D USE_4OPT", ocl::typeToStr(type), - ocl::typeToStr(depth), ocl::typeToStr(depth), cn*4, cn) - : - format("-D T=%s -D T1=%s -D cn=%d", ocl::typeToStr(type), ocl::typeToStr(depth), cn) ; - - ocl::Kernel k(kname.c_str(), ocl::imgproc::medianFilter_oclsrc, kdefs.c_str() ); - - if (k.empty()) - return false; - - UMat src = _src.getUMat(); - _dst.create(src.size(), type); - UMat dst = _dst.getUMat(); - - k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst)); - - if( useOptimized ) - { - globalsize[0] = DIVUP(src.cols / 4, localsize[0]) * localsize[0]; - globalsize[1] = DIVUP(src.rows / 4, localsize[1]) * localsize[1]; - } - else - { - globalsize[0] = (src.cols + localsize[0] + 2) / localsize[0] * localsize[0]; - globalsize[1] = (src.rows + localsize[1] - 1) / localsize[1] * localsize[1]; - } - - return k.run(2, globalsize, localsize, false); -} - -#undef DIVUP - -#endif - -#ifdef HAVE_OPENVX -namespace ovx { - template <> inline bool skipSmallImages(int w, int h) { return w*h < 1280 * 720; } -} -static bool openvx_medianFilter(InputArray _src, OutputArray _dst, int ksize) -{ - if (_src.type() != CV_8UC1 || _dst.type() != CV_8U -#ifndef VX_VERSION_1_1 - || ksize != 3 -#endif - ) - return false; - - Mat src = _src.getMat(); - Mat dst = _dst.getMat(); - - if ( -#ifdef VX_VERSION_1_1 - ksize != 3 ? 
ovx::skipSmallImages<VX_KERNEL_NON_LINEAR_FILTER>(src.cols, src.rows) :
-#endif
-        ovx::skipSmallImages<VX_KERNEL_MEDIAN_3x3>(src.cols, src.rows)
-        )
-        return false;
-
-    try
-    {
-        ivx::Context ctx = ovx::getOpenVXContext();
-#ifdef VX_VERSION_1_1
-        if ((vx_size)ksize > ctx.nonlinearMaxDimension())
-            return false;
-#endif
-
-        Mat a;
-        if (dst.data != src.data)
-            a = src;
-        else
-            src.copyTo(a);
-
-        ivx::Image
-            ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
-                ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data),
-            ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
-                ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data);
-
-        //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments
-        //since OpenVX standard says nothing about thread-safety for now
-        ivx::border_t prevBorder = ctx.immediateBorder();
-        ctx.setImmediateBorder(VX_BORDER_REPLICATE);
-#ifdef VX_VERSION_1_1
-        if (ksize == 3)
-#endif
-        {
-            ivx::IVX_CHECK_STATUS(vxuMedian3x3(ctx, ia, ib));
-        }
-#ifdef VX_VERSION_1_1
-        else
-        {
-            ivx::Matrix mtx;
-            if(ksize == 5)
-                mtx = ivx::Matrix::createFromPattern(ctx, VX_PATTERN_BOX, ksize, ksize);
-            else
-            {
-                vx_size supportedSize;
-                ivx::IVX_CHECK_STATUS(vxQueryContext(ctx, VX_CONTEXT_NONLINEAR_MAX_DIMENSION, &supportedSize, sizeof(supportedSize)));
-                if ((vx_size)ksize > supportedSize)
-                {
-                    ctx.setImmediateBorder(prevBorder);
-                    return false;
-                }
-                Mat mask(ksize, ksize, CV_8UC1, Scalar(255));
-                mtx = ivx::Matrix::create(ctx, VX_TYPE_UINT8, ksize, ksize);
-                mtx.copyFrom(mask);
-            }
-            ivx::IVX_CHECK_STATUS(vxuNonLinearFilter(ctx, VX_NONLINEAR_FILTER_MEDIAN, ia, mtx, ib));
-        }
-#endif
-        ctx.setImmediateBorder(prevBorder);
-    }
-    catch (const ivx::RuntimeError & e)
-    {
-        VX_DbgThrow(e.what());
-    }
-    catch (const ivx::WrapperError & e)
-    {
-        VX_DbgThrow(e.what());
-    }
-
-    return true;
-}
-#endif
-
-#ifdef HAVE_IPP
-static bool ipp_medianFilter(Mat &src0, Mat &dst, int ksize)
-{
-    CV_INSTRUMENT_REGION_IPP();
-
-#if IPP_VERSION_X100 < 201801
-    // Degradations for big kernel
-    if(ksize > 7)
-        return false;
-#endif
-
-    {
-        int bufSize;
-        IppiSize dstRoiSize = ippiSize(dst.cols, dst.rows), maskSize = ippiSize(ksize, ksize);
-        IppDataType ippType = ippiGetDataType(src0.type());
-        int channels = src0.channels();
-        IppAutoBuffer<Ipp8u> buffer;
-
-        if(src0.isSubmatrix())
-            return false;
-
-        Mat src;
-        if(dst.data != src0.data)
-            src = src0;
-        else
-            src0.copyTo(src);
-
-        if(ippiFilterMedianBorderGetBufferSize(dstRoiSize, maskSize, ippType, channels, &bufSize) < 0)
-            return false;
-
-        buffer.allocate(bufSize);
-
-        switch(ippType)
-        {
-        case ipp8u:
-            if(channels == 1)
-                return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_8u_C1R, src.ptr<Ipp8u>(), (int)src.step, dst.ptr<Ipp8u>(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0;
-            else if(channels == 3)
-                return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_8u_C3R, src.ptr<Ipp8u>(), (int)src.step, dst.ptr<Ipp8u>(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0;
-            else if(channels == 4)
-                return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_8u_C4R, src.ptr<Ipp8u>(), (int)src.step, dst.ptr<Ipp8u>(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0;
-            else
-                return false;
-        case ipp16u:
-            if(channels == 1)
-                return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16u_C1R, src.ptr<Ipp16u>(), (int)src.step, dst.ptr<Ipp16u>(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0;
-            else if(channels == 3)
-                return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16u_C3R, src.ptr<Ipp16u>(), 
(int)src.step, dst.ptr<Ipp16u>(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0;
-            else if(channels == 4)
-                return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16u_C4R, src.ptr<Ipp16u>(), (int)src.step, dst.ptr<Ipp16u>(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0;
-            else
-                return false;
-        case ipp16s:
-            if(channels == 1)
-                return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16s_C1R, src.ptr<Ipp16s>(), (int)src.step, dst.ptr<Ipp16s>(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0;
-            else if(channels == 3)
-                return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16s_C3R, src.ptr<Ipp16s>(), (int)src.step, dst.ptr<Ipp16s>(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0;
-            else if(channels == 4)
-                return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16s_C4R, src.ptr<Ipp16s>(), (int)src.step, dst.ptr<Ipp16s>(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0;
-            else
-                return false;
-        case ipp32f:
-            if(channels == 1)
-                return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_32f_C1R, src.ptr<Ipp32f>(), (int)src.step, dst.ptr<Ipp32f>(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0;
-            else
-                return false;
-        default:
-            return false;
-        }
-    }
-}
-#endif
-}

-void medianBlur( InputArray _src0, OutputArray _dst, int ksize )
+void medianBlur(const Mat& src0, /*const*/ Mat& dst, int ksize)
 {
     CV_INSTRUMENT_REGION();

-    CV_Assert( (ksize % 2 == 1) && (_src0.dims() <= 2 ));
-
-    if( ksize <= 1 || _src0.empty() )
-    {
-        _src0.copyTo(_dst);
-        return;
-    }
-
-    CV_OCL_RUN(_dst.isUMat(),
-               ocl_medianFilter(_src0,_dst, ksize))
-
-    Mat src0 = _src0.getMat();
-    _dst.create( src0.size(), src0.type() );
-    Mat dst = _dst.getMat();
-
-    CALL_HAL(medianBlur, cv_hal_medianBlur, src0.data, src0.step, dst.data, dst.step, src0.cols, src0.rows, src0.depth(),
-             src0.channels(), ksize);
-
-    CV_OVX_RUN(true,
-               openvx_medianFilter(_src0, _dst, ksize))
-
-    CV_IPP_RUN_FAST(ipp_medianFilter(src0, dst, ksize));
-
-#ifdef HAVE_TEGRA_OPTIMIZATION
-    if (tegra::useTegra() && tegra::medianBlur(src0, dst, ksize))
-        return;
-#endif

     bool useSortNet = ksize == 3 || (ksize == 5
 #if !(CV_SIMD)
             && ( src0.depth() > CV_8U || src0.channels() == 2 || src0.channels() > 4 )
@@ -1223,6 +978,7 @@ void medianBlur( InputArray _src0, OutputArray _dst, int ksize )
     }
     else
     {
+        // TODO AVX guard (external call)
         cv::copyMakeBorder( src0, src, 0, 0, ksize/2, ksize/2, BORDER_REPLICATE|BORDER_ISOLATED);

         int cn = src0.channels();
@@ -1237,6 +993,6 @@ void medianBlur( InputArray _src0, OutputArray _dst, int ksize )
     }
 }

-}
-
-/* End of file. */
+#endif
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+} // namespace

From b99c9145bfbc4fca4d8358e988fe1322f807df88 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin
Date: Sat, 9 Mar 2019 19:20:24 +0000
Subject: [PATCH 6/9] imgproc: dispatch smooth

---
 modules/imgproc/CMakeLists.txt          |    1 +
 modules/imgproc/src/fixedpoint.inl.hpp  |    5 +-
 modules/imgproc/src/smooth.dispatch.cpp | 1990 +----------------------
 modules/imgproc/src/smooth.simd.hpp     |  541 +-----
 4 files changed, 49 insertions(+), 2488 deletions(-)

diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt
index c149edb9b3..d28d6b9046 100644
--- a/modules/imgproc/CMakeLists.txt
+++ b/modules/imgproc/CMakeLists.txt
@@ -5,4 +5,5 @@ ocv_add_dispatched_file(color_hsv SSE2 SSE4_1 AVX2)
 ocv_add_dispatched_file(color_rgb SSE2 SSE4_1 AVX2)
 ocv_add_dispatched_file(color_yuv SSE2 SSE4_1 AVX2)
 ocv_add_dispatched_file(median_blur SSE2 SSE4_1 AVX2)
+ocv_add_dispatched_file(smooth SSE2 SSE4_1 AVX2)
 ocv_define_module(imgproc opencv_core WRAP java python js)
diff --git a/modules/imgproc/src/fixedpoint.inl.hpp b/modules/imgproc/src/fixedpoint.inl.hpp
index a1a75a29e1..40b1c3faa1 100644
--- a/modules/imgproc/src/fixedpoint.inl.hpp
+++ b/modules/imgproc/src/fixedpoint.inl.hpp
@@ -9,10 +9,7 @@
 #ifndef _CV_FIXEDPOINT_HPP_
 #define _CV_FIXEDPOINT_HPP_

-#include "opencv2/core/softfloat.hpp"
-
-namespace
-{
+namespace {

 class fixedpoint64
 {
diff --git a/modules/imgproc/src/smooth.dispatch.cpp b/modules/imgproc/src/smooth.dispatch.cpp
index 909ffa919c..4e514eb8b8 100644
--- a/modules/imgproc/src/smooth.dispatch.cpp
+++ b/modules/imgproc/src/smooth.dispatch.cpp
@@ -52,13 +52,22 @@
 #include "filter.hpp"

+#include "opencv2/core/softfloat.hpp"
+
+namespace cv {
 #include "fixedpoint.inl.hpp"
+}
+
+#include "smooth.simd.hpp"
+#include "smooth.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
+
+namespace cv {

 /****************************************************************************************\
                                      Gaussian Blur
 \****************************************************************************************/

-cv::Mat cv::getGaussianKernel( int n, double sigma, int ktype )
+Mat getGaussianKernel(int n, double sigma, int ktype)
 {
     CV_Assert(n > 0);
     const int SMALL_GAUSSIAN_SIZE = 7;
@@ -112,8 +121,6 @@ cv::Mat cv::getGaussianKernel( int n, double sigma, int ktype )
     return kernel;
 }

-namespace cv {
-
 template <typename T>
 static std::vector<T> getFixedpointGaussianKernel( int n, double sigma )
 {
@@ -161,1964 +168,6 @@ static std::vector<T> getFixedpointGaussianKernel( int n, double sigma )
     return kernel;
 };

-template <typename ET, typename FT>
-void hlineSmooth1N(const ET* src, int cn, const FT* m, int, FT* dst, int len, int)
-{
-    for (int i = 0; i < len*cn; i++, src++, dst++)
-        *dst = (*m) * (*src);
-}
-template <>
-void hlineSmooth1N<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufixedpoint16* m, int, ufixedpoint16* dst, int len, int)
-{
-    int lencn = len*cn;
-    int i = 0;
-#if CV_SIMD
-    const int VECSZ = v_uint16::nlanes;
-    v_uint16 v_mul = vx_setall_u16(*((uint16_t*)m));
-    for (; i <= lencn - VECSZ; i += VECSZ)
-        v_store((uint16_t*)dst + i, v_mul_wrap(v_mul, vx_load_expand(src + i)));
-#endif
-    for (; i < lencn; i++)
-        dst[i] = m[0] * src[i];
-}
-template <typename ET, typename FT>
-void hlineSmooth1N1(const ET* src, int cn, const FT*, int, FT* dst, int len, int)
-{
-    for (int i = 0; i < len*cn; i++, src++, dst++)
-        *dst = *src;
-}
-template <>
-void hlineSmooth1N1<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufixedpoint16*, int, ufixedpoint16* dst, int len, int)
-{
-    int lencn = len*cn;
-    int i = 0;
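-    // ufixedpoint16 stores 8-bit pixel data in Q8.8 form, so converting a
-    // pixel is a plain left shift by 8; the vector loop below only widens each
-    // byte to ushort with vx_load_expand and shifts it up -- no multiply is
-    // needed for this identity kernel.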
-#if CV_SIMD - const int VECSZ = v_uint16::nlanes; - for (; i <= lencn - VECSZ; i += VECSZ) - v_store((uint16_t*)dst + i, v_shl<8>(vx_load_expand(src + i))); -#endif - for (; i < lencn; i++) - dst[i] = src[i]; -} -template -void hlineSmooth3N(const ET* src, int cn, const FT* m, int, FT* dst, int len, int borderType) -{ - if (len == 1) - { - FT msum = borderType != BORDER_CONSTANT ? m[0] + m[1] + m[2] : m[1]; - for (int k = 0; k < cn; k++) - dst[k] = msum * src[k]; - } - else - { - // Point that fall left from border - for (int k = 0; k < cn; k++) - dst[k] = m[1] * src[k] + m[2] * src[cn + k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = borderInterpolate(-1, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[0] * src[src_idx*cn + k]; - } - - src += cn; dst += cn; - for (int i = cn; i < (len - 1)*cn; i++, src++, dst++) - *dst = m[0] * src[-cn] + m[1] * src[0] + m[2] * src[cn]; - - // Point that fall right from border - for (int k = 0; k < cn; k++) - dst[k] = m[0] * src[k - cn] + m[1] * src[k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = (borderInterpolate(len, len, borderType) - (len - 1))*cn; - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[2] * src[src_idx + k]; - } - } -} -template <> -void hlineSmooth3N(const uint8_t* src, int cn, const ufixedpoint16* m, int, ufixedpoint16* dst, int len, int borderType) -{ - if (len == 1) - { - ufixedpoint16 msum = borderType != BORDER_CONSTANT ? m[0] + m[1] + m[2] : m[1]; - for (int k = 0; k < cn; k++) - dst[k] = msum * src[k]; - } - else - { - // Point that fall left from border - for (int k = 0; k < cn; k++) - dst[k] = m[1] * src[k] + m[2] * src[cn + k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = borderInterpolate(-1, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[0] * src[src_idx*cn + k]; - } - - src += cn; dst += cn; - int i = cn, lencn = (len - 1)*cn; -#if CV_SIMD - const uint16_t* _m = (const uint16_t*)m; - const int VECSZ = v_uint16::nlanes; - v_uint16 v_mul0 = vx_setall_u16(_m[0]); - v_uint16 v_mul1 = vx_setall_u16(_m[1]); - v_uint16 v_mul2 = vx_setall_u16(_m[2]); - for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - cn), v_mul0) + - v_mul_wrap(vx_load_expand(src), v_mul1) + - v_mul_wrap(vx_load_expand(src + cn), v_mul2)); -#endif - for (; i < lencn; i++, src++, dst++) - *dst = m[0] * src[-cn] + m[1] * src[0] + m[2] * src[cn]; - - // Point that fall right from border - for (int k = 0; k < cn; k++) - dst[k] = m[0] * src[k - cn] + m[1] * src[k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = (borderInterpolate(len, len, borderType) - (len - 1))*cn; - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[2] * src[src_idx + k]; - } - } -} -template -void hlineSmooth3N121(const ET* src, int cn, const FT*, int, FT* dst, int len, int borderType) -{ - if (len == 1) - { - if(borderType != BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - dst[k] = FT(src[k]); - else - for (int k = 0; k < cn; k++) - dst[k] = FT(src[k])>>1; - } - else - { - // Point that fall left from border - for (int k = 0; k < cn; k++) - dst[k] = (FT(src[k])>>1) + (FT(src[cn + k])>>2); - if (borderType 
!= BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = borderInterpolate(-1, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + (FT(src[src_idx*cn + k])>>2); - } - - src += cn; dst += cn; - for (int i = cn; i < (len - 1)*cn; i++, src++, dst++) - *dst = (FT(src[-cn])>>2) + (FT(src[cn])>>2) + (FT(src[0])>>1); - - // Point that fall right from border - for (int k = 0; k < cn; k++) - dst[k] = (FT(src[k - cn])>>2) + (FT(src[k])>>1); - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = (borderInterpolate(len, len, borderType) - (len - 1))*cn; - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + (FT(src[src_idx + k])>>2); - } - } -} -template <> -void hlineSmooth3N121(const uint8_t* src, int cn, const ufixedpoint16*, int, ufixedpoint16* dst, int len, int borderType) -{ - if (len == 1) - { - if (borderType != BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - dst[k] = ufixedpoint16(src[k]); - else - for (int k = 0; k < cn; k++) - dst[k] = ufixedpoint16(src[k]) >> 1; - } - else - { - // Point that fall left from border - for (int k = 0; k < cn; k++) - dst[k] = (ufixedpoint16(src[k])>>1) + (ufixedpoint16(src[cn + k])>>2); - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = borderInterpolate(-1, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + (ufixedpoint16(src[src_idx*cn + k])>>2); - } - - src += cn; dst += cn; - int i = cn, lencn = (len - 1)*cn; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; - for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, (vx_load_expand(src - cn) + vx_load_expand(src + cn) + (vx_load_expand(src) << 1)) << 6); -#endif - for (; i < lencn; i++, src++, dst++) - *((uint16_t*)dst) = (uint16_t(src[-cn]) + uint16_t(src[cn]) + (uint16_t(src[0]) << 1)) << 6; - - // Point that fall right from border - for (int k = 0; k < cn; k++) - dst[k] = (ufixedpoint16(src[k - cn])>>2) + (ufixedpoint16(src[k])>>1); - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = (borderInterpolate(len, len, borderType) - (len - 1))*cn; - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + (ufixedpoint16(src[src_idx + k])>>2); - } - } -} -template -void hlineSmooth3Naba(const ET* src, int cn, const FT* m, int, FT* dst, int len, int borderType) -{ - if (len == 1) - { - FT msum = borderType != BORDER_CONSTANT ? 
(m[0]<<1) + m[1] : m[1]; - for (int k = 0; k < cn; k++) - dst[k] = msum * src[k]; - } - else - { - // Point that fall left from border - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = borderInterpolate(-1, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = m[1] * src[k] + m[0] * src[cn + k] + m[0] * src[src_idx*cn + k]; - } - else - { - for (int k = 0; k < cn; k++) - dst[k] = m[1] * src[k] + m[0] * src[cn + k]; - } - - src += cn; dst += cn; - for (int i = cn; i < (len - 1)*cn; i++, src++, dst++) - *dst = m[1] * src[0] + m[0] * src[-cn] + m[0] * src[cn]; - - // Point that fall right from border - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = (borderInterpolate(len, len, borderType) - (len - 1))*cn; - for (int k = 0; k < cn; k++) - dst[k] = m[1] * src[k] + m[0] * src[k - cn] + m[0] * src[src_idx + k]; - } - else - { - for (int k = 0; k < cn; k++) - dst[k] = m[0] * src[k - cn] + m[1] * src[k]; - } - } -} -template <> -void hlineSmooth3Naba(const uint8_t* src, int cn, const ufixedpoint16* m, int, ufixedpoint16* dst, int len, int borderType) -{ - if (len == 1) - { - ufixedpoint16 msum = borderType != BORDER_CONSTANT ? (m[0]<<1) + m[1] : m[1]; - for (int k = 0; k < cn; k++) - dst[k] = msum * src[k]; - } - else - { - // Point that fall left from border - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = borderInterpolate(-1, len, borderType); - for (int k = 0; k < cn; k++) - ((uint16_t*)dst)[k] = ((uint16_t*)m)[1] * src[k] + ((uint16_t*)m)[0] * ((uint16_t)(src[cn + k]) + (uint16_t)(src[src_idx*cn + k])); - } - else - { - for (int k = 0; k < cn; k++) - dst[k] = m[1] * src[k] + m[0] * src[cn + k]; - } - - src += cn; dst += cn; - int i = cn, lencn = (len - 1)*cn; -#if CV_SIMD - const uint16_t* _m = (const uint16_t*)m; - const int VECSZ = v_uint16::nlanes; - v_uint16 v_mul0 = vx_setall_u16(_m[0]); - v_uint16 v_mul1 = vx_setall_u16(_m[1]); - for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - cn) + vx_load_expand(src + cn), v_mul0) + - v_mul_wrap(vx_load_expand(src), v_mul1)); -#endif - for (; i < lencn; i++, src++, dst++) - *((uint16_t*)dst) = ((uint16_t*)m)[1] * src[0] + ((uint16_t*)m)[0] * ((uint16_t)(src[-cn]) + (uint16_t)(src[cn])); - - // Point that fall right from border - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = (borderInterpolate(len, len, borderType) - (len - 1))*cn; - for (int k = 0; k < cn; k++) - ((uint16_t*)dst)[k] = ((uint16_t*)m)[1] * src[k] + ((uint16_t*)m)[0] * ((uint16_t)(src[k - cn]) + (uint16_t)(src[src_idx + k])); - } - else - { - for (int k = 0; k < cn; k++) - dst[k] = m[0] * src[k - cn] + m[1] * src[k]; - } - } -} -template -void hlineSmooth5N(const ET* src, int cn, const FT* m, int, FT* dst, int len, int borderType) -{ - if (len == 1) - { - FT msum = borderType != BORDER_CONSTANT ? 
m[0] + m[1] + m[2] + m[3] + m[4] : m[2]; - for (int k = 0; k < cn; k++) - dst[k] = msum * src[k]; - } - else if (len == 2) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k ] = m[2] * src[k] + m[3] * src[k+cn]; - dst[k+cn] = m[1] * src[k] + m[2] * src[k+cn]; - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(2, len, borderType)*cn; - int idxp2 = borderInterpolate(3, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k ] = m[1] * src[k + idxm1] + m[2] * src[k] + m[3] * src[k + cn] + m[4] * src[k + idxp1] + m[0] * src[k + idxm2]; - dst[k + cn] = m[0] * src[k + idxm1] + m[1] * src[k] + m[2] * src[k + cn] + m[3] * src[k + idxp1] + m[4] * src[k + idxp2]; - } - } - } - else if (len == 3) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k ] = m[2] * src[k] + m[3] * src[k + cn] + m[4] * src[k + 2*cn]; - dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn] + m[3] * src[k + 2*cn]; - dst[k + 2*cn] = m[0] * src[k] + m[1] * src[k + cn] + m[2] * src[k + 2*cn]; - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(3, len, borderType)*cn; - int idxp2 = borderInterpolate(4, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k ] = m[2] * src[k] + m[3] * src[k + cn] + m[4] * src[k + 2*cn] + m[0] * src[k + idxm2] + m[1] * src[k + idxm1]; - dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn] + m[3] * src[k + 2*cn] + m[0] * src[k + idxm1] + m[4] * src[k + idxp1]; - dst[k + 2*cn] = m[0] * src[k] + m[1] * src[k + cn] + m[2] * src[k + 2*cn] + m[3] * src[k + idxp1] + m[4] * src[k + idxp2]; - } - } - } - else - { - // Points that fall left from border - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[3] * src[cn + k] + m[4] * src[2*cn + k]; - dst[k + cn] = m[1] * src[k] + m[2] * src[cn + k] + m[3] * src[2*cn + k] + m[4] * src[3*cn + k]; - } - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = dst[k] + m[0] * src[idxm2 + k] + m[1] * src[idxm1 + k]; - dst[k + cn] = dst[k + cn] + m[0] * src[idxm1 + k]; - } - } - - src += 2*cn; dst += 2*cn; - for (int i = 2*cn; i < (len - 2)*cn; i++, src++, dst++) - *dst = m[0] * src[-2*cn] + m[1] * src[-cn] + m[2] * src[0] + m[3] * src[cn] + m[4] * src[2*cn]; - - // Points that fall right from border - for (int k = 0; k < cn; k++) - { - dst[k] = m[0] * src[k - 2*cn] + m[1] * src[k - cn] + m[2] * src[k] + m[3] * src[k + cn]; - dst[k + cn] = m[0] * src[k - cn] + m[1] * src[k] + m[2] * src[k + cn]; - } - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxp1 = (borderInterpolate(len, len, borderType) - (len - 2))*cn; - int idxp2 = (borderInterpolate(len+1, len, borderType) - (len - 2))*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = dst[k] + m[4] * src[idxp1 + k]; - dst[k + cn] = dst[k + cn] + m[3] * src[idxp1 + k] + m[4] * src[idxp2 + k]; - } - } - } -} -template <> -void hlineSmooth5N(const uint8_t* src, int cn, const ufixedpoint16* m, int, ufixedpoint16* dst, int len, int borderType) -{ - if (len == 1) - { - ufixedpoint16 msum = borderType != BORDER_CONSTANT ? 
m[0] + m[1] + m[2] + m[3] + m[4] : m[2]; - for (int k = 0; k < cn; k++) - dst[k] = msum * src[k]; - } - else if (len == 2) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[3] * src[k + cn]; - dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn]; - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(2, len, borderType)*cn; - int idxp2 = borderInterpolate(3, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = m[1] * src[k + idxm1] + m[2] * src[k] + m[3] * src[k + cn] + m[4] * src[k + idxp1] + m[0] * src[k + idxm2]; - dst[k + cn] = m[0] * src[k + idxm1] + m[1] * src[k] + m[2] * src[k + cn] + m[3] * src[k + idxp1] + m[4] * src[k + idxp2]; - } - } - } - else if (len == 3) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[3] * src[k + cn] + m[4] * src[k + 2 * cn]; - dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn] + m[3] * src[k + 2 * cn]; - dst[k + 2 * cn] = m[0] * src[k] + m[1] * src[k + cn] + m[2] * src[k + 2 * cn]; - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(3, len, borderType)*cn; - int idxp2 = borderInterpolate(4, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[3] * src[k + cn] + m[4] * src[k + 2 * cn] + m[0] * src[k + idxm2] + m[1] * src[k + idxm1]; - dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn] + m[3] * src[k + 2 * cn] + m[0] * src[k + idxm1] + m[4] * src[k + idxp1]; - dst[k + 2 * cn] = m[0] * src[k] + m[1] * src[k + cn] + m[2] * src[k + 2 * cn] + m[3] * src[k + idxp1] + m[4] * src[k + idxp2]; - } - } - } - else - { - // Points that fall left from border - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[3] * src[cn + k] + m[4] * src[2 * cn + k]; - dst[k + cn] = m[1] * src[k] + m[2] * src[cn + k] + m[3] * src[2 * cn + k] + m[4] * src[3 * cn + k]; - } - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = dst[k] + m[0] * src[idxm2 + k] + m[1] * src[idxm1 + k]; - dst[k + cn] = dst[k + cn] + m[0] * src[idxm1 + k]; - } - } - - src += 2 * cn; dst += 2 * cn; - int i = 2*cn, lencn = (len - 2)*cn; -#if CV_SIMD - const uint16_t* _m = (const uint16_t*)m; - const int VECSZ = v_uint16::nlanes; - v_uint16 v_mul0 = vx_setall_u16(_m[0]); - v_uint16 v_mul1 = vx_setall_u16(_m[1]); - v_uint16 v_mul2 = vx_setall_u16(_m[2]); - v_uint16 v_mul3 = vx_setall_u16(_m[3]); - v_uint16 v_mul4 = vx_setall_u16(_m[4]); - for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - 2 * cn), v_mul0) + - v_mul_wrap(vx_load_expand(src - cn), v_mul1) + - v_mul_wrap(vx_load_expand(src), v_mul2) + - v_mul_wrap(vx_load_expand(src + cn), v_mul3) + - v_mul_wrap(vx_load_expand(src + 2 * cn), v_mul4)); -#endif - for (; i < lencn; i++, src++, dst++) - *dst = m[0] * src[-2*cn] + m[1] * src[-cn] + m[2] * src[0] + m[3] * src[cn] + m[4] * src[2*cn]; - - // Points that fall right from border - for (int k = 0; k < cn; k++) - { - dst[k] = m[0] * src[k - 2 * cn] + m[1] * src[k - cn] + m[2] * src[k] + m[3] * src[k + cn]; - dst[k + cn] = m[0] * src[k - cn] + m[1] 
* src[k] + m[2] * src[k + cn]; - } - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxp1 = (borderInterpolate(len, len, borderType) - (len - 2))*cn; - int idxp2 = (borderInterpolate(len + 1, len, borderType) - (len - 2))*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = dst[k] + m[4] * src[idxp1 + k]; - dst[k + cn] = dst[k + cn] + m[3] * src[idxp1 + k] + m[4] * src[idxp2 + k]; - } - } - } -} -template -void hlineSmooth5N14641(const ET* src, int cn, const FT*, int, FT* dst, int len, int borderType) -{ - if (len == 1) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - dst[k] = (FT(src[k])>>3)*(uint8_t)3; - else - for (int k = 0; k < cn; k++) - dst[k] = src[k]; - } - else if (len == 2) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k] = (FT(src[k])>>4)*(uint8_t)6 + (FT(src[k + cn])>>2); - dst[k + cn] = (FT(src[k]) >> 2) + (FT(src[k + cn])>>4)*(uint8_t)6; - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(2, len, borderType)*cn; - int idxp2 = borderInterpolate(3, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = (FT(src[k])>>4)*(uint8_t)6 + (FT(src[k + idxm1])>>2) + (FT(src[k + cn])>>2) + (FT(src[k + idxp1])>>4) + (FT(src[k + idxm2])>>4); - dst[k + cn] = (FT(src[k + cn])>>4)*(uint8_t)6 + (FT(src[k])>>2) + (FT(src[k + idxp1])>>2) + (FT(src[k + idxm1])>>4) + (FT(src[k + idxp2])>>4); - } - } - } - else if (len == 3) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k] = (FT(src[k])>>4)*(uint8_t)6 + (FT(src[k + cn])>>2) + (FT(src[k + 2 * cn])>>4); - dst[k + cn] = (FT(src[k + cn])>>4)*(uint8_t)6 + (FT(src[k])>>2) + (FT(src[k + 2 * cn])>>2); - dst[k + 2 * cn] = (FT(src[k + 2 * cn])>>4)*(uint8_t)6 + (FT(src[k + cn])>>2) + (FT(src[k])>>4); - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(3, len, borderType)*cn; - int idxp2 = borderInterpolate(4, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = (FT(src[k])>>4)*(uint8_t)6 + (FT(src[k + cn])>>2) + (FT(src[k + idxm1])>>2) + (FT(src[k + 2 * cn])>>4) + (FT(src[k + idxm2])>>4); - dst[k + cn] = (FT(src[k + cn])>>4)*(uint8_t)6 + (FT(src[k])>>2) + (FT(src[k + 2 * cn])>>2) + (FT(src[k + idxm1])>>4) + (FT(src[k + idxp1])>>4); - dst[k + 2 * cn] = (FT(src[k + 2 * cn])>>4)*(uint8_t)6 + (FT(src[k + cn])>>2) + (FT(src[k + idxp1])>>2) + (FT(src[k])>>4) + (FT(src[k + idxp2])>>4); - } - } - } - else - { - // Points that fall left from border - for (int k = 0; k < cn; k++) - { - dst[k] = (FT(src[k])>>4)*(uint8_t)6 + (FT(src[cn + k])>>2) + (FT(src[2 * cn + k])>>4); - dst[k + cn] = (FT(src[cn + k])>>4)*(uint8_t)6 + (FT(src[k])>>2) + (FT(src[2 * cn + k])>>2) + (FT(src[3 * cn + k])>>4); - } - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = dst[k] + (FT(src[idxm2 + k])>>4) + (FT(src[idxm1 + k])>>2); - dst[k + cn] = dst[k + cn] + (FT(src[idxm1 + k])>>4); - } - } - - src += 2 * cn; dst += 2 * cn; - for (int i = 2 * cn; i < (len - 2)*cn; i++, src++, dst++) - *dst = (FT(src[0])>>4)*(uint8_t)6 + (FT(src[-cn])>>2) + (FT(src[cn])>>2) + 
(FT(src[-2 * cn])>>4) + (FT(src[2 * cn])>>4); - - // Points that fall right from border - for (int k = 0; k < cn; k++) - { - dst[k] = (FT(src[k])>>4)*(uint8_t)6 + (FT(src[k - cn])>>2) + (FT(src[k + cn])>>2) + (FT(src[k - 2 * cn])>>4); - dst[k + cn] = (FT(src[k + cn])>>4)*(uint8_t)6 + (FT(src[k])>>2) + (FT(src[k - cn])>>4); - } - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxp1 = (borderInterpolate(len, len, borderType) - (len - 2))*cn; - int idxp2 = (borderInterpolate(len + 1, len, borderType) - (len - 2))*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = dst[k] + (FT(src[idxp1 + k])>>4); - dst[k + cn] = dst[k + cn] + (FT(src[idxp1 + k])>>2) + (FT(src[idxp2 + k])>>4); - } - } - } -} -template <> -void hlineSmooth5N14641(const uint8_t* src, int cn, const ufixedpoint16*, int, ufixedpoint16* dst, int len, int borderType) -{ - if (len == 1) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - dst[k] = (ufixedpoint16(src[k])>>3) * (uint8_t)3; - else - { - for (int k = 0; k < cn; k++) - dst[k] = src[k]; - } - } - else if (len == 2) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k] = (ufixedpoint16(src[k]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k + cn]) >> 2); - dst[k + cn] = (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[k + cn]) >> 4) * (uint8_t)6; - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(2, len, borderType)*cn; - int idxp2 = borderInterpolate(3, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = (ufixedpoint16(src[k]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k + idxm1]) >> 2) + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k + idxp1]) >> 4) + (ufixedpoint16(src[k + idxm2]) >> 4); - dst[k + cn] = (ufixedpoint16(src[k + cn]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[k + idxp1]) >> 2) + (ufixedpoint16(src[k + idxm1]) >> 4) + (ufixedpoint16(src[k + idxp2]) >> 4); - } - } - } - else if (len == 3) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k] = (ufixedpoint16(src[k]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k + 2 * cn]) >> 4); - dst[k + cn] = (ufixedpoint16(src[k + cn]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[k + 2 * cn]) >> 2); - dst[k + 2 * cn] = (ufixedpoint16(src[k + 2 * cn]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k]) >> 4); - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(3, len, borderType)*cn; - int idxp2 = borderInterpolate(4, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = (ufixedpoint16(src[k]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k + idxm1]) >> 2) + (ufixedpoint16(src[k + 2 * cn]) >> 4) + (ufixedpoint16(src[k + idxm2]) >> 4); - dst[k + cn] = (ufixedpoint16(src[k + cn]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[k + 2 * cn]) >> 2) + (ufixedpoint16(src[k + idxm1]) >> 4) + (ufixedpoint16(src[k + idxp1]) >> 4); - dst[k + 2 * cn] = (ufixedpoint16(src[k + 2 * cn]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k + idxp1]) >> 2) + (ufixedpoint16(src[k]) >> 4) + (ufixedpoint16(src[k + idxp2]) >> 4); - } - } - } - else - { - 
// Points that fall left from border - for (int k = 0; k < cn; k++) - { - dst[k] = (ufixedpoint16(src[k]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[cn + k]) >> 2) + (ufixedpoint16(src[2 * cn + k]) >> 4); - dst[k + cn] = (ufixedpoint16(src[cn + k]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[2 * cn + k]) >> 2) + (ufixedpoint16(src[3 * cn + k]) >> 4); - } - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = dst[k] + (ufixedpoint16(src[idxm2 + k]) >> 4) + (ufixedpoint16(src[idxm1 + k]) >> 2); - dst[k + cn] = dst[k + cn] + (ufixedpoint16(src[idxm1 + k]) >> 4); - } - } - - src += 2 * cn; dst += 2 * cn; - int i = 2 * cn, lencn = (len - 2)*cn; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; - v_uint16 v_6 = vx_setall_u16(6); - for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, (v_mul_wrap(vx_load_expand(src), v_6) + ((vx_load_expand(src - cn) + vx_load_expand(src + cn)) << 2) + vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn)) << 4); -#endif - for (; i < lencn; i++, src++, dst++) - *((uint16_t*)dst) = (uint16_t(src[0]) * 6 + ((uint16_t(src[-cn]) + uint16_t(src[cn])) << 2) + uint16_t(src[-2 * cn]) + uint16_t(src[2 * cn])) << 4; - - // Points that fall right from border - for (int k = 0; k < cn; k++) - { - dst[k] = (ufixedpoint16(src[k]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k - cn]) >> 2) + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k - 2 * cn]) >> 4); - dst[k + cn] = (ufixedpoint16(src[k + cn]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[k - cn]) >> 4); - } - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxp1 = (borderInterpolate(len, len, borderType) - (len - 2))*cn; - int idxp2 = (borderInterpolate(len + 1, len, borderType) - (len - 2))*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = dst[k] + (ufixedpoint16(src[idxp1 + k]) >> 4); - dst[k + cn] = dst[k + cn] + (ufixedpoint16(src[idxp1 + k]) >> 2) + (ufixedpoint16(src[idxp2 + k]) >> 4); - } - } - } -} -template -void hlineSmooth5Nabcba(const ET* src, int cn, const FT* m, int, FT* dst, int len, int borderType) -{ - if (len == 1) - { - FT msum = borderType != BORDER_CONSTANT ? 
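// Every weight of the 1-4-6-4-1 binomial kernel is reachable with shifts plus
// one small multiply, and the weights sum to 16, so the uint8_t path above can
// produce the Q8.8 result with a single final shift: sum << 4 == (sum / 16) * 256.
// Scalar sketch of the interior step (hypothetical helper):
#include <cstdint>
static inline uint16_t binom5_q88(const uint8_t* s, int cn)
{
    uint32_t sum = 6u * s[0] + (((uint32_t)s[-cn] + s[cn]) << 2)
                 + s[-2 * cn] + s[2 * cn];   // at most 16 * 255 = 4080
    return (uint16_t)(sum << 4);             // Q8.8 average, matching "... << 4" above
}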
((m[0] + m[1])<<1) + m[2] : m[2]; - for (int k = 0; k < cn; k++) - dst[k] = msum * src[k]; - } - else if (len == 2) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[1] * src[k + cn]; - dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn]; - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(2, len, borderType)*cn; - int idxp2 = borderInterpolate(3, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = m[1] * src[k + idxm1] + m[2] * src[k] + m[1] * src[k + cn] + m[0] * src[k + idxp1] + m[0] * src[k + idxm2]; - dst[k + cn] = m[0] * src[k + idxm1] + m[1] * src[k] + m[2] * src[k + cn] + m[1] * src[k + idxp1] + m[0] * src[k + idxp2]; - } - } - } - else if (len == 3) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[1] * src[k + cn] + m[0] * src[k + 2 * cn]; - dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn] + m[1] * src[k + 2 * cn]; - dst[k + 2 * cn] = m[0] * src[k] + m[1] * src[k + cn] + m[2] * src[k + 2 * cn]; - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(3, len, borderType)*cn; - int idxp2 = borderInterpolate(4, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[1] * src[k + cn] + m[0] * src[k + 2 * cn] + m[0] * src[k + idxm2] + m[1] * src[k + idxm1]; - dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn] + m[1] * src[k + 2 * cn] + m[0] * src[k + idxm1] + m[0] * src[k + idxp1]; - dst[k + 2 * cn] = m[0] * src[k] + m[1] * src[k + cn] + m[2] * src[k + 2 * cn] + m[1] * src[k + idxp1] + m[0] * src[k + idxp2]; - } - } - } - else - { - // Points that fall left from border - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[1] * src[cn + k] + m[0] * src[2 * cn + k]; - dst[k + cn] = m[1] * src[k] + m[2] * src[cn + k] + m[1] * src[2 * cn + k] + m[0] * src[3 * cn + k]; - } - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = dst[k] + m[0] * src[idxm2 + k] + m[1] * src[idxm1 + k]; - dst[k + cn] = dst[k + cn] + m[0] * src[idxm1 + k]; - } - } - - src += 2 * cn; dst += 2 * cn; - for (int i = 2 * cn; i < (len - 2)*cn; i++, src++, dst++) - *dst = m[0] * src[-2 * cn] + m[1] * src[-cn] + m[2] * src[0] + m[3] * src[cn] + m[4] * src[2 * cn]; - - // Points that fall right from border - for (int k = 0; k < cn; k++) - { - dst[k] = m[0] * src[k - 2 * cn] + m[1] * src[k - cn] + m[2] * src[k] + m[3] * src[k + cn]; - dst[k + cn] = m[0] * src[k - cn] + m[1] * src[k] + m[2] * src[k + cn]; - } - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxp1 = (borderInterpolate(len, len, borderType) - (len - 2))*cn; - int idxp2 = (borderInterpolate(len + 1, len, borderType) - (len - 2))*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = dst[k] + m[0] * src[idxp1 + k]; - dst[k + cn] = dst[k + cn] + m[1] * src[idxp1 + k] + m[0] * src[idxp2 + k]; - } - } - } -} -template <> -void hlineSmooth5Nabcba(const uint8_t* src, int cn, const ufixedpoint16* m, int, ufixedpoint16* dst, int len, int borderType) -{ - if (len == 1) - { - ufixedpoint16 msum = borderType 
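// A symmetric a-b-c-b-a kernel has only three distinct coefficients; the
// uint8_t specialization below therefore adds each mirrored pair of samples
// first and multiplies once per coefficient. Scalar sketch of that grouping
// for the interior (hypothetical helper; a, b, c are Q8.8 with 2a + 2b + c == 256):
#include <cstdint>
static inline uint16_t rowSmooth5_abcba_q88(const uint8_t* src, int cn,
                                            uint16_t a, uint16_t b, uint16_t c)
{
    return (uint16_t)(c * src[0]
                    + b * (uint16_t)((uint16_t)src[-cn] + src[cn])
                    + a * (uint16_t)((uint16_t)src[-2 * cn] + src[2 * cn]));
}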
!= BORDER_CONSTANT ? ((m[0] + m[1]) << 1) + m[2] : m[2]; - for (int k = 0; k < cn; k++) - dst[k] = msum * src[k]; - } - else if (len == 2) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[1] * src[k + cn]; - dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn]; - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(2, len, borderType)*cn; - int idxp2 = borderInterpolate(3, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - ((uint16_t*)dst)[k] = ((uint16_t*)m)[1] * ((uint16_t)(src[k + idxm1]) + (uint16_t)(src[k + cn])) + ((uint16_t*)m)[2] * src[k] + ((uint16_t*)m)[0] * ((uint16_t)(src[k + idxp1]) + (uint16_t)(src[k + idxm2])); - ((uint16_t*)dst)[k + cn] = ((uint16_t*)m)[0] * ((uint16_t)(src[k + idxm1]) + (uint16_t)(src[k + idxp2])) + ((uint16_t*)m)[1] * ((uint16_t)(src[k]) + (uint16_t)(src[k + idxp1])) + ((uint16_t*)m)[2] * src[k + cn]; - } - } - } - else if (len == 3) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[1] * src[k + cn] + m[0] * src[k + 2 * cn]; - ((uint16_t*)dst)[k + cn] = ((uint16_t*)m)[1] * ((uint16_t)(src[k]) + (uint16_t)(src[k + 2 * cn])) + ((uint16_t*)m)[2] * src[k + cn]; - dst[k + 2 * cn] = m[0] * src[k] + m[1] * src[k + cn] + m[2] * src[k + 2 * cn]; - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(3, len, borderType)*cn; - int idxp2 = borderInterpolate(4, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - ((uint16_t*)dst)[k] = ((uint16_t*)m)[2] * src[k] + ((uint16_t*)m)[1] * ((uint16_t)(src[k + cn]) + (uint16_t)(src[k + idxm1])) + ((uint16_t*)m)[0] * ((uint16_t)(src[k + 2 * cn]) + (uint16_t)(src[k + idxm2])); - ((uint16_t*)dst)[k + cn] = ((uint16_t*)m)[2] * src[k + cn] + ((uint16_t*)m)[1] * ((uint16_t)(src[k]) + (uint16_t)(src[k + 2 * cn])) + ((uint16_t*)m)[0] * ((uint16_t)(src[k + idxm1]) + (uint16_t)(src[k + idxp1])); - ((uint16_t*)dst)[k + 2 * cn] = ((uint16_t*)m)[0] * ((uint16_t)(src[k]) + (uint16_t)(src[k + idxp2])) + ((uint16_t*)m)[1] * ((uint16_t)(src[k + cn]) + (uint16_t)(src[k + idxp1])) + ((uint16_t*)m)[2] * src[k + 2 * cn]; - } - } - } - else - { - // Points that fall left from border - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - ((uint16_t*)dst)[k] = ((uint16_t*)m)[2] * src[k] + ((uint16_t*)m)[1] * ((uint16_t)(src[cn + k]) + (uint16_t)(src[idxm1 + k])) + ((uint16_t*)m)[0] * ((uint16_t)(src[2 * cn + k]) + (uint16_t)(src[idxm2 + k])); - ((uint16_t*)dst)[k + cn] = ((uint16_t*)m)[1] * ((uint16_t)(src[k]) + (uint16_t)(src[2 * cn + k])) + ((uint16_t*)m)[2] * src[cn + k] + ((uint16_t*)m)[0] * ((uint16_t)(src[3 * cn + k]) + (uint16_t)(src[idxm1 + k])); - } - } - else - { - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[1] * src[cn + k] + m[0] * src[2 * cn + k]; - ((uint16_t*)dst)[k + cn] = ((uint16_t*)m)[1] * ((uint16_t)(src[k]) + (uint16_t)(src[2 * cn + k])) + ((uint16_t*)m)[2] * src[cn + k] + ((uint16_t*)m)[0] * src[3 * cn + k]; - } - } - - src += 2 * cn; dst += 2 * cn; - int i = 2 * cn, lencn = (len - 2)*cn; -#if CV_SIMD - const uint16_t* _m = (const uint16_t*)m; - const int VECSZ = 
v_uint16::nlanes; - v_uint16 v_mul0 = vx_setall_u16(_m[0]); - v_uint16 v_mul1 = vx_setall_u16(_m[1]); - v_uint16 v_mul2 = vx_setall_u16(_m[2]); - for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn), v_mul0) + - v_mul_wrap(vx_load_expand(src - cn) + vx_load_expand(src + cn), v_mul1) + - v_mul_wrap(vx_load_expand(src), v_mul2)); -#endif - for (; i < lencn; i++, src++, dst++) - *((uint16_t*)dst) = ((uint16_t*)m)[0] * ((uint16_t)(src[-2 * cn]) + (uint16_t)(src[2 * cn])) + ((uint16_t*)m)[1] * ((uint16_t)(src[-cn]) + (uint16_t)(src[cn])) + ((uint16_t*)m)[2] * src[0]; - - // Points that fall right from border - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxp1 = (borderInterpolate(len, len, borderType) - (len - 2))*cn; - int idxp2 = (borderInterpolate(len + 1, len, borderType) - (len - 2))*cn; - for (int k = 0; k < cn; k++) - { - ((uint16_t*)dst)[k] = ((uint16_t*)m)[0] * ((uint16_t)(src[k - 2 * cn]) + (uint16_t)(src[idxp1 + k])) + ((uint16_t*)m)[1] * ((uint16_t)(src[k - cn]) + (uint16_t)(src[k + cn])) + ((uint16_t*)m)[2] * src[k]; - ((uint16_t*)dst)[k + cn] = ((uint16_t*)m)[0] * ((uint16_t)(src[k - cn]) + (uint16_t)(src[idxp2 + k])) + ((uint16_t*)m)[1] * ((uint16_t)(src[k]) + (uint16_t)(src[idxp1 + k])) + ((uint16_t*)m)[2] * src[k + cn]; - } - } - else - { - for (int k = 0; k < cn; k++) - { - ((uint16_t*)dst)[k] = ((uint16_t*)m)[0] * src[k - 2 * cn] + ((uint16_t*)m)[1] * ((uint16_t)(src[k - cn]) + (uint16_t)(src[k + cn])) + ((uint16_t*)m)[2] * src[k]; - dst[k + cn] = m[0] * src[k - cn] + m[1] * src[k] + m[2] * src[k + cn]; - } - } - } -} -template -void hlineSmooth(const ET* src, int cn, const FT* m, int n, FT* dst, int len, int borderType) -{ - int pre_shift = n / 2; - int post_shift = n - pre_shift; - int i = 0; - for (; i < min(pre_shift, len); i++, dst += cn) // Points that fall left from border - { - for (int k = 0; k < cn; k++) - dst[k] = m[pre_shift-i] * src[k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - for (int j = i - pre_shift, mid = 0; j < 0; j++, mid++) - { - int src_idx = borderInterpolate(j, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; - } - int j, mid; - for (j = 1, mid = pre_shift - i + 1; j < min(i + post_shift, len); j++, mid++) - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[j*cn + k]; - if (borderType != BORDER_CONSTANT) - for (; j < i + post_shift; j++, mid++) - { - int src_idx = borderInterpolate(j, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; - } - } - i *= cn; - for (; i < (len - post_shift + 1)*cn; i++, src++, dst++) - { - *dst = m[0] * src[0]; - for (int j = 1; j < n; j++) - *dst = *dst + m[j] * src[j*cn]; - } - i /= cn; - for (i -= pre_shift; i < len - pre_shift; i++, src += cn, dst += cn) // Points that fall right from border - { - for (int k = 0; k < cn; k++) - dst[k] = m[0] * src[k]; - int j = 1; - for (; j < len - i; j++) - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[j] * src[j*cn + k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - for (; j < n; j++) - { - int src_idx = borderInterpolate(i + j, len, borderType) - i; - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[j] * src[src_idx*cn + k]; - } 
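// The generic hlineSmooth above covers any kernel length n in three phases
// (left border, interior, right border). In index form the interior computes,
// with pre_shift = n / 2:
//   dst[i] = sum_{j=0}^{n-1} m[j] * src[(i - pre_shift + j) * cn]
// which the pointer-advancing loop realizes after src has been moved to
// (i - pre_shift) * cn. An equivalent index-based sketch (hypothetical):
template <typename FT, typename ET>
static void rowSmoothInterior(const ET* src, int cn, const FT* m, int n,
                              FT* dst, int i0, int i1, int pre_shift)
{
    for (int i = i0; i < i1; ++i)
        for (int k = 0; k < cn; ++k)
        {
            FT acc = m[0] * src[(i - pre_shift) * cn + k];
            for (int j = 1; j < n; ++j)
                acc = acc + m[j] * src[(i - pre_shift + j) * cn + k];
            dst[i * cn + k] = acc;
        }
}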
- } -} -template <> -void hlineSmooth(const uint8_t* src, int cn, const ufixedpoint16* m, int n, ufixedpoint16* dst, int len, int borderType) -{ - int pre_shift = n / 2; - int post_shift = n - pre_shift; - int i = 0; - for (; i < min(pre_shift, len); i++, dst += cn) // Points that fall left from border - { - for (int k = 0; k < cn; k++) - dst[k] = m[pre_shift - i] * src[k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - for (int j = i - pre_shift, mid = 0; j < 0; j++, mid++) - { - int src_idx = borderInterpolate(j, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; - } - int j, mid; - for (j = 1, mid = pre_shift - i + 1; j < min(i + post_shift, len); j++, mid++) - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[j*cn + k]; - if (borderType != BORDER_CONSTANT) - for (; j < i + post_shift; j++, mid++) - { - int src_idx = borderInterpolate(j, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; - } - } - i *= cn; - int lencn = (len - post_shift + 1)*cn; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; - for (; i <= lencn - VECSZ; i+=VECSZ, src+=VECSZ, dst+=VECSZ) - { - v_uint16 v_res0 = v_mul_wrap(vx_load_expand(src), vx_setall_u16(*((uint16_t*)m))); - for (int j = 1; j < n; j++) - v_res0 += v_mul_wrap(vx_load_expand(src + j * cn), vx_setall_u16(*((uint16_t*)(m + j)))); - v_store((uint16_t*)dst, v_res0); - } -#endif - for (; i < lencn; i++, src++, dst++) - { - *dst = m[0] * src[0]; - for (int j = 1; j < n; j++) - *dst = *dst + m[j] * src[j*cn]; - } - i /= cn; - for (i -= pre_shift; i < len - pre_shift; i++, src += cn, dst += cn) // Points that fall right from border - { - for (int k = 0; k < cn; k++) - dst[k] = m[0] * src[k]; - int j = 1; - for (; j < len - i; j++) - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[j] * src[j*cn + k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - for (; j < n; j++) - { - int src_idx = borderInterpolate(i + j, len, borderType) - i; - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[j] * src[src_idx*cn + k]; - } - } -} -template -void hlineSmoothONa_yzy_a(const ET* src, int cn, const FT* m, int n, FT* dst, int len, int borderType) -{ - int pre_shift = n / 2; - int post_shift = n - pre_shift; - int i = 0; - for (; i < min(pre_shift, len); i++, dst += cn) // Points that fall left from border - { - for (int k = 0; k < cn; k++) - dst[k] = m[pre_shift - i] * src[k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - for (int j = i - pre_shift, mid = 0; j < 0; j++, mid++) - { - int src_idx = borderInterpolate(j, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; - } - int j, mid; - for (j = 1, mid = pre_shift - i + 1; j < min(i + post_shift, len); j++, mid++) - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[j*cn + k]; - if (borderType != BORDER_CONSTANT) - for (; j < i + post_shift; j++, mid++) - { - int src_idx = borderInterpolate(j, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; - } - } - i *= cn; - for (; i < (len - post_shift + 1)*cn; i++, src++, dst++) - { - *dst = m[pre_shift] * src[pre_shift*cn]; - for (int j = 0; j < pre_shift; j++) - *dst = *dst + m[j] * src[j*cn] + m[j] * src[(n-1-j)*cn]; - } - i /= cn; - for (i -= 
pre_shift; i < len - pre_shift; i++, src += cn, dst += cn) // Points that fall right from border - { - for (int k = 0; k < cn; k++) - dst[k] = m[0] * src[k]; - int j = 1; - for (; j < len - i; j++) - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[j] * src[j*cn + k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - for (; j < n; j++) - { - int src_idx = borderInterpolate(i + j, len, borderType) - i; - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[j] * src[src_idx*cn + k]; - } - } -} -template <> -void hlineSmoothONa_yzy_a(const uint8_t* src, int cn, const ufixedpoint16* m, int n, ufixedpoint16* dst, int len, int borderType) -{ - int pre_shift = n / 2; - int post_shift = n - pre_shift; - int i = 0; - for (; i < min(pre_shift, len); i++, dst += cn) // Points that fall left from border - { - for (int k = 0; k < cn; k++) - dst[k] = m[pre_shift - i] * src[k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - for (int j = i - pre_shift, mid = 0; j < 0; j++, mid++) - { - int src_idx = borderInterpolate(j, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; - } - int j, mid; - for (j = 1, mid = pre_shift - i + 1; j < min(i + post_shift, len); j++, mid++) - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[j*cn + k]; - if (borderType != BORDER_CONSTANT) - for (; j < i + post_shift; j++, mid++) - { - int src_idx = borderInterpolate(j, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; - } - } - i *= cn; - int lencn = (len - post_shift + 1)*cn; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; - for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - { - v_uint16 v_res0 = v_mul_wrap(vx_load_expand(src + pre_shift * cn), vx_setall_u16(*((uint16_t*)(m + pre_shift)))); - for (int j = 0; j < pre_shift; j ++) - v_res0 += v_mul_wrap(vx_load_expand(src + j * cn) + vx_load_expand(src + (n - 1 - j)*cn), vx_setall_u16(*((uint16_t*)(m + j)))); - v_store((uint16_t*)dst, v_res0); - } -#endif - for (; i < lencn; i++, src++, dst++) - { - *dst = m[pre_shift] * src[pre_shift*cn]; - for (int j = 0; j < pre_shift; j++) - *dst = *dst + m[j] * src[j*cn] + m[j] * src[(n - 1 - j)*cn]; - } - i /= cn; - for (i -= pre_shift; i < len - pre_shift; i++, src += cn, dst += cn) // Points that fall right from border - { - for (int k = 0; k < cn; k++) - dst[k] = m[0] * src[k]; - int j = 1; - for (; j < len - i; j++) - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[j] * src[j*cn + k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - for (; j < n; j++) - { - int src_idx = borderInterpolate(i + j, len, borderType) - i; - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[j] * src[src_idx*cn + k]; - } - } -} -template -void vlineSmooth1N(const FT* const * src, const FT* m, int, ET* dst, int len) -{ - const FT* src0 = src[0]; - for (int i = 0; i < len; i++) - dst[i] = *m * src0[i]; -} -template <> -void vlineSmooth1N(const ufixedpoint16* const * src, const ufixedpoint16* m, int, uint8_t* dst, int len) -{ - const ufixedpoint16* src0 = src[0]; - int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; - v_uint16 v_mul = vx_setall_u16(*((uint16_t*)m)<<1); - for (; i <= len - VECSZ; i += VECSZ) - v_rshr_pack_store<1>(dst + i, v_mul_hi(vx_load((uint16_t*)src0 + i), v_mul)); -#endif - for (; i < len; 
i++) - dst[i] = m[0] * src0[i]; -} -template -void vlineSmooth1N1(const FT* const * src, const FT*, int, ET* dst, int len) -{ - const FT* src0 = src[0]; - for (int i = 0; i < len; i++) - dst[i] = src0[i]; -} -template <> -void vlineSmooth1N1(const ufixedpoint16* const * src, const ufixedpoint16*, int, uint8_t* dst, int len) -{ - const ufixedpoint16* src0 = src[0]; - int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; - for (; i <= len - VECSZ; i += VECSZ) - v_rshr_pack_store<8>(dst + i, vx_load((uint16_t*)(src0 + i))); -#endif - for (; i < len; i++) - dst[i] = src0[i]; -} -template -void vlineSmooth3N(const FT* const * src, const FT* m, int, ET* dst, int len) -{ - for (int i = 0; i < len; i++) - dst[i] = m[0] * src[0][i] + m[1] * src[1][i] + m[2] * src[2][i]; -} -template <> -void vlineSmooth3N(const ufixedpoint16* const * src, const ufixedpoint16* m, int, uint8_t* dst, int len) -{ - int i = 0; -#if CV_SIMD - static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); - v_int32 v_128_4 = vx_setall_s32(128 << 16); - const int VECSZ = v_uint16::nlanes; - if (len >= VECSZ) - { - ufixedpoint32 val[] = { (m[0] + m[1] + m[2]) * ufixedpoint16((uint8_t)128) }; - v_128_4 = vx_setall_s32(*((int32_t*)val)); - } - v_int16 v_mul01 = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)m))); - v_int16 v_mul2 = v_reinterpret_as_s16(vx_setall_u16(*((uint16_t*)(m + 2)))); - for (; i <= len - 4*VECSZ; i += 4*VECSZ) - { - v_int16 v_src00, v_src10, v_src01, v_src11, v_src02, v_src12, v_src03, v_src13; - v_int16 v_tmp0, v_tmp1; - - const int16_t* src0 = (const int16_t*)src[0] + i; - const int16_t* src1 = (const int16_t*)src[1] + i; - v_src00 = vx_load(src0); - v_src01 = vx_load(src0 + VECSZ); - v_src02 = vx_load(src0 + 2*VECSZ); - v_src03 = vx_load(src0 + 3*VECSZ); - v_src10 = vx_load(src1); - v_src11 = vx_load(src1 + VECSZ); - v_src12 = vx_load(src1 + 2*VECSZ); - v_src13 = vx_load(src1 + 3*VECSZ); - v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1); - v_int32 v_res0 = v_dotprod(v_tmp0, v_mul01); - v_int32 v_res1 = v_dotprod(v_tmp1, v_mul01); - v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1); - v_int32 v_res2 = v_dotprod(v_tmp0, v_mul01); - v_int32 v_res3 = v_dotprod(v_tmp1, v_mul01); - v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1); - v_int32 v_res4 = v_dotprod(v_tmp0, v_mul01); - v_int32 v_res5 = v_dotprod(v_tmp1, v_mul01); - v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1); - v_int32 v_res6 = v_dotprod(v_tmp0, v_mul01); - v_int32 v_res7 = v_dotprod(v_tmp1, v_mul01); - - v_int32 v_resj0, v_resj1; - const int16_t* src2 = (const int16_t*)src[2] + i; - v_src00 = vx_load(src2); - v_src01 = vx_load(src2 + VECSZ); - v_src02 = vx_load(src2 + 2*VECSZ); - v_src03 = vx_load(src2 + 3*VECSZ); - v_mul_expand(v_add_wrap(v_src00, v_128), v_mul2, v_resj0, v_resj1); - v_res0 += v_resj0; - v_res1 += v_resj1; - v_mul_expand(v_add_wrap(v_src01, v_128), v_mul2, v_resj0, v_resj1); - v_res2 += v_resj0; - v_res3 += v_resj1; - v_mul_expand(v_add_wrap(v_src02, v_128), v_mul2, v_resj0, v_resj1); - v_res4 += v_resj0; - v_res5 += v_resj1; - v_mul_expand(v_add_wrap(v_src03, v_128), v_mul2, v_resj0, v_resj1); - v_res6 += v_resj0; - v_res7 += v_resj1; - - v_res0 += v_128_4; - v_res1 += v_128_4; - v_res2 += v_128_4; - v_res3 += v_128_4; - v_res4 += v_128_4; - v_res5 += v_128_4; - v_res6 += v_128_4; - v_res7 += v_128_4; - - v_store(dst + i , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, 
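// vlineSmooth1N above converts one Q8.8 row back to 8 bits with rounding:
// v_mul_hi keeps the high 16 bits of the u16 product (a truncating >> 16), so
// the coefficient is pre-doubled and v_rshr_pack<1> restores the scale while
// rounding the bit v_mul_hi discarded; vlineSmooth1N1 is the identity case
// and needs only the rounding Q8.8 -> u8 shift v_rshr_pack<8>. Scalar sketch,
// assuming the Q8.8 coefficient m is below 32768 so m << 1 does not overflow:
#include <cstdint>
static inline uint8_t scale1N_q88(uint16_t s /* Q8.8 */, uint16_t m /* Q8.8 */)
{
    uint16_t hi = (uint16_t)(((uint32_t)s * (uint16_t)(m << 1)) >> 16); // v_mul_hi
    return (uint8_t)((hi + 1) >> 1);                                    // v_rshr_pack<1>
}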
v_res1)), - v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3)))); - v_store(dst + i + 2*VECSZ, v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res4, v_res5)), - v_reinterpret_as_u16(v_rshr_pack<16>(v_res6, v_res7)))); - } -#endif - for (; i < len; i++) - dst[i] = m[0] * src[0][i] + m[1] * src[1][i] + m[2] * src[2][i]; -} -template -void vlineSmooth3N121(const FT* const * src, const FT*, int, ET* dst, int len) -{ - for (int i = 0; i < len; i++) - dst[i] = (FT::WT(src[0][i]) >> 2) + (FT::WT(src[2][i]) >> 2) + (FT::WT(src[1][i]) >> 1); -} -template <> -void vlineSmooth3N121(const ufixedpoint16* const * src, const ufixedpoint16*, int, uint8_t* dst, int len) -{ - int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; - for (; i <= len - 2*VECSZ; i += 2*VECSZ) - { - v_uint32 v_src00, v_src01, v_src02, v_src03, v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23; - v_expand(vx_load((uint16_t*)(src[0]) + i), v_src00, v_src01); - v_expand(vx_load((uint16_t*)(src[0]) + i + VECSZ), v_src02, v_src03); - v_expand(vx_load((uint16_t*)(src[1]) + i), v_src10, v_src11); - v_expand(vx_load((uint16_t*)(src[1]) + i + VECSZ), v_src12, v_src13); - v_expand(vx_load((uint16_t*)(src[2]) + i), v_src20, v_src21); - v_expand(vx_load((uint16_t*)(src[2]) + i + VECSZ), v_src22, v_src23); - v_store(dst + i, v_pack(v_rshr_pack<10>(v_src00 + v_src20 + (v_src10 + v_src10), v_src01 + v_src21 + (v_src11 + v_src11)), - v_rshr_pack<10>(v_src02 + v_src22 + (v_src12 + v_src12), v_src03 + v_src23 + (v_src13 + v_src13)))); - } -#endif - for (; i < len; i++) - dst[i] = (((uint32_t)(((uint16_t*)(src[0]))[i]) + (uint32_t)(((uint16_t*)(src[2]))[i]) + ((uint32_t)(((uint16_t*)(src[1]))[i]) << 1)) + (1 << 9)) >> 10; -} -template -void vlineSmooth5N(const FT* const * src, const FT* m, int, ET* dst, int len) -{ - for (int i = 0; i < len; i++) - dst[i] = m[0] * src[0][i] + m[1] * src[1][i] + m[2] * src[2][i] + m[3] * src[3][i] + m[4] * src[4][i]; -} -template <> -void vlineSmooth5N(const ufixedpoint16* const * src, const ufixedpoint16* m, int, uint8_t* dst, int len) -{ - int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; - if (len >= 4 * VECSZ) - { - ufixedpoint32 val[] = { (m[0] + m[1] + m[2] + m[3] + m[4]) * ufixedpoint16((uint8_t)128) }; - v_int32 v_128_4 = vx_setall_s32(*((int32_t*)val)); - static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); - v_int16 v_mul01 = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)m))); - v_int16 v_mul23 = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)(m + 2)))); - v_int16 v_mul4 = v_reinterpret_as_s16(vx_setall_u16(*((uint16_t*)(m + 4)))); - for (; i <= len - 4*VECSZ; i += 4*VECSZ) - { - v_int16 v_src00, v_src10, v_src01, v_src11, v_src02, v_src12, v_src03, v_src13; - v_int16 v_tmp0, v_tmp1; - - const int16_t* src0 = (const int16_t*)src[0] + i; - const int16_t* src1 = (const int16_t*)src[1] + i; - v_src00 = vx_load(src0); - v_src01 = vx_load(src0 + VECSZ); - v_src02 = vx_load(src0 + 2*VECSZ); - v_src03 = vx_load(src0 + 3*VECSZ); - v_src10 = vx_load(src1); - v_src11 = vx_load(src1 + VECSZ); - v_src12 = vx_load(src1 + 2*VECSZ); - v_src13 = vx_load(src1 + 3*VECSZ); - v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1); - v_int32 v_res0 = v_dotprod(v_tmp0, v_mul01); - v_int32 v_res1 = v_dotprod(v_tmp1, v_mul01); - v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1); - v_int32 v_res2 = v_dotprod(v_tmp0, v_mul01); - v_int32 v_res3 = v_dotprod(v_tmp1, v_mul01); - v_zip(v_add_wrap(v_src02, 
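// v_dotprod exists only for signed 16-bit lanes, so vlineSmooth3N above (and
// vlineSmooth5N, continued below) shifts each unsigned Q8.8 row value into
// signed range with v_add_wrap(x, 1 << 15) and compensates afterwards: the
// constant v_128_4 equals 32768 * sum(m), i.e. sum(m) * ufixedpoint16(128).
// Scalar model of one output lane (hypothetical helper):
#include <cstdint>
static inline uint8_t vsmooth3_q88(const uint16_t s[3], const uint16_t m[3])
{
    int32_t acc = 0, bias = 0;
    for (int j = 0; j < 3; ++j)
    {
        // u16 -> s16 by flipping into signed range (the v_add_wrap step)
        acc  += (int32_t)(int16_t)(uint16_t)(s[j] - 32768) * (int16_t)m[j];
        bias += 32768 * (int32_t)m[j];          // undone here, via v_128_4 above
    }
    return (uint8_t)((acc + bias + (1 << 15)) >> 16); // v_rshr_pack<16>, rounds
}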
v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1); - v_int32 v_res4 = v_dotprod(v_tmp0, v_mul01); - v_int32 v_res5 = v_dotprod(v_tmp1, v_mul01); - v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1); - v_int32 v_res6 = v_dotprod(v_tmp0, v_mul01); - v_int32 v_res7 = v_dotprod(v_tmp1, v_mul01); - - const int16_t* src2 = (const int16_t*)src[2] + i; - const int16_t* src3 = (const int16_t*)src[3] + i; - v_src00 = vx_load(src2); - v_src01 = vx_load(src2 + VECSZ); - v_src02 = vx_load(src2 + 2*VECSZ); - v_src03 = vx_load(src2 + 3*VECSZ); - v_src10 = vx_load(src3); - v_src11 = vx_load(src3 + VECSZ); - v_src12 = vx_load(src3 + 2*VECSZ); - v_src13 = vx_load(src3 + 3*VECSZ); - v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1); - v_res0 += v_dotprod(v_tmp0, v_mul23); - v_res1 += v_dotprod(v_tmp1, v_mul23); - v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1); - v_res2 += v_dotprod(v_tmp0, v_mul23); - v_res3 += v_dotprod(v_tmp1, v_mul23); - v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1); - v_res4 += v_dotprod(v_tmp0, v_mul23); - v_res5 += v_dotprod(v_tmp1, v_mul23); - v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1); - v_res6 += v_dotprod(v_tmp0, v_mul23); - v_res7 += v_dotprod(v_tmp1, v_mul23); - - v_int32 v_resj0, v_resj1; - const int16_t* src4 = (const int16_t*)src[4] + i; - v_src00 = vx_load(src4); - v_src01 = vx_load(src4 + VECSZ); - v_src02 = vx_load(src4 + 2*VECSZ); - v_src03 = vx_load(src4 + 3*VECSZ); - v_mul_expand(v_add_wrap(v_src00, v_128), v_mul4, v_resj0, v_resj1); - v_res0 += v_resj0; - v_res1 += v_resj1; - v_mul_expand(v_add_wrap(v_src01, v_128), v_mul4, v_resj0, v_resj1); - v_res2 += v_resj0; - v_res3 += v_resj1; - v_mul_expand(v_add_wrap(v_src02, v_128), v_mul4, v_resj0, v_resj1); - v_res4 += v_resj0; - v_res5 += v_resj1; - v_mul_expand(v_add_wrap(v_src03, v_128), v_mul4, v_resj0, v_resj1); - v_res6 += v_resj0; - v_res7 += v_resj1; - - v_res0 += v_128_4; - v_res1 += v_128_4; - v_res2 += v_128_4; - v_res3 += v_128_4; - v_res4 += v_128_4; - v_res5 += v_128_4; - v_res6 += v_128_4; - v_res7 += v_128_4; - - v_store(dst + i , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)), - v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3)))); - v_store(dst + i + 2*VECSZ, v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res4, v_res5)), - v_reinterpret_as_u16(v_rshr_pack<16>(v_res6, v_res7)))); - } - } -#endif - for (; i < len; i++) - dst[i] = m[0] * src[0][i] + m[1] * src[1][i] + m[2] * src[2][i] + m[3] * src[3][i] + m[4] * src[4][i]; -} -template -void vlineSmooth5N14641(const FT* const * src, const FT*, int, ET* dst, int len) -{ - for (int i = 0; i < len; i++) - dst[i] = (FT::WT(src[2][i])*(uint8_t)6 + ((FT::WT(src[1][i]) + FT::WT(src[3][i]))<<2) + FT::WT(src[0][i]) + FT::WT(src[4][i])) >> 4; -} -template <> -void vlineSmooth5N14641(const ufixedpoint16* const * src, const ufixedpoint16*, int, uint8_t* dst, int len) -{ - int i = 0; -#if CV_SIMD - v_uint32 v_6 = vx_setall_u32(6); - const int VECSZ = v_uint16::nlanes; - for (; i <= len - 2*VECSZ; i += 2*VECSZ) - { - v_uint32 v_src00, v_src10, v_src20, v_src30, v_src40; - v_uint32 v_src01, v_src11, v_src21, v_src31, v_src41; - v_uint32 v_src02, v_src12, v_src22, v_src32, v_src42; - v_uint32 v_src03, v_src13, v_src23, v_src33, v_src43; - v_expand(vx_load((uint16_t*)(src[0]) + i), v_src00, v_src01); - v_expand(vx_load((uint16_t*)(src[0]) + i + VECSZ), v_src02, v_src03); - v_expand(vx_load((uint16_t*)(src[1]) 
+ i), v_src10, v_src11); - v_expand(vx_load((uint16_t*)(src[1]) + i + VECSZ), v_src12, v_src13); - v_expand(vx_load((uint16_t*)(src[2]) + i), v_src20, v_src21); - v_expand(vx_load((uint16_t*)(src[2]) + i + VECSZ), v_src22, v_src23); - v_expand(vx_load((uint16_t*)(src[3]) + i), v_src30, v_src31); - v_expand(vx_load((uint16_t*)(src[3]) + i + VECSZ), v_src32, v_src33); - v_expand(vx_load((uint16_t*)(src[4]) + i), v_src40, v_src41); - v_expand(vx_load((uint16_t*)(src[4]) + i + VECSZ), v_src42, v_src43); - v_store(dst + i, v_pack(v_rshr_pack<12>(v_src20*v_6 + ((v_src10 + v_src30) << 2) + v_src00 + v_src40, - v_src21*v_6 + ((v_src11 + v_src31) << 2) + v_src01 + v_src41), - v_rshr_pack<12>(v_src22*v_6 + ((v_src12 + v_src32) << 2) + v_src02 + v_src42, - v_src23*v_6 + ((v_src13 + v_src33) << 2) + v_src03 + v_src43))); - } -#endif - for (; i < len; i++) - dst[i] = ((uint32_t)(((uint16_t*)(src[2]))[i]) * 6 + - (((uint32_t)(((uint16_t*)(src[1]))[i]) + (uint32_t)(((uint16_t*)(src[3]))[i])) << 2) + - (uint32_t)(((uint16_t*)(src[0]))[i]) + (uint32_t)(((uint16_t*)(src[4]))[i]) + (1 << 11)) >> 12; -} -template -void vlineSmooth(const FT* const * src, const FT* m, int n, ET* dst, int len) -{ - for (int i = 0; i < len; i++) - { - typename FT::WT val = m[0] * src[0][i]; - for (int j = 1; j < n; j++) - val = val + m[j] * src[j][i]; - dst[i] = val; - } -} -template <> -void vlineSmooth(const ufixedpoint16* const * src, const ufixedpoint16* m, int n, uint8_t* dst, int len) -{ - int i = 0; -#if CV_SIMD - static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); - v_int32 v_128_4 = vx_setall_s32(128 << 16); - const int VECSZ = v_uint16::nlanes; - if (len >= VECSZ) - { - ufixedpoint16 msum = m[0] + m[1]; - for (int j = 2; j < n; j++) - msum = msum + m[j]; - ufixedpoint32 val[] = { msum * ufixedpoint16((uint8_t)128) }; - v_128_4 = vx_setall_s32(*((int32_t*)val)); - } - for (; i <= len - 4*VECSZ; i += 4*VECSZ) - { - v_int16 v_src00, v_src10, v_src01, v_src11, v_src02, v_src12, v_src03, v_src13; - v_int16 v_tmp0, v_tmp1; - - v_int16 v_mul = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)m))); - - const int16_t* src0 = (const int16_t*)src[0] + i; - const int16_t* src1 = (const int16_t*)src[1] + i; - v_src00 = vx_load(src0); - v_src01 = vx_load(src0 + VECSZ); - v_src02 = vx_load(src0 + 2*VECSZ); - v_src03 = vx_load(src0 + 3*VECSZ); - v_src10 = vx_load(src1); - v_src11 = vx_load(src1 + VECSZ); - v_src12 = vx_load(src1 + 2*VECSZ); - v_src13 = vx_load(src1 + 3*VECSZ); - v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1); - v_int32 v_res0 = v_dotprod(v_tmp0, v_mul); - v_int32 v_res1 = v_dotprod(v_tmp1, v_mul); - v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1); - v_int32 v_res2 = v_dotprod(v_tmp0, v_mul); - v_int32 v_res3 = v_dotprod(v_tmp1, v_mul); - v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1); - v_int32 v_res4 = v_dotprod(v_tmp0, v_mul); - v_int32 v_res5 = v_dotprod(v_tmp1, v_mul); - v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1); - v_int32 v_res6 = v_dotprod(v_tmp0, v_mul); - v_int32 v_res7 = v_dotprod(v_tmp1, v_mul); - - int j = 2; - for (; j < n - 1; j+=2) - { - v_mul = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)(m+j)))); - - const int16_t* srcj0 = (const int16_t*)src[j] + i; - const int16_t* srcj1 = (const int16_t*)src[j + 1] + i; - v_src00 = vx_load(srcj0); - v_src01 = vx_load(srcj0 + VECSZ); - v_src02 = vx_load(srcj0 + 2*VECSZ); - v_src03 = vx_load(srcj0 + 
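// The vertical 1-4-6-4-1 pass above widens the Q8.8 rows to 32 bits, so no
// bias trick is needed: the weights sum to 16, the rows carry a factor of 256,
// and one rounding shift by 12 removes the combined 2^12 scale (the scalar
// tail's "+ (1 << 11)) >> 12"). Sketch (hypothetical helper):
#include <cstdint>
static inline uint8_t vbinom5_q88(uint16_t a, uint16_t b, uint16_t c,
                                  uint16_t d, uint16_t e) // five Q8.8 rows
{
    uint32_t acc = 6u * c + (((uint32_t)b + d) << 2) + a + e;
    return (uint8_t)((acc + (1u << 11)) >> 12);
}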
3*VECSZ); - v_src10 = vx_load(srcj1); - v_src11 = vx_load(srcj1 + VECSZ); - v_src12 = vx_load(srcj1 + 2*VECSZ); - v_src13 = vx_load(srcj1 + 3*VECSZ); - v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1); - v_res0 += v_dotprod(v_tmp0, v_mul); - v_res1 += v_dotprod(v_tmp1, v_mul); - v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1); - v_res2 += v_dotprod(v_tmp0, v_mul); - v_res3 += v_dotprod(v_tmp1, v_mul); - v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1); - v_res4 += v_dotprod(v_tmp0, v_mul); - v_res5 += v_dotprod(v_tmp1, v_mul); - v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1); - v_res6 += v_dotprod(v_tmp0, v_mul); - v_res7 += v_dotprod(v_tmp1, v_mul); - } - if(j < n) - { - v_int32 v_resj0, v_resj1; - v_mul = v_reinterpret_as_s16(vx_setall_u16(*((uint16_t*)(m + j)))); - const int16_t* srcj = (const int16_t*)src[j] + i; - v_src00 = vx_load(srcj); - v_src01 = vx_load(srcj + VECSZ); - v_src02 = vx_load(srcj + 2*VECSZ); - v_src03 = vx_load(srcj + 3*VECSZ); - v_mul_expand(v_add_wrap(v_src00, v_128), v_mul, v_resj0, v_resj1); - v_res0 += v_resj0; - v_res1 += v_resj1; - v_mul_expand(v_add_wrap(v_src01, v_128), v_mul, v_resj0, v_resj1); - v_res2 += v_resj0; - v_res3 += v_resj1; - v_mul_expand(v_add_wrap(v_src02, v_128), v_mul, v_resj0, v_resj1); - v_res4 += v_resj0; - v_res5 += v_resj1; - v_mul_expand(v_add_wrap(v_src03, v_128), v_mul, v_resj0, v_resj1); - v_res6 += v_resj0; - v_res7 += v_resj1; - } - v_res0 += v_128_4; - v_res1 += v_128_4; - v_res2 += v_128_4; - v_res3 += v_128_4; - v_res4 += v_128_4; - v_res5 += v_128_4; - v_res6 += v_128_4; - v_res7 += v_128_4; - - v_store(dst + i , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)), - v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3)))); - v_store(dst + i + 2*VECSZ, v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res4, v_res5)), - v_reinterpret_as_u16(v_rshr_pack<16>(v_res6, v_res7)))); - } -#endif - for (; i < len; i++) - { - ufixedpoint32 val = m[0] * src[0][i]; - for (int j = 1; j < n; j++) - { - val = val + m[j] * src[j][i]; - } - dst[i] = val; - } -} -template -void vlineSmoothONa_yzy_a(const FT* const * src, const FT* m, int n, ET* dst, int len) -{ - int pre_shift = n / 2; - for (int i = 0; i < len; i++) - { - typename FT::WT val = m[pre_shift] * src[pre_shift][i]; - for (int j = 0; j < pre_shift; j++) - val = val + m[j] * src[j][i] + m[j] * src[(n - 1 - j)][i]; - dst[i] = val; - } -} -template <> -void vlineSmoothONa_yzy_a(const ufixedpoint16* const * src, const ufixedpoint16* m, int n, uint8_t* dst, int len) -{ - int i = 0; -#if CV_SIMD - int pre_shift = n / 2; - static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); - v_int32 v_128_4 = vx_setall_s32(128 << 16); - const int VECSZ = v_uint16::nlanes; - if (len >= VECSZ) - { - ufixedpoint16 msum = m[0] + m[pre_shift] + m[n - 1]; - for (int j = 1; j < pre_shift; j++) - msum = msum + m[j] + m[n - 1 - j]; - ufixedpoint32 val[] = { msum * ufixedpoint16((uint8_t)128) }; - v_128_4 = vx_setall_s32(*((int32_t*)val)); - } - for (; i <= len - 4*VECSZ; i += 4*VECSZ) - { - v_int16 v_src00, v_src10, v_src20, v_src30, v_src01, v_src11, v_src21, v_src31; - v_int32 v_res0, v_res1, v_res2, v_res3, v_res4, v_res5, v_res6, v_res7; - v_int16 v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4, v_tmp5, v_tmp6, v_tmp7; - - v_int16 v_mul = v_reinterpret_as_s16(vx_setall_u16(*((uint16_t*)(m + pre_shift)))); - const int16_t* srcp = (const int16_t*)src[pre_shift] + i; - 
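// vlineSmoothONa_yzy_a handles any odd, palindromic column kernel
// ("a..y z y..a"): the middle row is multiplied once and each mirrored pair of
// rows shares one coefficient, halving the multiplies. The generic form above
// reduces to this index-only sketch:
template <typename FT, typename ET>
static void colSmoothSym(const FT* const* rows, const FT* m, int n, ET* dst, int len)
{
    const int p = n / 2;
    for (int i = 0; i < len; ++i)
    {
        typename FT::WT val = m[p] * rows[p][i];
        for (int j = 0; j < p; ++j)
            val = val + m[j] * rows[j][i] + m[j] * rows[n - 1 - j][i];
        dst[i] = val;
    }
}
// In the uint8_t SIMD loop below the same pairing appears as one v_dotprod per
// mirrored pair, with the 2 * 32768 * m[j] bias folded into v_128_4 via msum.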
v_src00 = vx_load(srcp); - v_src10 = vx_load(srcp + VECSZ); - v_src20 = vx_load(srcp + 2*VECSZ); - v_src30 = vx_load(srcp + 3*VECSZ); - v_mul_expand(v_add_wrap(v_src00, v_128), v_mul, v_res0, v_res1); - v_mul_expand(v_add_wrap(v_src10, v_128), v_mul, v_res2, v_res3); - v_mul_expand(v_add_wrap(v_src20, v_128), v_mul, v_res4, v_res5); - v_mul_expand(v_add_wrap(v_src30, v_128), v_mul, v_res6, v_res7); - - int j = 0; - for (; j < pre_shift; j++) - { - v_mul = v_reinterpret_as_s16(vx_setall_u16(*((uint16_t*)(m + j)))); - - const int16_t* srcj0 = (const int16_t*)src[j] + i; - const int16_t* srcj1 = (const int16_t*)src[n - 1 - j] + i; - v_src00 = vx_load(srcj0); - v_src10 = vx_load(srcj0 + VECSZ); - v_src20 = vx_load(srcj0 + 2*VECSZ); - v_src30 = vx_load(srcj0 + 3*VECSZ); - v_src01 = vx_load(srcj1); - v_src11 = vx_load(srcj1 + VECSZ); - v_src21 = vx_load(srcj1 + 2*VECSZ); - v_src31 = vx_load(srcj1 + 3*VECSZ); - v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src01, v_128), v_tmp0, v_tmp1); - v_res0 += v_dotprod(v_tmp0, v_mul); - v_res1 += v_dotprod(v_tmp1, v_mul); - v_zip(v_add_wrap(v_src10, v_128), v_add_wrap(v_src11, v_128), v_tmp2, v_tmp3); - v_res2 += v_dotprod(v_tmp2, v_mul); - v_res3 += v_dotprod(v_tmp3, v_mul); - v_zip(v_add_wrap(v_src20, v_128), v_add_wrap(v_src21, v_128), v_tmp4, v_tmp5); - v_res4 += v_dotprod(v_tmp4, v_mul); - v_res5 += v_dotprod(v_tmp5, v_mul); - v_zip(v_add_wrap(v_src30, v_128), v_add_wrap(v_src31, v_128), v_tmp6, v_tmp7); - v_res6 += v_dotprod(v_tmp6, v_mul); - v_res7 += v_dotprod(v_tmp7, v_mul); - } - - v_res0 += v_128_4; - v_res1 += v_128_4; - v_res2 += v_128_4; - v_res3 += v_128_4; - v_res4 += v_128_4; - v_res5 += v_128_4; - v_res6 += v_128_4; - v_res7 += v_128_4; - - v_store(dst + i , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)), - v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3)))); - v_store(dst + i + 2*VECSZ, v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res4, v_res5)), - v_reinterpret_as_u16(v_rshr_pack<16>(v_res6, v_res7)))); - } -#endif - for (; i < len; i++) - { - ufixedpoint32 val = m[0] * src[0][i]; - for (int j = 1; j < n; j++) - { - val = val + m[j] * src[j][i]; - } - dst[i] = val; - } -} -template -class fixedSmoothInvoker : public ParallelLoopBody -{ -public: - fixedSmoothInvoker(const ET* _src, size_t _src_stride, ET* _dst, size_t _dst_stride, - int _width, int _height, int _cn, const FT* _kx, int _kxlen, const FT* _ky, int _kylen, int _borderType) : ParallelLoopBody(), - src(_src), dst(_dst), src_stride(_src_stride), dst_stride(_dst_stride), - width(_width), height(_height), cn(_cn), kx(_kx), ky(_ky), kxlen(_kxlen), kylen(_kylen), borderType(_borderType) - { - if (kxlen == 1) - { - if (kx[0] == FT::one()) - hlineSmoothFunc = hlineSmooth1N1; - else - hlineSmoothFunc = hlineSmooth1N; - } - else if (kxlen == 3) - { - if (kx[0] == (FT::one()>>2)&&kx[1] == (FT::one()>>1)&&kx[2] == (FT::one()>>2)) - hlineSmoothFunc = hlineSmooth3N121; - else if ((kx[0] - kx[2]).isZero()) - hlineSmoothFunc = hlineSmooth3Naba; - else - hlineSmoothFunc = hlineSmooth3N; - } - else if (kxlen == 5) - { - if (kx[2] == (FT::one()*(uint8_t)3>>3) && - kx[1] == (FT::one()>>2) && kx[3] == (FT::one()>>2) && - kx[0] == (FT::one()>>4) && kx[4] == (FT::one()>>4)) - hlineSmoothFunc = hlineSmooth5N14641; - else if (kx[0] == kx[4] && kx[1] == kx[3]) - hlineSmoothFunc = hlineSmooth5Nabcba; - else - hlineSmoothFunc = hlineSmooth5N; - } - else if (kxlen % 2 == 1) - { - hlineSmoothFunc = hlineSmoothONa_yzy_a; - for (int i = 0; i < kxlen / 2; i++) - if (!(kx[i] == kx[kxlen - 
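// The fixedSmoothInvoker constructor walks a specialization ladder over the
// fixed-point coefficients and binds the most specialized row routine (the
// column side, below, is analogous); FT::one() is 256 in Q8.8:
//   kxlen == 1 : m == one                        -> hlineSmooth1N1, else hlineSmooth1N
//   kxlen == 3 : (1/4, 1/2, 1/4) exactly         -> hlineSmooth3N121
//                m[0] == m[2]                    -> hlineSmooth3Naba, else hlineSmooth3N
//   kxlen == 5 : (1, 4, 6, 4, 1) / 16 exactly    -> hlineSmooth5N14641
//                m[0] == m[4] && m[1] == m[3]    -> hlineSmooth5Nabcba, else hlineSmooth5N
//   odd kxlen  : palindromic coefficients        -> hlineSmoothONa_yzy_a
//   otherwise  : generic hlineSmooth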
1 - i])) - { - hlineSmoothFunc = hlineSmooth; - break; - } - } - else - hlineSmoothFunc = hlineSmooth; - if (kylen == 1) - { - if (ky[0] == FT::one()) - vlineSmoothFunc = vlineSmooth1N1; - else - vlineSmoothFunc = vlineSmooth1N; - } - else if (kylen == 3) - { - if (ky[0] == (FT::one() >> 2) && ky[1] == (FT::one() >> 1) && ky[2] == (FT::one() >> 2)) - vlineSmoothFunc = vlineSmooth3N121; - else - vlineSmoothFunc = vlineSmooth3N; - } - else if (kylen == 5) - { - if (ky[2] == (FT::one() * (uint8_t)3 >> 3) && - ky[1] == (FT::one() >> 2) && ky[3] == (FT::one() >> 2) && - ky[0] == (FT::one() >> 4) && ky[4] == (FT::one() >> 4)) - vlineSmoothFunc = vlineSmooth5N14641; - else - vlineSmoothFunc = vlineSmooth5N; - } - else if (kylen % 2 == 1) - { - vlineSmoothFunc = vlineSmoothONa_yzy_a; - for (int i = 0; i < kylen / 2; i++) - if (!(ky[i] == ky[kylen - 1 - i])) - { - vlineSmoothFunc = vlineSmooth; - break; - } - } - else - vlineSmoothFunc = vlineSmooth; - } - virtual void operator() (const Range& range) const CV_OVERRIDE - { - AutoBuffer _buf(width*cn*kylen); - FT* buf = _buf.data(); - AutoBuffer _ptrs(kylen*2); - FT** ptrs = _ptrs.data(); - - if (kylen == 1) - { - ptrs[0] = buf; - for (int i = range.start; i < range.end; i++) - { - hlineSmoothFunc(src + i * src_stride, cn, kx, kxlen, ptrs[0], width, borderType); - vlineSmoothFunc(ptrs, ky, kylen, dst + i * dst_stride, width*cn); - } - } - else if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int pre_shift = kylen / 2; - int post_shift = kylen - pre_shift - 1; - // First line evaluation - int idst = range.start; - int ifrom = max(0, idst - pre_shift); - int ito = idst + post_shift + 1; - int i = ifrom; - int bufline = 0; - for (; i < min(ito, height); i++, bufline++) - { - ptrs[bufline+kylen] = ptrs[bufline] = buf + bufline * width*cn; - hlineSmoothFunc(src + i * src_stride, cn, kx, kxlen, ptrs[bufline], width, borderType); - } - for (; i < ito; i++, bufline++) - { - int src_idx = borderInterpolate(i, height, borderType); - if (src_idx < ifrom) - { - ptrs[bufline + kylen] = ptrs[bufline] = buf + bufline * width*cn; - hlineSmoothFunc(src + src_idx * src_stride, cn, kx, kxlen, ptrs[bufline], width, borderType); - } - else - { - ptrs[bufline + kylen] = ptrs[bufline] = ptrs[src_idx - ifrom]; - } - } - for (int j = idst - pre_shift; j < 0; j++) - { - int src_idx = borderInterpolate(j, height, borderType); - if (src_idx >= ito) - { - ptrs[2*kylen + j] = ptrs[kylen + j] = buf + (kylen + j) * width*cn; - hlineSmoothFunc(src + src_idx * src_stride, cn, kx, kxlen, ptrs[kylen + j], width, borderType); - } - else - { - ptrs[2*kylen + j] = ptrs[kylen + j] = ptrs[src_idx]; - } - } - vlineSmoothFunc(ptrs + bufline, ky, kylen, dst + idst*dst_stride, width*cn); idst++; - - // border mode dependent part evaluation - // i points to last src row to evaluate in convolution - bufline %= kylen; ito = min(height, range.end + post_shift); - for (; i < min(kylen, ito); i++, idst++) - { - ptrs[bufline + kylen] = ptrs[bufline] = buf + bufline * width*cn; - hlineSmoothFunc(src + i * src_stride, cn, kx, kxlen, ptrs[bufline], width, borderType); - bufline = (bufline + 1) % kylen; - vlineSmoothFunc(ptrs + bufline, ky, kylen, dst + idst*dst_stride, width*cn); - } - // Points inside the border - for (; i < ito; i++, idst++) - { - hlineSmoothFunc(src + i * src_stride, cn, kx, kxlen, ptrs[bufline], width, borderType); - bufline = (bufline + 1) % kylen; - vlineSmoothFunc(ptrs + bufline, ky, kylen, dst + 
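// operator() keeps the last kylen filtered rows in a ring buffer and stores
// every row pointer twice, at bufline and at bufline + kylen, so any window of
// kylen consecutive rows can be handed to the column filter as ptrs + start
// with no modular indexing in the inner loop. The idea in isolation
// (hypothetical helper, simplified; no border handling):
#include <vector>
template <typename T>
struct RowRing
{
    std::vector<T*> p; int K, b;
    explicit RowRing(int k) : p(2 * k, nullptr), K(k), b(0) {}
    void push(T* row) { p[b + K] = p[b] = row; b = (b + 1) % K; }
    T* const* window() const { return p.data() + b; } // K rows, oldest first
};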
idst*dst_stride, width*cn); - } - // Points that could fall below border - for (; i < range.end + post_shift; i++, idst++) - { - int src_idx = borderInterpolate(i, height, borderType); - if ((i - src_idx) > kylen) - hlineSmoothFunc(src + src_idx * src_stride, cn, kx, kxlen, ptrs[bufline], width, borderType); - else - ptrs[bufline + kylen] = ptrs[bufline] = ptrs[(bufline + kylen - (i - src_idx)) % kylen]; - bufline = (bufline + 1) % kylen; - vlineSmoothFunc(ptrs + bufline, ky, kylen, dst + idst*dst_stride, width*cn); - } - } - else - { - int pre_shift = kylen / 2; - int post_shift = kylen - pre_shift - 1; - // First line evaluation - int idst = range.start; - int ifrom = idst - pre_shift; - int ito = min(idst + post_shift + 1, height); - int i = max(0, ifrom); - int bufline = 0; - for (; i < ito; i++, bufline++) - { - ptrs[bufline + kylen] = ptrs[bufline] = buf + bufline * width*cn; - hlineSmoothFunc(src + i * src_stride, cn, kx, kxlen, ptrs[bufline], width, borderType); - } - - if (bufline == 1) - vlineSmooth1N(ptrs, ky - min(ifrom, 0), bufline, dst + idst*dst_stride, width*cn); - else if (bufline == 3) - vlineSmooth3N(ptrs, ky - min(ifrom, 0), bufline, dst + idst*dst_stride, width*cn); - else if (bufline == 5) - vlineSmooth5N(ptrs, ky - min(ifrom, 0), bufline, dst + idst*dst_stride, width*cn); - else - vlineSmooth(ptrs, ky - min(ifrom, 0), bufline, dst + idst*dst_stride, width*cn); - idst++; - - // border mode dependent part evaluation - // i points to last src row to evaluate in convolution - bufline %= kylen; ito = min(height, range.end + post_shift); - for (; i < min(kylen, ito); i++, idst++) - { - ptrs[bufline + kylen] = ptrs[bufline] = buf + bufline * width*cn; - hlineSmoothFunc(src + i * src_stride, cn, kx, kxlen, ptrs[bufline], width, borderType); - bufline++; - if (bufline == 3) - vlineSmooth3N(ptrs, ky + kylen - bufline, i + 1, dst + idst*dst_stride, width*cn); - else if (bufline == 5) - vlineSmooth5N(ptrs, ky + kylen - bufline, i + 1, dst + idst*dst_stride, width*cn); - else - vlineSmooth(ptrs, ky + kylen - bufline, i + 1, dst + idst*dst_stride, width*cn); - bufline %= kylen; - } - // Points inside the border - if (i - max(0, ifrom) >= kylen) - { - for (; i < ito; i++, idst++) - { - hlineSmoothFunc(src + i * src_stride, cn, kx, kxlen, ptrs[bufline], width, borderType); - bufline = (bufline + 1) % kylen; - vlineSmoothFunc(ptrs + bufline, ky, kylen, dst + idst*dst_stride, width*cn); - } - - // Points that could fall below border - // i points to first src row to evaluate in convolution - bufline = (bufline + 1) % kylen; - for (i = idst - pre_shift; i < range.end - pre_shift; i++, idst++, bufline++) - if (height - i == 3) - vlineSmooth3N(ptrs + bufline, ky, height - i, dst + idst*dst_stride, width*cn); - else if (height - i == 5) - vlineSmooth5N(ptrs + bufline, ky, height - i, dst + idst*dst_stride, width*cn); - else - vlineSmooth(ptrs + bufline, ky, height - i, dst + idst*dst_stride, width*cn); - } - else - { - // i points to first src row to evaluate in convolution - for (i = idst - pre_shift; i < min(range.end - pre_shift, 0); i++, idst++) - if (height == 3) - vlineSmooth3N(ptrs, ky - i, height, dst + idst*dst_stride, width*cn); - else if (height == 5) - vlineSmooth5N(ptrs, ky - i, height, dst + idst*dst_stride, width*cn); - else - vlineSmooth(ptrs, ky - i, height, dst + idst*dst_stride, width*cn); - for (; i < range.end - pre_shift; i++, idst++) - if (height - i == 3) - vlineSmooth3N(ptrs + i - max(0, ifrom), ky, height - i, dst + idst*dst_stride, width*cn); - else if (height - 
i == 5) - vlineSmooth5N(ptrs + i - max(0, ifrom), ky, height - i, dst + idst*dst_stride, width*cn); - else - vlineSmooth(ptrs + i - max(0, ifrom), ky, height - i, dst + idst*dst_stride, width*cn); - } - } - } -private: - const ET* src; - ET* dst; - size_t src_stride, dst_stride; - int width, height, cn; - const FT *kx, *ky; - int kxlen, kylen; - int borderType; - void(*hlineSmoothFunc)(const ET* src, int cn, const FT* m, int n, FT* dst, int len, int borderType); - void(*vlineSmoothFunc)(const FT* const * src, const FT* m, int n, ET* dst, int len); - - fixedSmoothInvoker(const fixedSmoothInvoker&); - fixedSmoothInvoker& operator=(const fixedSmoothInvoker&); -}; - static void getGaussianKernel(int n, double sigma, int ktype, Mat& res) { res = getGaussianKernel(n, sigma, ktype); } template static void getGaussianKernel(int n, double sigma, int, std::vector& res) { res = getFixedpointGaussianKernel(n, sigma); } @@ -2149,9 +198,7 @@ static void createGaussianKernels( T & kx, T & ky, int type, Size &ksize, getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F), ky ); } -} - -cv::Ptr cv::createGaussianFilter( int type, Size ksize, +Ptr createGaussianFilter( int type, Size ksize, double sigma1, double sigma2, int borderType ) { @@ -2161,8 +208,6 @@ cv::Ptr cv::createGaussianFilter( int type, Size ksize, return createSeparableLinearFilter( type, type, kx, ky, Point(-1,-1), 0, borderType ); } -namespace cv -{ #ifdef HAVE_OPENCL static bool ocl_GaussianBlur_8UC1(InputArray _src, OutputArray _dst, Size ksize, int ddepth, @@ -2431,11 +476,10 @@ static bool ipp_GaussianBlur(InputArray _src, OutputArray _dst, Size ksize, #endif } #endif -} -void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize, - double sigma1, double sigma2, - int borderType ) +void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize, + double sigma1, double sigma2, + int borderType) { CV_INSTRUMENT_REGION(); @@ -2497,14 +541,16 @@ void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize, createGaussianKernels(fkx, fky, type, ksize, sigma1, sigma2); if (src.data == dst.data) src = src.clone(); - fixedSmoothInvoker invoker(src.ptr(), src.step1(), dst.ptr(), dst.step1(), dst.cols, dst.rows, dst.channels(), &fkx[0], (int)fkx.size(), &fky[0], (int)fky.size(), borderType & ~BORDER_ISOLATED); - parallel_for_(Range(0, dst.rows), invoker, std::max(1, std::min(getNumThreads(), getNumberOfCPUs()))); + CV_CPU_DISPATCH(GaussianBlurFixedPoint, (src, dst, (const uint16_t*)&fkx[0], (int)fkx.size(), (const uint16_t*)&fky[0], (int)fky.size(), borderType), + CV_CPU_DISPATCH_MODES_ALL); return; } sepFilter2D(src, dst, sdepth, kx, ky, Point(-1, -1), 0, borderType); } +} // namespace + ////////////////////////////////////////////////////////////////////////////////////////// CV_IMPL void diff --git a/modules/imgproc/src/smooth.simd.hpp b/modules/imgproc/src/smooth.simd.hpp index 909ffa919c..4f52bc0d80 100644 --- a/modules/imgproc/src/smooth.simd.hpp +++ b/modules/imgproc/src/smooth.simd.hpp @@ -46,120 +46,28 @@ #include #include "opencv2/core/hal/intrin.hpp" -#include "opencl_kernels_imgproc.hpp" - -#include "opencv2/core/openvx/ovx_defs.hpp" #include "filter.hpp" -#include "fixedpoint.inl.hpp" - -/****************************************************************************************\ - Gaussian Blur -\****************************************************************************************/ - -cv::Mat cv::getGaussianKernel( int n, double sigma, int ktype ) -{ - CV_Assert(n > 0); - const int SMALL_GAUSSIAN_SIZE = 
7; - static const float small_gaussian_tab[][SMALL_GAUSSIAN_SIZE] = - { - {1.f}, - {0.25f, 0.5f, 0.25f}, - {0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f}, - {0.03125f, 0.109375f, 0.21875f, 0.28125f, 0.21875f, 0.109375f, 0.03125f} - }; - - const float* fixed_kernel = n % 2 == 1 && n <= SMALL_GAUSSIAN_SIZE && sigma <= 0 ? - small_gaussian_tab[n>>1] : 0; - - CV_Assert( ktype == CV_32F || ktype == CV_64F ); - Mat kernel(n, 1, ktype); - float* cf = kernel.ptr(); - double* cd = kernel.ptr(); - - double sigmaX = sigma > 0 ? sigma : ((n-1)*0.5 - 1)*0.3 + 0.8; - double scale2X = -0.5/(sigmaX*sigmaX); - double sum = 0; - - int i; - for( i = 0; i < n; i++ ) - { - double x = i - (n-1)*0.5; - double t = fixed_kernel ? (double)fixed_kernel[i] : std::exp(scale2X*x*x); - if( ktype == CV_32F ) - { - cf[i] = (float)t; - sum += cf[i]; - } - else - { - cd[i] = t; - sum += cd[i]; - } - } - - CV_DbgAssert(fabs(sum) > 0); - sum = 1./sum; - for( i = 0; i < n; i++ ) - { - if( ktype == CV_32F ) - cf[i] = (float)(cf[i]*sum); - else - cd[i] *= sum; - } - - return kernel; -} +#include "opencv2/core/softfloat.hpp" namespace cv { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations +void GaussianBlurFixedPoint(const Mat& src, /*const*/ Mat& dst, + const uint16_t/*ufixedpoint16*/* fkx, int fkx_size, + const uint16_t/*ufixedpoint16*/* fky, int fky_size, + int borderType); -template -static std::vector getFixedpointGaussianKernel( int n, double sigma ) -{ - if (sigma <= 0) - { - if(n == 1) - return std::vector(1, softdouble(1.0)); - else if(n == 3) - { - T v3[] = { softdouble(0.25), softdouble(0.5), softdouble(0.25) }; - return std::vector(v3, v3 + 3); - } - else if(n == 5) - { - T v5[] = { softdouble(0.0625), softdouble(0.25), softdouble(0.375), softdouble(0.25), softdouble(0.0625) }; - return std::vector(v5, v5 + 5); - } - else if(n == 7) - { - T v7[] = { softdouble(0.03125), softdouble(0.109375), softdouble(0.21875), softdouble(0.28125), softdouble(0.21875), softdouble(0.109375), softdouble(0.03125) }; - return std::vector(v7, v7 + 7); - } - } +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY +#if defined(CV_CPU_BASELINE_MODE) +// included in dispatch.cpp +#else +#include "fixedpoint.inl.hpp" +#endif - softdouble sigmaX = sigma > 0 ? 
softdouble(sigma) : mulAdd(softdouble(n),softdouble(0.15),softdouble(0.35));// softdouble(((n-1)*0.5 - 1)*0.3 + 0.8) - softdouble scale2X = softdouble(-0.5*0.25)/(sigmaX*sigmaX); - std::vector values(n); - softdouble sum(0.); - for(int i = 0, x = 1 - n; i < n; i++, x+=2 ) - { - // x = i - (n - 1)*0.5 - // t = std::exp(scale2X*x*x) - values[i] = exp(softdouble(x*x)*scale2X); - sum += values[i]; - } - sum = softdouble::one()/sum; - - std::vector kernel(n); - for(int i = 0; i < n; i++ ) - { - kernel[i] = values[i] * sum; - } - - return kernel; -}; +namespace { template void hlineSmooth1N(const ET* src, int cn, const FT* m, int, FT* dst, int len, int) @@ -2119,418 +2027,27 @@ private: fixedSmoothInvoker& operator=(const fixedSmoothInvoker&); }; -static void getGaussianKernel(int n, double sigma, int ktype, Mat& res) { res = getGaussianKernel(n, sigma, ktype); } -template static void getGaussianKernel(int n, double sigma, int, std::vector& res) { res = getFixedpointGaussianKernel(n, sigma); } +} // namespace anon -template -static void createGaussianKernels( T & kx, T & ky, int type, Size &ksize, - double sigma1, double sigma2 ) -{ - int depth = CV_MAT_DEPTH(type); - if( sigma2 <= 0 ) - sigma2 = sigma1; - - // automatic detection of kernel size from sigma - if( ksize.width <= 0 && sigma1 > 0 ) - ksize.width = cvRound(sigma1*(depth == CV_8U ? 3 : 4)*2 + 1)|1; - if( ksize.height <= 0 && sigma2 > 0 ) - ksize.height = cvRound(sigma2*(depth == CV_8U ? 3 : 4)*2 + 1)|1; - - CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && - ksize.height > 0 && ksize.height % 2 == 1 ); - - sigma1 = std::max( sigma1, 0. ); - sigma2 = std::max( sigma2, 0. ); - - getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F), kx ); - if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON ) - ky = kx; - else - getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F), ky ); -} - -} - -cv::Ptr cv::createGaussianFilter( int type, Size ksize, - double sigma1, double sigma2, - int borderType ) -{ - Mat kx, ky; - createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2); - - return createSeparableLinearFilter( type, type, kx, ky, Point(-1,-1), 0, borderType ); -} - -namespace cv -{ -#ifdef HAVE_OPENCL - -static bool ocl_GaussianBlur_8UC1(InputArray _src, OutputArray _dst, Size ksize, int ddepth, - InputArray _kernelX, InputArray _kernelY, int borderType) -{ - const ocl::Device & dev = ocl::Device::getDefault(); - int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - - if ( !(dev.isIntel() && (type == CV_8UC1) && - (_src.offset() == 0) && (_src.step() % 4 == 0) && - ((ksize.width == 5 && (_src.cols() % 4 == 0)) || - (ksize.width == 3 && (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0)))) ) - return false; - - Mat kernelX = _kernelX.getMat().reshape(1, 1); - if (kernelX.cols % 2 != 1) - return false; - Mat kernelY = _kernelY.getMat().reshape(1, 1); - if (kernelY.cols % 2 != 1) - return false; - - if (ddepth < 0) - ddepth = sdepth; - - Size size = _src.size(); - size_t globalsize[2] = { 0, 0 }; - size_t localsize[2] = { 0, 0 }; - - if (ksize.width == 3) - { - globalsize[0] = size.width / 16; - globalsize[1] = size.height / 2; - } - else if (ksize.width == 5) - { - globalsize[0] = size.width / 4; - globalsize[1] = size.height / 1; - } - - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; - char build_opts[1024]; - sprintf(build_opts, "-D %s %s%s", borderMap[borderType & ~BORDER_ISOLATED], - 
ocl::kernelToStr(kernelX, CV_32F, "KERNEL_MATRIX_X").c_str(), - ocl::kernelToStr(kernelY, CV_32F, "KERNEL_MATRIX_Y").c_str()); - - ocl::Kernel kernel; - - if (ksize.width == 3) - kernel.create("gaussianBlur3x3_8UC1_cols16_rows2", cv::ocl::imgproc::gaussianBlur3x3_oclsrc, build_opts); - else if (ksize.width == 5) - kernel.create("gaussianBlur5x5_8UC1_cols4", cv::ocl::imgproc::gaussianBlur5x5_oclsrc, build_opts); - - if (kernel.empty()) - return false; - - UMat src = _src.getUMat(); - _dst.create(size, CV_MAKETYPE(ddepth, cn)); - if (!(_dst.offset() == 0 && _dst.step() % 4 == 0)) - return false; - UMat dst = _dst.getUMat(); - - int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); - idxArg = kernel.set(idxArg, (int)src.step); - idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst)); - idxArg = kernel.set(idxArg, (int)dst.step); - idxArg = kernel.set(idxArg, (int)dst.rows); - idxArg = kernel.set(idxArg, (int)dst.cols); - - return kernel.run(2, globalsize, (localsize[0] == 0) ? NULL : localsize, false); -} - -#endif - -#ifdef HAVE_OPENVX - -namespace ovx { - template <> inline bool skipSmallImages(int w, int h) { return w*h < 320 * 240; } -} -static bool openvx_gaussianBlur(InputArray _src, OutputArray _dst, Size ksize, - double sigma1, double sigma2, int borderType) -{ - if (sigma2 <= 0) - sigma2 = sigma1; - // automatic detection of kernel size from sigma - if (ksize.width <= 0 && sigma1 > 0) - ksize.width = cvRound(sigma1*6 + 1) | 1; - if (ksize.height <= 0 && sigma2 > 0) - ksize.height = cvRound(sigma2*6 + 1) | 1; - - if (_src.type() != CV_8UC1 || - _src.cols() < 3 || _src.rows() < 3 || - ksize.width != 3 || ksize.height != 3) - return false; - - sigma1 = std::max(sigma1, 0.); - sigma2 = std::max(sigma2, 0.); - - if (!(sigma1 == 0.0 || (sigma1 - 0.8) < DBL_EPSILON) || !(sigma2 == 0.0 || (sigma2 - 0.8) < DBL_EPSILON) || - ovx::skipSmallImages(_src.cols(), _src.rows())) - return false; - - Mat src = _src.getMat(); - Mat dst = _dst.getMat(); - - if ((borderType & BORDER_ISOLATED) == 0 && src.isSubmatrix()) - return false; //Process isolated borders only - vx_enum border; - switch (borderType & ~BORDER_ISOLATED) - { - case BORDER_CONSTANT: - border = VX_BORDER_CONSTANT; - break; - case BORDER_REPLICATE: - border = VX_BORDER_REPLICATE; - break; - default: - return false; - } - - try - { - ivx::Context ctx = ovx::getOpenVXContext(); - - Mat a; - if (dst.data != src.data) - a = src; - else - src.copyTo(a); - - ivx::Image - ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, - ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data), - ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, - ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data); - - //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments - //since OpenVX standard says nothing about thread-safety for now - ivx::border_t prevBorder = ctx.immediateBorder(); - ctx.setImmediateBorder(border, (vx_uint8)(0)); - ivx::IVX_CHECK_STATUS(vxuGaussian3x3(ctx, ia, ib)); - ctx.setImmediateBorder(prevBorder); - } - catch (const ivx::RuntimeError & e) - { - VX_DbgThrow(e.what()); - } - catch (const ivx::WrapperError & e) - { - VX_DbgThrow(e.what()); - } - return true; -} - -#endif - -#ifdef HAVE_IPP -// IW 2017u2 has bug which doesn't allow use of partial inMem with tiling -#if IPP_DISABLE_GAUSSIANBLUR_PARALLEL -#define IPP_GAUSSIANBLUR_PARALLEL 0 -#else -#define IPP_GAUSSIANBLUR_PARALLEL 1 -#endif - -#ifdef 
HAVE_IPP_IW - -class ipp_gaussianBlurParallel: public ParallelLoopBody -{ -public: - ipp_gaussianBlurParallel(::ipp::IwiImage &src, ::ipp::IwiImage &dst, int kernelSize, float sigma, ::ipp::IwiBorderType &border, bool *pOk): - m_src(src), m_dst(dst), m_kernelSize(kernelSize), m_sigma(sigma), m_border(border), m_pOk(pOk) { - *m_pOk = true; - } - ~ipp_gaussianBlurParallel() - { - } - - virtual void operator() (const Range& range) const CV_OVERRIDE - { - CV_INSTRUMENT_REGION_IPP(); - - if(!*m_pOk) - return; - - try - { - ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, m_dst.m_size.width, range.end - range.start); - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterGaussian, m_src, m_dst, m_kernelSize, m_sigma, ::ipp::IwDefault(), m_border, tile); - } - catch(const ::ipp::IwException &) - { - *m_pOk = false; - return; - } - } -private: - ::ipp::IwiImage &m_src; - ::ipp::IwiImage &m_dst; - - int m_kernelSize; - float m_sigma; - ::ipp::IwiBorderType &m_border; - - volatile bool *m_pOk; - const ipp_gaussianBlurParallel& operator= (const ipp_gaussianBlurParallel&); -}; - -#endif - -static bool ipp_GaussianBlur(InputArray _src, OutputArray _dst, Size ksize, - double sigma1, double sigma2, int borderType ) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP(); - -#if IPP_VERSION_X100 < 201800 && ((defined _MSC_VER && defined _M_IX86) || (defined __GNUC__ && defined __i386__)) - CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(ksize); CV_UNUSED(sigma1); CV_UNUSED(sigma2); CV_UNUSED(borderType); - return false; // bug on ia32 -#else - if(sigma1 != sigma2) - return false; - - if(sigma1 < FLT_EPSILON) - return false; - - if(ksize.width != ksize.height) - return false; - - // Acquire data and begin processing - try - { - Mat src = _src.getMat(); - Mat dst = _dst.getMat(); - ::ipp::IwiImage iwSrc = ippiGetImage(src); - ::ipp::IwiImage iwDst = ippiGetImage(dst); - ::ipp::IwiBorderSize borderSize = ::ipp::iwiSizeToBorderSize(ippiGetSize(ksize)); - ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize)); - if(!ippBorder) - return false; - - const int threads = ippiSuggestThreadsNum(iwDst, 2); - if(IPP_GAUSSIANBLUR_PARALLEL && threads > 1) { - bool ok; - ipp_gaussianBlurParallel invoker(iwSrc, iwDst, ksize.width, (float) sigma1, ippBorder, &ok); - - if(!ok) - return false; - const Range range(0, (int) iwDst.m_size.height); - parallel_for_(range, invoker, threads*4); - - if(!ok) - return false; - } else { - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterGaussian, iwSrc, iwDst, ksize.width, sigma1, ::ipp::IwDefault(), ippBorder); - } - } - catch (const ::ipp::IwException &) - { - return false; - } - - return true; -#endif -#else - CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(ksize); CV_UNUSED(sigma1); CV_UNUSED(sigma2); CV_UNUSED(borderType); - return false; -#endif -} -#endif -} - -void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize, - double sigma1, double sigma2, - int borderType ) +void GaussianBlurFixedPoint(const Mat& src, /*const*/ Mat& dst, + const uint16_t/*ufixedpoint16*/* fkx, int fkx_size, + const uint16_t/*ufixedpoint16*/* fky, int fky_size, + int borderType) { CV_INSTRUMENT_REGION(); - int type = _src.type(); - Size size = _src.size(); - _dst.create( size, type ); - - if( (borderType & ~BORDER_ISOLATED) != BORDER_CONSTANT && - ((borderType & BORDER_ISOLATED) != 0 || !_src.getMat().isSubmatrix()) ) + CV_Assert(src.depth() == CV_8U && ((borderType & BORDER_ISOLATED) || !src.isSubmatrix())); + fixedSmoothInvoker invoker( + src.ptr(), src.step1(), + dst.ptr(), dst.step1(), dst.cols, 
dst.rows, dst.channels(), + (const ufixedpoint16*)fkx, fkx_size, (const ufixedpoint16*)fky, fky_size, + borderType & ~BORDER_ISOLATED); { - if( size.height == 1 ) - ksize.height = 1; - if( size.width == 1 ) - ksize.width = 1; - } - - if( ksize.width == 1 && ksize.height == 1 ) - { - _src.copyTo(_dst); - return; - } - - bool useOpenCL = (ocl::isOpenCLActivated() && _dst.isUMat() && _src.dims() <= 2 && - ((ksize.width == 3 && ksize.height == 3) || - (ksize.width == 5 && ksize.height == 5)) && - _src.rows() > ksize.height && _src.cols() > ksize.width); - CV_UNUSED(useOpenCL); - - int sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - - Mat kx, ky; - createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2); - - CV_OCL_RUN(useOpenCL, ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType)); - - CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() > kx.total() && (size_t)_src.cols() > kx.total(), - ocl_sepFilter2D(_src, _dst, sdepth, kx, ky, Point(-1, -1), 0, borderType)) - - Mat src = _src.getMat(); - Mat dst = _dst.getMat(); - - Point ofs; - Size wsz(src.cols, src.rows); - if(!(borderType & BORDER_ISOLATED)) - src.locateROI( wsz, ofs ); - - CALL_HAL(gaussianBlur, cv_hal_gaussianBlur, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, cn, - ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height, - sigma1, sigma2, borderType&~BORDER_ISOLATED); - - CV_OVX_RUN(true, - openvx_gaussianBlur(src, dst, ksize, sigma1, sigma2, borderType)) - - CV_IPP_RUN_FAST(ipp_GaussianBlur(src, dst, ksize, sigma1, sigma2, borderType)); - - if(sdepth == CV_8U && ((borderType & BORDER_ISOLATED) || !_src.getMat().isSubmatrix())) - { - std::vector fkx, fky; - createGaussianKernels(fkx, fky, type, ksize, sigma1, sigma2); - if (src.data == dst.data) - src = src.clone(); - fixedSmoothInvoker invoker(src.ptr(), src.step1(), dst.ptr(), dst.step1(), dst.cols, dst.rows, dst.channels(), &fkx[0], (int)fkx.size(), &fky[0], (int)fky.size(), borderType & ~BORDER_ISOLATED); + // TODO AVX guard (external call) parallel_for_(Range(0, dst.rows), invoker, std::max(1, std::min(getNumThreads(), getNumberOfCPUs()))); - return; } - - sepFilter2D(src, dst, sdepth, kx, ky, Point(-1, -1), 0, borderType); } -////////////////////////////////////////////////////////////////////////////////////////// - -CV_IMPL void -cvSmooth( const void* srcarr, void* dstarr, int smooth_type, - int param1, int param2, double param3, double param4 ) -{ - cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0; - - CV_Assert( dst.size() == src.size() && - (smooth_type == CV_BLUR_NO_SCALE || dst.type() == src.type()) ); - - if( param2 <= 0 ) - param2 = param1; - - if( smooth_type == CV_BLUR || smooth_type == CV_BLUR_NO_SCALE ) - cv::boxFilter( src, dst, dst.depth(), cv::Size(param1, param2), cv::Point(-1,-1), - smooth_type == CV_BLUR, cv::BORDER_REPLICATE ); - else if( smooth_type == CV_GAUSSIAN ) - cv::GaussianBlur( src, dst, cv::Size(param1, param2), param3, param4, cv::BORDER_REPLICATE ); - else if( smooth_type == CV_MEDIAN ) - cv::medianBlur( src, dst, param1 ); - else - cv::bilateralFilter( src, dst, param1, param3, param4, cv::BORDER_REPLICATE ); - - if( dst.data != dst0.data ) - CV_Error( CV_StsUnmatchedFormats, "The destination image does not have the proper type" ); -} - -/* End of file. 
*/ +#endif +CV_CPU_OPTIMIZATION_NAMESPACE_END +} // namespace From ce3c92eb1f4c38989ad6622e62fd1fb3a2cb3140 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sat, 9 Mar 2019 19:58:48 +0000 Subject: [PATCH 7/9] imgproc: dispatch bilateral_filter --- modules/imgproc/CMakeLists.txt | 1 + .../imgproc/src/bilateral_filter.dispatch.cpp | 975 +----------------- modules/imgproc/src/bilateral_filter.simd.hpp | 394 +------ 3 files changed, 51 insertions(+), 1319 deletions(-) diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt index d28d6b9046..9731694e59 100644 --- a/modules/imgproc/CMakeLists.txt +++ b/modules/imgproc/CMakeLists.txt @@ -1,5 +1,6 @@ set(the_description "Image Processing") ocv_add_dispatched_file(accum SSE4_1 AVX AVX2) +ocv_add_dispatched_file(bilateral_filter SSE2 AVX2) ocv_add_dispatched_file(filter SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_hsv SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_rgb SSE2 SSE4_1 AVX2) diff --git a/modules/imgproc/src/bilateral_filter.dispatch.cpp b/modules/imgproc/src/bilateral_filter.dispatch.cpp index e9181f2182..a27ebb18f5 100644 --- a/modules/imgproc/src/bilateral_filter.dispatch.cpp +++ b/modules/imgproc/src/bilateral_filter.dispatch.cpp @@ -48,493 +48,14 @@ #include "opencv2/core/hal/intrin.hpp" #include "opencl_kernels_imgproc.hpp" +#include "bilateral_filter.simd.hpp" +#include "bilateral_filter.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + /****************************************************************************************\ Bilateral Filtering \****************************************************************************************/ -namespace cv -{ - -class BilateralFilter_8u_Invoker : - public ParallelLoopBody -{ -public: - BilateralFilter_8u_Invoker(Mat& _dest, const Mat& _temp, int _radius, int _maxk, - int* _space_ofs, float *_space_weight, float *_color_weight) : - temp(&_temp), dest(&_dest), radius(_radius), - maxk(_maxk), space_ofs(_space_ofs), space_weight(_space_weight), color_weight(_color_weight) - { - } - - virtual void operator() (const Range& range) const CV_OVERRIDE - { - int i, j, cn = dest->channels(), k; - Size size = dest->size(); - - for( i = range.start; i < range.end; i++ ) - { - const uchar* sptr = temp->ptr(i+radius) + radius*cn; - uchar* dptr = dest->ptr(i); - - if( cn == 1 ) - { - AutoBuffer buf(alignSize(size.width, CV_SIMD_WIDTH) + size.width + CV_SIMD_WIDTH - 1); - memset(buf.data(), 0, buf.size() * sizeof(float)); - float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH); - float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH); - k = 0; - for(; k <= maxk-4; k+=4) - { - const uchar* ksptr0 = sptr + space_ofs[k]; - const uchar* ksptr1 = sptr + space_ofs[k+1]; - const uchar* ksptr2 = sptr + space_ofs[k+2]; - const uchar* ksptr3 = sptr + space_ofs[k+3]; - j = 0; -#if CV_SIMD - v_float32 kweight0 = vx_setall_f32(space_weight[k]); - v_float32 kweight1 = vx_setall_f32(space_weight[k+1]); - v_float32 kweight2 = vx_setall_f32(space_weight[k+2]); - v_float32 kweight3 = vx_setall_f32(space_weight[k+3]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) - { - v_uint32 rval = vx_load_expand_q(sptr + j); - - v_uint32 val = vx_load_expand_q(ksptr0 + j); - v_float32 w = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))); - v_float32 v_wsum = vx_load_aligned(wsum + j) + w; - v_float32 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j)); - - val = 
vx_load_expand_q(ksptr1 + j); - w = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))); - v_wsum += w; - v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum); - - val = vx_load_expand_q(ksptr2 + j); - w = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))); - v_wsum += w; - v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum); - - val = vx_load_expand_q(ksptr3 + j); - w = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))); - v_wsum += w; - v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum); - - v_store_aligned(wsum + j, v_wsum); - v_store_aligned(sum + j, v_sum); - } -#endif -#if CV_SIMD128 - v_float32x4 kweight4 = v_load(space_weight + k); -#endif - for (; j < size.width; j++) - { -#if CV_SIMD128 - v_uint32x4 rval = v_setall_u32(sptr[j]); - v_uint32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]); - v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))); - wsum[j] += v_reduce_sum(w); - sum[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(val)) * w); -#else - int rval = sptr[j]; - - int val = ksptr0[j]; - float w = space_weight[k] * color_weight[std::abs(val - rval)]; - wsum[j] += w; - sum[j] += val * w; - - val = ksptr1[j]; - w = space_weight[k+1] * color_weight[std::abs(val - rval)]; - wsum[j] += w; - sum[j] += val * w; - - val = ksptr2[j]; - w = space_weight[k+2] * color_weight[std::abs(val - rval)]; - wsum[j] += w; - sum[j] += val * w; - - val = ksptr3[j]; - w = space_weight[k+3] * color_weight[std::abs(val - rval)]; - wsum[j] += w; - sum[j] += val * w; -#endif - } - } - for(; k < maxk; k++) - { - const uchar* ksptr = sptr + space_ofs[k]; - j = 0; -#if CV_SIMD - v_float32 kweight = vx_setall_f32(space_weight[k]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) - { - v_uint32 val = vx_load_expand_q(ksptr + j); - v_float32 w = kweight * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, vx_load_expand_q(sptr + j)))); - v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w); - v_store_aligned(sum + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j))); - } -#endif - for (; j < size.width; j++) - { - int val = ksptr[j]; - float w = space_weight[k] * color_weight[std::abs(val - sptr[j])]; - wsum[j] += w; - sum[j] += val * w; - } - } - j = 0; -#if CV_SIMD - for (; j <= size.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes) - v_pack_u_store(dptr + j, v_pack(v_round(vx_load_aligned(sum + j ) / vx_load_aligned(wsum + j )), - v_round(vx_load_aligned(sum + j + v_float32::nlanes) / vx_load_aligned(wsum + j + v_float32::nlanes)))); -#endif - for (; j < size.width; j++) - { - // overflow is not possible here => there is no need to use cv::saturate_cast - CV_DbgAssert(fabs(wsum[j]) > 0); - dptr[j] = (uchar)cvRound(sum[j]/wsum[j]); - } - } - else - { - assert( cn == 3 ); - AutoBuffer buf(alignSize(size.width, CV_SIMD_WIDTH)*3 + size.width + CV_SIMD_WIDTH - 1); - memset(buf.data(), 0, buf.size() * sizeof(float)); - float *sum_b = alignPtr(buf.data(), CV_SIMD_WIDTH); - float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH); - float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH); - float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH); - k = 0; - for(; k <= maxk-4; k+=4) - { - const uchar* ksptr0 = sptr + space_ofs[k]; - const uchar* ksptr1 = sptr + space_ofs[k+1]; - const uchar* ksptr2 = sptr + space_ofs[k+2]; - const uchar* ksptr3 = sptr + space_ofs[k+3]; - const uchar* rsptr = 
sptr; - j = 0; -#if CV_SIMD - v_float32 kweight0 = vx_setall_f32(space_weight[k]); - v_float32 kweight1 = vx_setall_f32(space_weight[k+1]); - v_float32 kweight2 = vx_setall_f32(space_weight[k+2]); - v_float32 kweight3 = vx_setall_f32(space_weight[k+3]); - for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, rsptr += 3*v_uint8::nlanes, - ksptr0 += 3*v_uint8::nlanes, ksptr1 += 3*v_uint8::nlanes, ksptr2 += 3*v_uint8::nlanes, ksptr3 += 3*v_uint8::nlanes) - { - v_uint8 kb, kg, kr, rb, rg, rr; - v_load_deinterleave(rsptr, rb, rg, rr); - - v_load_deinterleave(ksptr0, kb, kg, kr); - v_uint16 val0, val1, val2, val3, val4; - v_expand(v_absdiff(kb, rb), val0, val1); - v_expand(v_absdiff(kg, rg), val2, val3); - val0 += val2; val1 += val3; - v_expand(v_absdiff(kr, rr), val2, val3); - val0 += val2; val1 += val3; - - v_uint32 vall, valh; - v_expand(val0, vall, valh); - v_float32 w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - v_float32 w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j)); - v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); - v_expand(kb, val0, val2); - v_expand(val0, vall, valh); - v_store_aligned(sum_b + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); - v_expand(kg, val0, val3); - v_expand(val0, vall, valh); - v_store_aligned(sum_g + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); - v_expand(kr, val0, val4); - v_expand(val0, vall, valh); - v_store_aligned(sum_r + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j))); - v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); - - v_expand(val1, vall, valh); - w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes)); - v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes)); - v_expand(val2, vall, valh); - v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes))); - v_expand(val3, vall, valh); - v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes))); - v_expand(val4, vall, valh); - v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2*v_float32::nlanes))); - v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 
3*v_float32::nlanes))); - - v_load_deinterleave(ksptr1, kb, kg, kr); - v_expand(v_absdiff(kb, rb), val0, val1); - v_expand(v_absdiff(kg, rg), val2, val3); - val0 += val2; val1 += val3; - v_expand(v_absdiff(kr, rr), val2, val3); - val0 += val2; val1 += val3; - - v_expand(val0, vall, valh); - w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j)); - v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); - v_expand(kb, val0, val2); - v_expand(val0, vall, valh); - v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); - v_expand(kg, val0, val3); - v_expand(val0, vall, valh); - v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); - v_expand(kr, val0, val4); - v_expand(val0, vall, valh); - v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j))); - v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); - - v_expand(val1, vall, valh); - w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes)); - v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes)); - v_expand(val2, vall, valh); - v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes))); - v_expand(val3, vall, valh); - v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes))); - v_expand(val4, vall, valh); - v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes))); - - v_load_deinterleave(ksptr2, kb, kg, kr); - v_expand(v_absdiff(kb, rb), val0, val1); - v_expand(v_absdiff(kg, rg), val2, val3); - val0 += val2; val1 += val3; - v_expand(v_absdiff(kr, rr), val2, val3); - val0 += val2; val1 += val3; - - v_expand(val0, vall, valh); - w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j)); - v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); - v_expand(kb, 
val0, val2); - v_expand(val0, vall, valh); - v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); - v_expand(kg, val0, val3); - v_expand(val0, vall, valh); - v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); - v_expand(kr, val0, val4); - v_expand(val0, vall, valh); - v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j))); - v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); - - v_expand(val1, vall, valh); - w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes)); - v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes)); - v_expand(val2, vall, valh); - v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes))); - v_expand(val3, vall, valh); - v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes))); - v_expand(val4, vall, valh); - v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes))); - - v_load_deinterleave(ksptr3, kb, kg, kr); - v_expand(v_absdiff(kb, rb), val0, val1); - v_expand(v_absdiff(kg, rg), val2, val3); - val0 += val2; val1 += val3; - v_expand(v_absdiff(kr, rr), val2, val3); - val0 += val2; val1 += val3; - - v_expand(val0, vall, valh); - w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j)); - v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); - v_expand(kb, val0, val2); - v_expand(val0, vall, valh); - v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); - v_expand(kg, val0, val3); - v_expand(val0, vall, valh); - v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 
v_float32::nlanes))); - v_expand(kr, val0, val4); - v_expand(val0, vall, valh); - v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j))); - v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); - - v_expand(val1, vall, valh); - w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes)); - v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes)); - v_expand(val2, vall, valh); - v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes))); - v_expand(val3, vall, valh); - v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes))); - v_expand(val4, vall, valh); - v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes))); - } -#endif -#if CV_SIMD128 - v_float32x4 kweight4 = v_load(space_weight + k); -#endif - for(; j < size.width; j++, rsptr += 3, ksptr0 += 3, ksptr1 += 3, ksptr2 += 3, ksptr3 += 3) - { -#if CV_SIMD128 - v_uint32x4 rb = v_setall_u32(rsptr[0]); - v_uint32x4 rg = v_setall_u32(rsptr[1]); - v_uint32x4 rr = v_setall_u32(rsptr[2]); - v_uint32x4 b(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]); - v_uint32x4 g(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]); - v_uint32x4 r(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]); - v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(b, rb) + v_absdiff(g, rg) + v_absdiff(r, rr))); - wsum[j] += v_reduce_sum(w); - sum_b[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(b)) * w); - sum_g[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(g)) * w); - sum_r[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(r)) * w); -#else - int rb = rsptr[0], rg = rsptr[1], rr = rsptr[2]; - - int b = ksptr0[0], g = ksptr0[1], r = ksptr0[2]; - float w = space_weight[k]*color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)]; - wsum[j] += w; - sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w; - - b = ksptr1[0]; g = ksptr1[1]; r = ksptr1[2]; - w = space_weight[k+1] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)]; - wsum[j] += w; - sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w; - - b = ksptr2[0]; g = ksptr2[1]; r = ksptr2[2]; - w = space_weight[k+2] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)]; - wsum[j] += w; - sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w; - - b = ksptr3[0]; g = ksptr3[1]; r = ksptr3[2]; - w = space_weight[k+3] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)]; - wsum[j] += w; - sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += 
r*w; -#endif - } - } - for(; k < maxk; k++) - { - const uchar* ksptr = sptr + space_ofs[k]; - const uchar* rsptr = sptr; - j = 0; -#if CV_SIMD - v_float32 kweight = vx_setall_f32(space_weight[k]); - for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, ksptr += 3*v_uint8::nlanes, rsptr += 3*v_uint8::nlanes) - { - v_uint8 kb, kg, kr, rb, rg, rr; - v_load_deinterleave(ksptr, kb, kg, kr); - v_load_deinterleave(rsptr, rb, rg, rr); - - v_uint16 b_l, b_h, g_l, g_h, r_l, r_h; - v_expand(v_absdiff(kb, rb), b_l, b_h); - v_expand(v_absdiff(kg, rg), g_l, g_h); - v_expand(v_absdiff(kr, rr), r_l, r_h); - - v_uint32 val0, val1, val2, val3; - v_expand(b_l + g_l + r_l, val0, val1); - v_expand(b_h + g_h + r_h, val2, val3); - - v_expand(kb, b_l, b_h); - v_expand(kg, g_l, g_h); - v_expand(kr, r_l, r_h); - - v_float32 w0 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val0)); - v_float32 w1 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val1)); - v_float32 w2 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val2)); - v_float32 w3 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val3)); - v_store_aligned(wsum + j , w0 + vx_load_aligned(wsum + j)); - v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); - v_store_aligned(wsum + j + 2*v_float32::nlanes, w2 + vx_load_aligned(wsum + j + 2*v_float32::nlanes)); - v_store_aligned(wsum + j + 3*v_float32::nlanes, w3 + vx_load_aligned(wsum + j + 3*v_float32::nlanes)); - v_expand(b_l, val0, val1); - v_expand(b_h, val2, val3); - v_store_aligned(sum_b + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); - v_store_aligned(sum_b + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_b + j + 2*v_float32::nlanes))); - v_store_aligned(sum_b + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_b + j + 3*v_float32::nlanes))); - v_expand(g_l, val0, val1); - v_expand(g_h, val2, val3); - v_store_aligned(sum_g + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); - v_store_aligned(sum_g + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_g + j + 2*v_float32::nlanes))); - v_store_aligned(sum_g + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_g + j + 3*v_float32::nlanes))); - v_expand(r_l, val0, val1); - v_expand(r_h, val2, val3); - v_store_aligned(sum_r + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_r + j))); - v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); - v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_r + j + 2*v_float32::nlanes))); - v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_r + j + 3*v_float32::nlanes))); - } -#endif - for(; j < size.width; j++, ksptr += 3, rsptr += 3) - { - int b = ksptr[0], g = ksptr[1], r = ksptr[2]; - float w = space_weight[k]*color_weight[std::abs(b - 
rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])]; - wsum[j] += w; - sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w; - } - } - j = 0; -#if CV_SIMD - v_float32 v_one = vx_setall_f32(1.f); - for(; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, dptr += 3*v_uint8::nlanes) - { - v_float32 w0 = v_one / vx_load_aligned(wsum + j); - v_float32 w1 = v_one / vx_load_aligned(wsum + j + v_float32::nlanes); - v_float32 w2 = v_one / vx_load_aligned(wsum + j + 2*v_float32::nlanes); - v_float32 w3 = v_one / vx_load_aligned(wsum + j + 3*v_float32::nlanes); - - v_store_interleave(dptr, v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_b + j)), - v_round(w1 * vx_load_aligned(sum_b + j + v_float32::nlanes))), - v_pack(v_round(w2 * vx_load_aligned(sum_b + j + 2*v_float32::nlanes)), - v_round(w3 * vx_load_aligned(sum_b + j + 3*v_float32::nlanes)))), - v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_g + j)), - v_round(w1 * vx_load_aligned(sum_g + j + v_float32::nlanes))), - v_pack(v_round(w2 * vx_load_aligned(sum_g + j + 2*v_float32::nlanes)), - v_round(w3 * vx_load_aligned(sum_g + j + 3*v_float32::nlanes)))), - v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_r + j)), - v_round(w1 * vx_load_aligned(sum_r + j + v_float32::nlanes))), - v_pack(v_round(w2 * vx_load_aligned(sum_r + j + 2*v_float32::nlanes)), - v_round(w3 * vx_load_aligned(sum_r + j + 3*v_float32::nlanes))))); - } -#endif - for(; j < size.width; j++) - { - CV_DbgAssert(fabs(wsum[j]) > 0); - wsum[j] = 1.f/wsum[j]; - *(dptr++) = (uchar)cvRound(sum_b[j]*wsum[j]); - *(dptr++) = (uchar)cvRound(sum_g[j]*wsum[j]); - *(dptr++) = (uchar)cvRound(sum_r[j]*wsum[j]); - } - } - } -#if CV_SIMD - vx_cleanup(); -#endif - } - -private: - const Mat *temp; - Mat *dest; - int radius, maxk, *space_ofs; - float *space_weight, *color_weight; -}; +namespace cv { #ifdef HAVE_OPENCL @@ -542,6 +63,7 @@ static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d, double sigma_color, double sigma_space, int borderType) { + CV_INSTRUMENT_REGION(); #ifdef __ANDROID__ if (ocl::Device::getDefault().isNVidia()) return false; @@ -628,16 +150,18 @@ static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d, size_t globalsize[2] = { (size_t)dst.cols / sizeDiv, (size_t)dst.rows }; return k.run(2, globalsize, NULL, false); } - #endif + + static void bilateralFilter_8u( const Mat& src, Mat& dst, int d, double sigma_color, double sigma_space, int borderType ) { + CV_INSTRUMENT_REGION(); + int cn = src.channels(); int i, j, maxk, radius; - Size size = src.size(); CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.data != dst.data ); @@ -686,479 +210,18 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d, } } - BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight); - parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16)); + CV_CPU_DISPATCH(bilateralFilterInvoker_8u, (dst, temp, radius, maxk, space_ofs, space_weight, color_weight), + CV_CPU_DISPATCH_MODES_ALL); } -class BilateralFilter_32f_Invoker : - public ParallelLoopBody -{ -public: - - BilateralFilter_32f_Invoker(int _cn, int _radius, int _maxk, int *_space_ofs, - const Mat& _temp, Mat& _dest, float _scale_index, float *_space_weight, float *_expLUT) : - cn(_cn), radius(_radius), maxk(_maxk), space_ofs(_space_ofs), - temp(&_temp), dest(&_dest), scale_index(_scale_index), space_weight(_space_weight), expLUT(_expLUT) - { - } - - virtual void operator() (const Range& range) const CV_OVERRIDE - { - int i, j, k; 
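// The 32f invoker removed below (this patch relocates it to bilateral_filter.simd.hpp,
// behind the same CV_CPU_DISPATCH mechanism as the 8u path above) cannot index a dense
// color-weight table by integer difference the way the 8u code does. Instead it scales
// the color distance into LUT coordinates and linearly interpolates between adjacent
// bins. A minimal scalar sketch of that lookup follows; interpColorWeight is a
// hypothetical helper, not part of the patch, while expLUT, scale_index and cvFloor
// follow the surrounding code (scale_index = kExpNumBins/len, and expLUT is sized with
// spare trailing entries so idx + 1 stays in range):
static inline float interpColorWeight(float diff, float scale_index, const float* expLUT)
{
    float alpha = diff * scale_index;   // color distance in LUT bin coordinates
    int idx = cvFloor(alpha);           // lower bin
    alpha -= idx;                       // fractional part used for interpolation
    return expLUT[idx] + alpha * (expLUT[idx + 1] - expLUT[idx]);  // lerp between bins
}
// The caller multiplies this color term by space_weight[k]; the SIMD branches below
// compute the same interpolation with v_lut/v_muladd and mask out NaN pixels via
// v_not_nan before accumulating into wsum and the per-channel sums.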
- Size size = dest->size(); - - for( i = range.start; i < range.end; i++ ) - { - const float* sptr = temp->ptr(i+radius) + radius*cn; - float* dptr = dest->ptr(i); - - if( cn == 1 ) - { - AutoBuffer buf(alignSize(size.width, CV_SIMD_WIDTH) + size.width + CV_SIMD_WIDTH - 1); - memset(buf.data(), 0, buf.size() * sizeof(float)); - float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH); - float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH); -#if CV_SIMD - v_float32 v_one = vx_setall_f32(1.f); - v_float32 sindex = vx_setall_f32(scale_index); -#endif - k = 0; - for(; k <= maxk - 4; k+=4) - { - const float* ksptr0 = sptr + space_ofs[k]; - const float* ksptr1 = sptr + space_ofs[k + 1]; - const float* ksptr2 = sptr + space_ofs[k + 2]; - const float* ksptr3 = sptr + space_ofs[k + 3]; - j = 0; -#if CV_SIMD - v_float32 kweight0 = vx_setall_f32(space_weight[k]); - v_float32 kweight1 = vx_setall_f32(space_weight[k+1]); - v_float32 kweight2 = vx_setall_f32(space_weight[k+2]); - v_float32 kweight3 = vx_setall_f32(space_weight[k+3]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) - { - v_float32 rval = vx_load(sptr + j); - - v_float32 val = vx_load(ksptr0 + j); - v_float32 knan = v_not_nan(val); - v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; - v_int32 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha))) & knan; - v_float32 v_wsum = vx_load_aligned(wsum + j) + w; - v_float32 v_sum = v_muladd(val & knan, w, vx_load_aligned(sum + j)); - - val = vx_load(ksptr1 + j); - knan = v_not_nan(val); - alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; - idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum = v_muladd(val & knan, w, v_sum); - - val = vx_load(ksptr2 + j); - knan = v_not_nan(val); - alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; - idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum = v_muladd(val & knan, w, v_sum); - - val = vx_load(ksptr3 + j); - knan = v_not_nan(val); - alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; - idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum = v_muladd(val & knan, w, v_sum); - - v_store_aligned(wsum + j, v_wsum); - v_store_aligned(sum + j, v_sum); - } -#endif -#if CV_SIMD128 - v_float32x4 v_one4 = v_setall_f32(1.f); - v_float32x4 sindex4 = v_setall_f32(scale_index); - v_float32x4 kweight4 = v_load(space_weight + k); -#endif - for (; j < size.width; j++) - { -#if CV_SIMD128 - v_float32x4 rval = v_setall_f32(sptr[j]); - v_float32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]); - v_float32x4 knan = v_not_nan(val); - v_float32x4 alpha = (v_absdiff(val, rval) * sindex4) & v_not_nan(rval) & knan; - v_int32x4 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan; - wsum[j] += v_reduce_sum(w); - sum[j] += v_reduce_sum((val & knan) * w); -#else - float rval = sptr[j]; - - float val = ksptr0[j]; - float alpha = std::abs(val - rval) * scale_index; - int idx = cvFloor(alpha); - alpha -= idx; - if 
(!cvIsNaN(val)) - { - float w = space_weight[k] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); - wsum[j] += w; - sum[j] += val * w; - } - - val = ksptr1[j]; - alpha = std::abs(val - rval) * scale_index; - idx = cvFloor(alpha); - alpha -= idx; - if (!cvIsNaN(val)) - { - float w = space_weight[k+1] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); - wsum[j] += w; - sum[j] += val * w; - } - - val = ksptr2[j]; - alpha = std::abs(val - rval) * scale_index; - idx = cvFloor(alpha); - alpha -= idx; - if (!cvIsNaN(val)) - { - float w = space_weight[k+2] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); - wsum[j] += w; - sum[j] += val * w; - } - - val = ksptr3[j]; - alpha = std::abs(val - rval) * scale_index; - idx = cvFloor(alpha); - alpha -= idx; - if (!cvIsNaN(val)) - { - float w = space_weight[k+3] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); - wsum[j] += w; - sum[j] += val * w; - } -#endif - } - } - for(; k < maxk; k++) - { - const float* ksptr = sptr + space_ofs[k]; - j = 0; -#if CV_SIMD - v_float32 kweight = vx_setall_f32(space_weight[k]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) - { - v_float32 val = vx_load(ksptr + j); - v_float32 rval = vx_load(sptr + j); - v_float32 knan = v_not_nan(val); - v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; - v_int32 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - - v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha))) & knan; - v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w); - v_store_aligned(sum + j, v_muladd(val & knan, w, vx_load_aligned(sum + j))); - } -#endif - for (; j < size.width; j++) - { - float val = ksptr[j]; - float rval = sptr[j]; - float alpha = std::abs(val - rval) * scale_index; - int idx = cvFloor(alpha); - alpha -= idx; - if (!cvIsNaN(val)) - { - float w = space_weight[k] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); - wsum[j] += w; - sum[j] += val * w; - } - } - } - j = 0; -#if CV_SIMD - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) - { - v_float32 v_val = vx_load(sptr + j); - v_store(dptr + j, (vx_load_aligned(sum + j) + (v_val & v_not_nan(v_val))) / (vx_load_aligned(wsum + j) + (v_one & v_not_nan(v_val)))); - } -#endif - for (; j < size.width; j++) - { - CV_DbgAssert(fabs(wsum[j]) >= 0); - dptr[j] = cvIsNaN(sptr[j]) ? 
sum[j] / wsum[j] : (sum[j] + sptr[j]) / (wsum[j] + 1.f); - } - } - else - { - CV_Assert( cn == 3 ); - AutoBuffer buf(alignSize(size.width, CV_SIMD_WIDTH)*3 + size.width + CV_SIMD_WIDTH - 1); - memset(buf.data(), 0, buf.size() * sizeof(float)); - float *sum_b = alignPtr(buf.data(), CV_SIMD_WIDTH); - float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH); - float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH); - float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH); -#if CV_SIMD - v_float32 v_one = vx_setall_f32(1.f); - v_float32 sindex = vx_setall_f32(scale_index); -#endif - k = 0; - for (; k <= maxk-4; k+=4) - { - const float* ksptr0 = sptr + space_ofs[k]; - const float* ksptr1 = sptr + space_ofs[k+1]; - const float* ksptr2 = sptr + space_ofs[k+2]; - const float* ksptr3 = sptr + space_ofs[k+3]; - const float* rsptr = sptr; - j = 0; -#if CV_SIMD - v_float32 kweight0 = vx_setall_f32(space_weight[k]); - v_float32 kweight1 = vx_setall_f32(space_weight[k+1]); - v_float32 kweight2 = vx_setall_f32(space_weight[k+2]); - v_float32 kweight3 = vx_setall_f32(space_weight[k+3]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, rsptr += 3 * v_float32::nlanes, - ksptr0 += 3 * v_float32::nlanes, ksptr1 += 3 * v_float32::nlanes, ksptr2 += 3 * v_float32::nlanes, ksptr3 += 3 * v_float32::nlanes) - { - v_float32 kb, kg, kr, rb, rg, rr; - v_load_deinterleave(rsptr, rb, rg, rr); - - v_load_deinterleave(ksptr0, kb, kg, kr); - v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; - v_int32 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_float32 v_wsum = vx_load_aligned(wsum + j) + w; - v_float32 v_sum_b = v_muladd(kb & knan, w, vx_load_aligned(sum_b + j)); - v_float32 v_sum_g = v_muladd(kg & knan, w, vx_load_aligned(sum_g + j)); - v_float32 v_sum_r = v_muladd(kr & knan, w, vx_load_aligned(sum_r + j)); - - v_load_deinterleave(ksptr1, kb, kg, kr); - knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; - idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum_b = v_muladd(kb & knan, w, v_sum_b); - v_sum_g = v_muladd(kg & knan, w, v_sum_g); - v_sum_r = v_muladd(kr & knan, w, v_sum_r); - - v_load_deinterleave(ksptr2, kb, kg, kr); - knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; - idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum_b = v_muladd(kb & knan, w, v_sum_b); - v_sum_g = v_muladd(kg & knan, w, v_sum_g); - v_sum_r = v_muladd(kr & knan, w, v_sum_r); - - v_load_deinterleave(ksptr3, kb, kg, kr); - knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; - idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), 
alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum_b = v_muladd(kb & knan, w, v_sum_b); - v_sum_g = v_muladd(kg & knan, w, v_sum_g); - v_sum_r = v_muladd(kr & knan, w, v_sum_r); - - v_store_aligned(wsum + j, v_wsum); - v_store_aligned(sum_b + j, v_sum_b); - v_store_aligned(sum_g + j, v_sum_g); - v_store_aligned(sum_r + j, v_sum_r); - } -#endif -#if CV_SIMD128 - v_float32x4 v_one4 = v_setall_f32(1.f); - v_float32x4 sindex4 = v_setall_f32(scale_index); - v_float32x4 kweight4 = v_load(space_weight + k); -#endif - for (; j < size.width; j++, rsptr += 3, ksptr0 += 3, ksptr1 += 3, ksptr2 += 3, ksptr3 += 3) - { -#if CV_SIMD128 - v_float32x4 rb = v_setall_f32(rsptr[0]); - v_float32x4 rg = v_setall_f32(rsptr[1]); - v_float32x4 rr = v_setall_f32(rsptr[2]); - v_float32x4 kb(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]); - v_float32x4 kg(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]); - v_float32x4 kr(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]); - v_float32x4 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - v_float32x4 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex4) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; - v_int32x4 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan; - wsum[j] += v_reduce_sum(w); - sum_b[j] += v_reduce_sum((kb & knan) * w); - sum_g[j] += v_reduce_sum((kg & knan) * w); - sum_r[j] += v_reduce_sum((kr & knan) * w); -#else - float rb = rsptr[0], rg = rsptr[1], rr = rsptr[2]; - bool r_NAN = cvIsNaN(rb) || cvIsNaN(rg) || cvIsNaN(rr); - - float b = ksptr0[0], g = ksptr0[1], r = ksptr0[2]; - bool v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r); - float alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index; - int idx = cvFloor(alpha); - alpha -= idx; - if (!v_NAN) - { - float w = space_weight[k] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); - wsum[j] += w; - sum_b[j] += b*w; - sum_g[j] += g*w; - sum_r[j] += r*w; - } - - b = ksptr1[0]; g = ksptr1[1]; r = ksptr1[2]; - v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r); - alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index; - idx = cvFloor(alpha); - alpha -= idx; - if (!v_NAN) - { - float w = space_weight[k+1] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); - wsum[j] += w; - sum_b[j] += b*w; - sum_g[j] += g*w; - sum_r[j] += r*w; - } - - b = ksptr2[0]; g = ksptr2[1]; r = ksptr2[2]; - v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r); - alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index; - idx = cvFloor(alpha); - alpha -= idx; - if (!v_NAN) - { - float w = space_weight[k+2] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); - wsum[j] += w; - sum_b[j] += b*w; - sum_g[j] += g*w; - sum_r[j] += r*w; - } - - b = ksptr3[0]; g = ksptr3[1]; r = ksptr3[2]; - v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r); - alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index; - idx = cvFloor(alpha); - alpha -= idx; - if (!v_NAN) - { - float w = space_weight[k+3] * (r_NAN ? 
1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); - wsum[j] += w; - sum_b[j] += b*w; - sum_g[j] += g*w; - sum_r[j] += r*w; - } -#endif - } - } - for (; k < maxk; k++) - { - const float* ksptr = sptr + space_ofs[k]; - const float* rsptr = sptr; - j = 0; -#if CV_SIMD - v_float32 kweight = vx_setall_f32(space_weight[k]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, ksptr += 3*v_float32::nlanes, rsptr += 3*v_float32::nlanes) - { - v_float32 kb, kg, kr, rb, rg, rr; - v_load_deinterleave(ksptr, kb, kg, kr); - v_load_deinterleave(rsptr, rb, rg, rr); - - v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; - v_int32 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - - v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w); - v_store_aligned(sum_b + j, v_muladd(kb & knan, w, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_g + j, v_muladd(kg & knan, w, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_r + j, v_muladd(kr & knan, w, vx_load_aligned(sum_r + j))); - } -#endif - for (; j < size.width; j++, ksptr += 3, rsptr += 3) - { - float b = ksptr[0], g = ksptr[1], r = ksptr[2]; - bool v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r); - float rb = rsptr[0], rg = rsptr[1], rr = rsptr[2]; - bool r_NAN = cvIsNaN(rb) || cvIsNaN(rg) || cvIsNaN(rr); - float alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index; - int idx = cvFloor(alpha); - alpha -= idx; - if (!v_NAN) - { - float w = space_weight[k] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); - wsum[j] += w; - sum_b[j] += b*w; - sum_g[j] += g*w; - sum_r[j] += r*w; - } - } - } - j = 0; -#if CV_SIMD - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, sptr += 3*v_float32::nlanes, dptr += 3*v_float32::nlanes) - { - v_float32 b, g, r; - v_load_deinterleave(sptr, b, g, r); - v_float32 mask = v_not_nan(b) & v_not_nan(g) & v_not_nan(r); - v_float32 w = v_one / (vx_load_aligned(wsum + j) + (v_one & mask)); - v_store_interleave(dptr, (vx_load_aligned(sum_b + j) + (b & mask)) * w, (vx_load_aligned(sum_g + j) + (g & mask)) * w, (vx_load_aligned(sum_r + j) + (r & mask)) * w); - } -#endif - for (; j < size.width; j++) - { - CV_DbgAssert(fabs(wsum[j]) >= 0); - float b = *(sptr++); - float g = *(sptr++); - float r = *(sptr++); - if (cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r)) - { - wsum[j] = 1.f / wsum[j]; - *(dptr++) = sum_b[j] * wsum[j]; - *(dptr++) = sum_g[j] * wsum[j]; - *(dptr++) = sum_r[j] * wsum[j]; - } - else - { - wsum[j] = 1.f / (wsum[j] + 1.f); - *(dptr++) = (sum_b[j] + b) * wsum[j]; - *(dptr++) = (sum_g[j] + g) * wsum[j]; - *(dptr++) = (sum_r[j] + r) * wsum[j]; - } - } - } - } -#if CV_SIMD - vx_cleanup(); -#endif - } - -private: - int cn, radius, maxk, *space_ofs; - const Mat* temp; - Mat *dest; - float scale_index, *space_weight, *expLUT; -}; - - static void bilateralFilter_32f( const Mat& src, Mat& dst, int d, double sigma_color, double sigma_space, int borderType ) { + CV_INSTRUMENT_REGION(); + int cn = src.channels(); int i, j, maxk, radius; double minValSrc=-1, maxValSrc=1; @@ -1166,7 +229,6 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d, int kExpNumBins = 0; float lastExpVal = 1.f; float len, scale_index; - Size size = src.size(); CV_Assert( (src.type() == CV_32FC1 || 
src.type() == CV_32FC3) && src.data != dst.data ); @@ -1236,9 +298,8 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d, } // parallel_for usage - - BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT); - parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16)); + CV_CPU_DISPATCH(bilateralFilterInvoker_32f, (cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT), + CV_CPU_DISPATCH_MODES_ALL); } #ifdef HAVE_IPP @@ -1339,9 +400,7 @@ static bool ipp_bilateralFilter(Mat &src, Mat &dst, int d, double sigmaColor, do } #endif -} - -void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d, +void bilateralFilter( InputArray _src, OutputArray _dst, int d, double sigmaColor, double sigmaSpace, int borderType ) { @@ -1365,4 +424,4 @@ void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d, "Bilateral filtering is only implemented for 8u and 32f images" ); } -/* End of file. */ +} // namespace diff --git a/modules/imgproc/src/bilateral_filter.simd.hpp b/modules/imgproc/src/bilateral_filter.simd.hpp index e9181f2182..65abcd4e40 100644 --- a/modules/imgproc/src/bilateral_filter.simd.hpp +++ b/modules/imgproc/src/bilateral_filter.simd.hpp @@ -43,18 +43,25 @@ #include "precomp.hpp" -#include - #include "opencv2/core/hal/intrin.hpp" -#include "opencl_kernels_imgproc.hpp" /****************************************************************************************\ Bilateral Filtering \****************************************************************************************/ -namespace cv -{ +namespace cv { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations +void bilateralFilterInvoker_8u( + Mat& dst, const Mat& temp, int radius, int maxk, + int* space_ofs, float *space_weight, float *color_weight); +void bilateralFilterInvoker_32f( + int cn, int radius, int maxk, int *space_ofs, + const Mat& temp, Mat& dst, float scale_index, float *space_weight, float *expLUT); +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +namespace { class BilateralFilter_8u_Invoker : public ParallelLoopBody { @@ -68,6 +75,8 @@ public: virtual void operator() (const Range& range) const CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int i, j, cn = dest->channels(), k; Size size = dest->size(); @@ -536,161 +545,20 @@ private: float *space_weight, *color_weight; }; -#ifdef HAVE_OPENCL +} // namespace anon -static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d, - double sigma_color, double sigma_space, - int borderType) +void bilateralFilterInvoker_8u( + Mat& dst, const Mat& temp, int radius, int maxk, + int* space_ofs, float *space_weight, float *color_weight) { -#ifdef __ANDROID__ - if (ocl::Device::getDefault().isNVidia()) - return false; -#endif - - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - int i, j, maxk, radius; - - if (depth != CV_8U || cn > 4) - return false; - - if (sigma_color <= 0) - sigma_color = 1; - if (sigma_space <= 0) - sigma_space = 1; - - double gauss_color_coeff = -0.5 / (sigma_color * sigma_color); - double gauss_space_coeff = -0.5 / (sigma_space * sigma_space); - - if ( d <= 0 ) - radius = cvRound(sigma_space * 1.5); - else - radius = d / 2; - radius = MAX(radius, 1); - d = radius * 2 + 1; - - UMat src = _src.getUMat(), dst = _dst.getUMat(), temp; - if (src.u == dst.u) - return false; - - copyMakeBorder(src, temp, radius, radius, radius, radius, borderType); - std::vector _space_weight(d * d); - std::vector _space_ofs(d * d); - float * const 
space_weight = &_space_weight[0]; - int * const space_ofs = &_space_ofs[0]; - - // initialize space-related bilateral filter coefficients - for( i = -radius, maxk = 0; i <= radius; i++ ) - for( j = -radius; j <= radius; j++ ) - { - double r = std::sqrt((double)i * i + (double)j * j); - if ( r > radius ) - continue; - space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff); - space_ofs[maxk++] = (int)(i * temp.step + j * cn); - } - - char cvt[3][40]; - String cnstr = cn > 1 ? format("%d", cn) : ""; - String kernelName("bilateral"); - size_t sizeDiv = 1; - if ((ocl::Device::getDefault().isIntel()) && - (ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU)) - { - //Intel GPU - if (dst.cols % 4 == 0 && cn == 1) // For single channel x4 sized images. - { - kernelName = "bilateral_float4"; - sizeDiv = 4; - } - } - ocl::Kernel k(kernelName.c_str(), ocl::imgproc::bilateral_oclsrc, - format("-D radius=%d -D maxk=%d -D cn=%d -D int_t=%s -D uint_t=uint%s -D convert_int_t=%s" - " -D uchar_t=%s -D float_t=%s -D convert_float_t=%s -D convert_uchar_t=%s -D gauss_color_coeff=(float)%f", - radius, maxk, cn, ocl::typeToStr(CV_32SC(cn)), cnstr.c_str(), - ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), - ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)), - ocl::convertTypeStr(CV_32S, CV_32F, cn, cvt[1]), - ocl::convertTypeStr(CV_32F, CV_8U, cn, cvt[2]), gauss_color_coeff)); - if (k.empty()) - return false; - - Mat mspace_weight(1, d * d, CV_32FC1, space_weight); - Mat mspace_ofs(1, d * d, CV_32SC1, space_ofs); - UMat ucolor_weight, uspace_weight, uspace_ofs; - - mspace_weight.copyTo(uspace_weight); - mspace_ofs.copyTo(uspace_ofs); - - k.args(ocl::KernelArg::ReadOnlyNoSize(temp), ocl::KernelArg::WriteOnly(dst), - ocl::KernelArg::PtrReadOnly(uspace_weight), - ocl::KernelArg::PtrReadOnly(uspace_ofs)); - - size_t globalsize[2] = { (size_t)dst.cols / sizeDiv, (size_t)dst.rows }; - return k.run(2, globalsize, NULL, false); -} - -#endif -static void -bilateralFilter_8u( const Mat& src, Mat& dst, int d, - double sigma_color, double sigma_space, - int borderType ) -{ - int cn = src.channels(); - int i, j, maxk, radius; - Size size = src.size(); - - CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.data != dst.data ); - - if( sigma_color <= 0 ) - sigma_color = 1; - if( sigma_space <= 0 ) - sigma_space = 1; - - double gauss_color_coeff = -0.5/(sigma_color*sigma_color); - double gauss_space_coeff = -0.5/(sigma_space*sigma_space); - - if( d <= 0 ) - radius = cvRound(sigma_space*1.5); - else - radius = d/2; - radius = MAX(radius, 1); - d = radius*2 + 1; - - Mat temp; - copyMakeBorder( src, temp, radius, radius, radius, radius, borderType ); - - std::vector _color_weight(cn*256); - std::vector _space_weight(d*d); - std::vector _space_ofs(d*d); - float* color_weight = &_color_weight[0]; - float* space_weight = &_space_weight[0]; - int* space_ofs = &_space_ofs[0]; - - // initialize color-related bilateral filter coefficients - - for( i = 0; i < 256*cn; i++ ) - color_weight[i] = (float)std::exp(i*i*gauss_color_coeff); - - // initialize space-related bilateral filter coefficients - for( i = -radius, maxk = 0; i <= radius; i++ ) - { - j = -radius; - - for( ; j <= radius; j++ ) - { - double r = std::sqrt((double)i*i + (double)j*j); - if( r > radius ) - continue; - space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff); - space_ofs[maxk++] = (int)(i*temp.step + j*cn); - } - } - + CV_INSTRUMENT_REGION(); BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, 
color_weight); - parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16)); + parallel_for_(Range(0, dst.rows), body, dst.total()/(double)(1<<16)); } +namespace { + class BilateralFilter_32f_Invoker : public ParallelLoopBody { @@ -705,6 +573,8 @@ public: virtual void operator() (const Range& range) const CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int i, j, k; Size size = dest->size(); @@ -1153,216 +1023,18 @@ private: float scale_index, *space_weight, *expLUT; }; +} // namespace anon -static void -bilateralFilter_32f( const Mat& src, Mat& dst, int d, - double sigma_color, double sigma_space, - int borderType ) -{ - int cn = src.channels(); - int i, j, maxk, radius; - double minValSrc=-1, maxValSrc=1; - const int kExpNumBinsPerChannel = 1 << 12; - int kExpNumBins = 0; - float lastExpVal = 1.f; - float len, scale_index; - Size size = src.size(); - - CV_Assert( (src.type() == CV_32FC1 || src.type() == CV_32FC3) && src.data != dst.data ); - - if( sigma_color <= 0 ) - sigma_color = 1; - if( sigma_space <= 0 ) - sigma_space = 1; - - double gauss_color_coeff = -0.5/(sigma_color*sigma_color); - double gauss_space_coeff = -0.5/(sigma_space*sigma_space); - - if( d <= 0 ) - radius = cvRound(sigma_space*1.5); - else - radius = d/2; - radius = MAX(radius, 1); - d = radius*2 + 1; - // compute the min/max range for the input image (even if multichannel) - - minMaxLoc( src.reshape(1), &minValSrc, &maxValSrc ); - if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON) - { - src.copyTo(dst); - return; - } - - // temporary copy of the image with borders for easy processing - Mat temp; - copyMakeBorder( src, temp, radius, radius, radius, radius, borderType ); - - // allocate lookup tables - std::vector _space_weight(d*d); - std::vector _space_ofs(d*d); - float* space_weight = &_space_weight[0]; - int* space_ofs = &_space_ofs[0]; - - // assign a length which is slightly more than needed - len = (float)(maxValSrc - minValSrc) * cn; - kExpNumBins = kExpNumBinsPerChannel * cn; - std::vector _expLUT(kExpNumBins+2); - float* expLUT = &_expLUT[0]; - - scale_index = kExpNumBins/len; - - // initialize the exp LUT - for( i = 0; i < kExpNumBins+2; i++ ) - { - if( lastExpVal > 0.f ) - { - double val = i / scale_index; - expLUT[i] = (float)std::exp(val * val * gauss_color_coeff); - lastExpVal = expLUT[i]; - } - else - expLUT[i] = 0.f; - } - - // initialize space-related bilateral filter coefficients - for( i = -radius, maxk = 0; i <= radius; i++ ) - for( j = -radius; j <= radius; j++ ) - { - double r = std::sqrt((double)i*i + (double)j*j); - if( r > radius || ( i == 0 && j == 0 ) ) - continue; - space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff); - space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn); - } - - // parallel_for usage - - BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT); - parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16)); -} - -#ifdef HAVE_IPP -#define IPP_BILATERAL_PARALLEL 1 - -#ifdef HAVE_IPP_IW -class ipp_bilateralFilterParallel: public ParallelLoopBody -{ -public: - ipp_bilateralFilterParallel(::ipp::IwiImage &_src, ::ipp::IwiImage &_dst, int _radius, Ipp32f _valSquareSigma, Ipp32f _posSquareSigma, ::ipp::IwiBorderType _borderType, bool *_ok): - src(_src), dst(_dst) - { - pOk = _ok; - - radius = _radius; - valSquareSigma = _valSquareSigma; - posSquareSigma = _posSquareSigma; - borderType = _borderType; - - *pOk = true; - } - ~ipp_bilateralFilterParallel() {} - - virtual void operator() (const Range& 
range) const CV_OVERRIDE - { - if(*pOk == false) - return; - - try - { - ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, dst.m_size.width, range.end - range.start); - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBilateral, src, dst, radius, valSquareSigma, posSquareSigma, ::ipp::IwDefault(), borderType, tile); - } - catch(const ::ipp::IwException &) - { - *pOk = false; - return; - } - } -private: - ::ipp::IwiImage &src; - ::ipp::IwiImage &dst; - - int radius; - Ipp32f valSquareSigma; - Ipp32f posSquareSigma; - ::ipp::IwiBorderType borderType; - - bool *pOk; - const ipp_bilateralFilterParallel& operator= (const ipp_bilateralFilterParallel&); -}; -#endif - -static bool ipp_bilateralFilter(Mat &src, Mat &dst, int d, double sigmaColor, double sigmaSpace, int borderType) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP(); - - int radius = IPP_MAX(((d <= 0)?cvRound(sigmaSpace*1.5):d/2), 1); - Ipp32f valSquareSigma = (Ipp32f)((sigmaColor <= 0)?1:sigmaColor*sigmaColor); - Ipp32f posSquareSigma = (Ipp32f)((sigmaSpace <= 0)?1:sigmaSpace*sigmaSpace); - - // Acquire data and begin processing - try - { - ::ipp::IwiImage iwSrc = ippiGetImage(src); - ::ipp::IwiImage iwDst = ippiGetImage(dst); - ::ipp::IwiBorderSize borderSize(radius); - ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize)); - if(!ippBorder) - return false; - - const int threads = ippiSuggestThreadsNum(iwDst, 2); - if(IPP_BILATERAL_PARALLEL && threads > 1) { - bool ok = true; - Range range(0, (int)iwDst.m_size.height); - ipp_bilateralFilterParallel invoker(iwSrc, iwDst, radius, valSquareSigma, posSquareSigma, ippBorder, &ok); - if(!ok) - return false; - - parallel_for_(range, invoker, threads*4); - - if(!ok) - return false; - } else { - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBilateral, iwSrc, iwDst, radius, valSquareSigma, posSquareSigma, ::ipp::IwDefault(), ippBorder); - } - } - catch (const ::ipp::IwException &) - { - return false; - } - return true; -#else - CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(d); CV_UNUSED(sigmaColor); CV_UNUSED(sigmaSpace); CV_UNUSED(borderType); - return false; -#endif -} -#endif - -} - -void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d, - double sigmaColor, double sigmaSpace, - int borderType ) +void bilateralFilterInvoker_32f( + int cn, int radius, int maxk, int *space_ofs, + const Mat& temp, Mat& dst, float scale_index, float *space_weight, float *expLUT) { CV_INSTRUMENT_REGION(); - _dst.create( _src.size(), _src.type() ); - - CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(), - ocl_bilateralFilter_8u(_src, _dst, d, sigmaColor, sigmaSpace, borderType)) - - Mat src = _src.getMat(), dst = _dst.getMat(); - - CV_IPP_RUN_FAST(ipp_bilateralFilter(src, dst, d, sigmaColor, sigmaSpace, borderType)); - - if( src.depth() == CV_8U ) - bilateralFilter_8u( src, dst, d, sigmaColor, sigmaSpace, borderType ); - else if( src.depth() == CV_32F ) - bilateralFilter_32f( src, dst, d, sigmaColor, sigmaSpace, borderType ); - else - CV_Error( CV_StsUnsupportedFormat, - "Bilateral filtering is only implemented for 8u and 32f images" ); + BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT); + parallel_for_(Range(0, dst.rows), body, dst.total()/(double)(1<<16)); } -/* End of file. 
*/ +#endif +CV_CPU_OPTIMIZATION_NAMESPACE_END +} // namespace From 5a01227aa1ccf971d17484ad1e1cc7c73aafd1dc Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sat, 9 Mar 2019 20:19:05 +0000 Subject: [PATCH 8/9] imgproc: dispatch box_filter --- modules/imgproc/CMakeLists.txt | 1 + modules/imgproc/src/box_filter.dispatch.cpp | 1307 +------------------ modules/imgproc/src/box_filter.simd.hpp | 548 +------- 3 files changed, 78 insertions(+), 1778 deletions(-) diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt index 9731694e59..d60fa7c58f 100644 --- a/modules/imgproc/CMakeLists.txt +++ b/modules/imgproc/CMakeLists.txt @@ -1,6 +1,7 @@ set(the_description "Image Processing") ocv_add_dispatched_file(accum SSE4_1 AVX AVX2) ocv_add_dispatched_file(bilateral_filter SSE2 AVX2) +ocv_add_dispatched_file(box_filter SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(filter SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_hsv SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_rgb SSE2 SSE4_1 AVX2) diff --git a/modules/imgproc/src/box_filter.dispatch.cpp b/modules/imgproc/src/box_filter.dispatch.cpp index 14f266258f..154ccfd09e 100644 --- a/modules/imgproc/src/box_filter.dispatch.cpp +++ b/modules/imgproc/src/box_filter.dispatch.cpp @@ -50,1119 +50,11 @@ #include "opencv2/core/openvx/ovx_defs.hpp" -namespace cv -{ +#include "box_filter.simd.hpp" +#include "box_filter.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content -/****************************************************************************************\ - Box Filter \****************************************************************************************/ -template<typename T, typename ST> -struct RowSum : - public BaseRowFilter -{ - RowSum( int _ksize, int _anchor ) : - BaseRowFilter() - { - ksize = _ksize; - anchor = _anchor; - } - - virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE - { - const T* S = (const T*)src; - ST* D = (ST*)dst; - int i = 0, k, ksz_cn = ksize*cn; - - width = (width - 1)*cn; - if( ksize == 3 ) - { - for( i = 0; i < width + cn; i++ ) - { - D[i] = (ST)S[i] + (ST)S[i+cn] + (ST)S[i+cn*2]; - } - } - else if( ksize == 5 ) - { - for( i = 0; i < width + cn; i++ ) - { - D[i] = (ST)S[i] + (ST)S[i+cn] + (ST)S[i+cn*2] + (ST)S[i + cn*3] + (ST)S[i + cn*4]; - } - } - else if( cn == 1 ) - { - ST s = 0; - for( i = 0; i < ksz_cn; i++ ) - s += (ST)S[i]; - D[0] = s; - for( i = 0; i < width; i++ ) - { - s += (ST)S[i + ksz_cn] - (ST)S[i]; - D[i+1] = s; - } - } - else if( cn == 3 ) - { - ST s0 = 0, s1 = 0, s2 = 0; - for( i = 0; i < ksz_cn; i += 3 ) - { - s0 += (ST)S[i]; - s1 += (ST)S[i+1]; - s2 += (ST)S[i+2]; - } - D[0] = s0; - D[1] = s1; - D[2] = s2; - for( i = 0; i < width; i += 3 ) - { - s0 += (ST)S[i + ksz_cn] - (ST)S[i]; - s1 += (ST)S[i + ksz_cn + 1] - (ST)S[i + 1]; - s2 += (ST)S[i + ksz_cn + 2] - (ST)S[i + 2]; - D[i+3] = s0; - D[i+4] = s1; - D[i+5] = s2; - } - } - else if( cn == 4 ) - { - ST s0 = 0, s1 = 0, s2 = 0, s3 = 0; - for( i = 0; i < ksz_cn; i += 4 ) - { - s0 += (ST)S[i]; - s1 += (ST)S[i+1]; - s2 += (ST)S[i+2]; - s3 += (ST)S[i+3]; - } - D[0] = s0; - D[1] = s1; - D[2] = s2; - D[3] = s3; - for( i = 0; i < width; i += 4 ) - { - s0 += (ST)S[i + ksz_cn] - (ST)S[i]; - s1 += (ST)S[i + ksz_cn + 1] - (ST)S[i + 1]; - s2 += (ST)S[i + ksz_cn + 2] - (ST)S[i + 2]; - s3 += (ST)S[i + ksz_cn + 3] - (ST)S[i + 3]; - D[i+4] = s0; - D[i+5] = s1; - D[i+6] = s2; - D[i+7] = s3; - } - } - else - for( k = 0; k < cn; k++, S++, D++ ) - { - ST s = 0; - for( i = 0; i < ksz_cn; i +=
cn ) - s += (ST)S[i]; - D[0] = s; - for( i = 0; i < width; i += cn ) - { - s += (ST)S[i + ksz_cn] - (ST)S[i]; - D[i+cn] = s; - } - } - } -}; - - -template<typename ST, typename T> -struct ColumnSum : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int i; - ST* SUM; - bool haveScale = scale != 1; - double _scale = scale; - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(ST)); - - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const ST* Sp = (const ST*)src[0]; - - for( i = 0; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const ST* Sp = (const ST*)src[0]; - const ST* Sm = (const ST*)src[1-ksize]; - T* D = (T*)dst; - if( haveScale ) - { - for( i = 0; i <= width - 2; i += 2 ) - { - ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1]; - D[i] = saturate_cast<T>(s0*_scale); - D[i+1] = saturate_cast<T>(s1*_scale); - s0 -= Sm[i]; s1 -= Sm[i+1]; - SUM[i] = s0; SUM[i+1] = s1; - } - - for( ; i < width; i++ ) - { - ST s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast<T>(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - for( i = 0; i <= width - 2; i += 2 ) - { - ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1]; - D[i] = saturate_cast<T>(s0); - D[i+1] = saturate_cast<T>(s1); - s0 -= Sm[i]; s1 -= Sm[i+1]; - SUM[i] = s0; SUM[i+1] = s1; - } - - for( ; i < width; i++ ) - { - ST s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast<T>(s0); - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } - } - - double scale; - int sumCount; - std::vector<ST> sum; -}; - - -template<> -struct ColumnSum<int, uchar> : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int* SUM; - bool haveScale = scale != 1; - double _scale = scale; - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(int)); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const int* Sp = (const int*)src[0]; - int i = 0; -#if CV_SIMD - for (; i <= width - v_int32::nlanes; i += v_int32::nlanes) - { - v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); - } -#if CV_SIMD_WIDTH > 16 - for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } -#endif -#endif - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const int* Sp = (const int*)src[0]; - const int* Sm = (const int*)src[1-ksize]; - uchar* D = (uchar*)dst; - if( haveScale ) - { - int i = 0; -#if CV_SIMD - v_float32 _v_scale = vx_setall_f32((float)_scale); - for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes ) - { - v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); - v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); - - v_uint32 v_s0d =
v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale)); - v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale)); - - v_uint16 v_dst = v_pack(v_s0d, v_s01d); - v_pack_store(D + i, v_dst); - - v_store(SUM + i, v_s0 - vx_load(Sm + i)); - v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); - } -#if CV_SIMD_WIDTH > 16 - v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); - - v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale)); - v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale)); - - v_uint16x8 v_dst = v_pack(v_s0d, v_s01d); - v_pack_store(D + i, v_dst); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); - } -#endif -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - int i = 0; -#if CV_SIMD - for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes ) - { - v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); - v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); - - v_uint16 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)); - v_pack_store(D + i, v_dst); - - v_store(SUM + i, v_s0 - vx_load(Sm + i)); - v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); - } -#if CV_SIMD_WIDTH > 16 - for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); - - v_uint16x8 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)); - v_pack_store(D + i, v_dst); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); - } -#endif -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0); - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } -#if CV_SIMD - vx_cleanup(); -#endif - } - - double scale; - int sumCount; - std::vector sum; -}; - - -template<> -struct ColumnSum : -public BaseColumnFilter -{ - enum { SHIFT = 23 }; - - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - divDelta = 0; - divScale = 1; - if( scale != 1 ) - { - int d = cvRound(1./scale); - double scalef = ((double)(1 << SHIFT))/d; - divScale = cvFloor(scalef); - scalef -= divScale; - divDelta = d/2; - if( scalef < 0.5 ) - divDelta++; - else - divScale++; - } - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - const int ds = divScale; - const int dd = divDelta; - ushort* SUM; - const bool haveScale = scale != 1; - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(SUM[0])); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const ushort* Sp = (const ushort*)src[0]; - int i = 0; -#if CV_SIMD - for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes ) - { - v_store(SUM + i, vx_load(SUM + i) 
+ vx_load(Sp + i)); - } -#if CV_SIMD_WIDTH > 16 - for( ; i <= width - v_uint16x8::nlanes; i += v_uint16x8::nlanes ) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } -#endif -#endif - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const ushort* Sp = (const ushort*)src[0]; - const ushort* Sm = (const ushort*)src[1-ksize]; - uchar* D = (uchar*)dst; - if( haveScale ) - { - int i = 0; -#if CV_SIMD - v_uint32 _ds4 = vx_setall_u32((unsigned)ds); - v_uint16 _dd8 = vx_setall_u16((ushort)dd); - - for( ; i <= width-v_uint8::nlanes; i+=v_uint8::nlanes ) - { - v_uint16 _sm0 = vx_load(Sm + i); - v_uint16 _sm1 = vx_load(Sm + i + v_uint16::nlanes); - - v_uint16 _s0 = v_add_wrap(vx_load(SUM + i), vx_load(Sp + i)); - v_uint16 _s1 = v_add_wrap(vx_load(SUM + i + v_uint16::nlanes), vx_load(Sp + i + v_uint16::nlanes)); - - v_uint32 _s00, _s01, _s10, _s11; - - v_expand(_s0 + _dd8, _s00, _s01); - v_expand(_s1 + _dd8, _s10, _s11); - - _s00 = v_shr(_s00*_ds4); - _s01 = v_shr(_s01*_ds4); - _s10 = v_shr(_s10*_ds4); - _s11 = v_shr(_s11*_ds4); - - v_int16 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01)); - v_int16 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11)); - - _s0 = v_sub_wrap(_s0, _sm0); - _s1 = v_sub_wrap(_s1, _sm1); - - v_store(D + i, v_pack_u(r0, r1)); - v_store(SUM + i, _s0); - v_store(SUM + i + v_uint16::nlanes, _s1); - } -#if CV_SIMD_WIDTH > 16 - v_uint32x4 ds4 = v_setall_u32((unsigned)ds); - v_uint16x8 dd8 = v_setall_u16((ushort)dd); - - for( ; i <= width-v_uint8x16::nlanes; i+=v_uint8x16::nlanes ) - { - v_uint16x8 _sm0 = v_load(Sm + i); - v_uint16x8 _sm1 = v_load(Sm + i + v_uint16x8::nlanes); - - v_uint16x8 _s0 = v_add_wrap(v_load(SUM + i), v_load(Sp + i)); - v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + v_uint16x8::nlanes), v_load(Sp + i + v_uint16x8::nlanes)); - - v_uint32x4 _s00, _s01, _s10, _s11; - - v_expand(_s0 + dd8, _s00, _s01); - v_expand(_s1 + dd8, _s10, _s11); - - _s00 = v_shr(_s00*ds4); - _s01 = v_shr(_s01*ds4); - _s10 = v_shr(_s10*ds4); - _s11 = v_shr(_s11*ds4); - - v_int16x8 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01)); - v_int16x8 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11)); - - _s0 = v_sub_wrap(_s0, _sm0); - _s1 = v_sub_wrap(_s1, _sm1); - - v_store(D + i, v_pack_u(r0, r1)); - v_store(SUM + i, _s0); - v_store(SUM + i + v_uint16x8::nlanes, _s1); - } -#endif -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = (uchar)((s0 + dd)*ds >> SHIFT); - SUM[i] = (ushort)(s0 - Sm[i]); - } - } - else - { - int i = 0; - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0); - SUM[i] = (ushort)(s0 - Sm[i]); - } - } - dst += dststep; - } -#if CV_SIMD - vx_cleanup(); -#endif - } - - double scale; - int sumCount; - int divDelta; - int divScale; - std::vector sum; -}; - - -template<> -struct ColumnSum : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int i; - int* SUM; - bool haveScale = scale != 1; - double _scale = scale; - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - 
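Every ColumnSum variant relocated by this patch, including the int-to-short one removed here, implements the same sliding-window scheme: a per-column accumulator holds the sum of the previous ksize-1 summed rows, the incoming row Sp is added, the (optionally scaled) total is written out, and the outgoing row Sm is subtracted. A minimal scalar sketch of just that scheme follows; columnSumSketch, acc and rows are hypothetical names, and the SIMD paths, saturating casts and fixed-point scaling of the real classes are omitted.

#include <vector>

// Sliding-window column sum: rows[] points at count + ksize - 1
// consecutive row sums produced by the RowSum stage.
static void columnSumSketch(const int* const* rows, int ksize, int count,
                            int width, double scale, float* dst, int dststep)
{
    std::vector<int> acc(width, 0);
    for (int r = 0; r < ksize - 1; r++)      // prime with the first ksize-1 rows
        for (int i = 0; i < width; i++)
            acc[i] += rows[r][i];

    for (int r = 0; r < count; r++)
    {
        const int* Sp = rows[r + ksize - 1]; // row entering the window
        const int* Sm = rows[r];             // row leaving the window
        for (int i = 0; i < width; i++)
        {
            int s = acc[i] + Sp[i];          // full ksize-row window sum
            dst[i] = (float)(s * scale);
            acc[i] = s - Sm[i];              // slide the window down one row
        }
        dst += dststep;
    }
}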
memset((void*)SUM, 0, width*sizeof(int)); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const int* Sp = (const int*)src[0]; - i = 0; -#if CV_SIMD - for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes ) - { - v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); - } -#if CV_SIMD_WIDTH > 16 - for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes ) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } -#endif -#endif - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const int* Sp = (const int*)src[0]; - const int* Sm = (const int*)src[1-ksize]; - short* D = (short*)dst; - if( haveScale ) - { - i = 0; -#if CV_SIMD - v_float32 _v_scale = vx_setall_f32((float)_scale); - for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes ) - { - v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); - v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); - - v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale); - v_int32 v_s01d = v_round(v_cvt_f32(v_s01) * _v_scale); - v_store(D + i, v_pack(v_s0d, v_s01d)); - - v_store(SUM + i, v_s0 - vx_load(Sm + i)); - v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); - } -#if CV_SIMD_WIDTH > 16 - v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); - - v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale); - v_int32x4 v_s01d = v_round(v_cvt_f32(v_s01) * v_scale); - v_store(D + i, v_pack(v_s0d, v_s01d)); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); - } -#endif -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - i = 0; -#if CV_SIMD - for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes ) - { - v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); - v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); - - v_store(D + i, v_pack(v_s0, v_s01)); - - v_store(SUM + i, v_s0 - vx_load(Sm + i)); - v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); - } -#if CV_SIMD_WIDTH > 16 - for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); - - v_store(D + i, v_pack(v_s0, v_s01)); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); - } -#endif -#endif - - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0); - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } -#if CV_SIMD - vx_cleanup(); -#endif - } - - double scale; - int sumCount; - std::vector sum; -}; - - -template<> -struct ColumnSum : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int* SUM; - bool haveScale = scale != 1; - double _scale = scale; - - if( 
width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(int)); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const int* Sp = (const int*)src[0]; - int i = 0; -#if CV_SIMD - for (; i <= width - v_int32::nlanes; i += v_int32::nlanes) - { - v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); - } -#if CV_SIMD_WIDTH > 16 - for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } -#endif -#endif - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const int* Sp = (const int*)src[0]; - const int* Sm = (const int*)src[1-ksize]; - ushort* D = (ushort*)dst; - if( haveScale ) - { - int i = 0; -#if CV_SIMD - v_float32 _v_scale = vx_setall_f32((float)_scale); - for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes ) - { - v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); - v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); - - v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale)); - v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale)); - v_store(D + i, v_pack(v_s0d, v_s01d)); - - v_store(SUM + i, v_s0 - vx_load(Sm + i)); - v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); - } -#if CV_SIMD_WIDTH > 16 - v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); - - v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale)); - v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale)); - v_store(D + i, v_pack(v_s0d, v_s01d)); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); - } -#endif -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - int i = 0; -#if CV_SIMD - for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes ) - { - v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); - v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); - - v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01))); - - v_store(SUM + i, v_s0 - vx_load(Sm + i)); - v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); - } -#if CV_SIMD_WIDTH > 16 - for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); - - v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01))); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); - } -#endif -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0); - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } -#if CV_SIMD - vx_cleanup(); -#endif - } - - double scale; - int sumCount; - std::vector sum; -}; - -template<> -struct ColumnSum : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - 
{ - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int* SUM; - bool haveScale = scale != 1; - double _scale = scale; - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(int)); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const int* Sp = (const int*)src[0]; - int i = 0; -#if CV_SIMD - for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes ) - { - v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); - } -#if CV_SIMD_WIDTH > 16 - for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes ) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } -#endif -#endif - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const int* Sp = (const int*)src[0]; - const int* Sm = (const int*)src[1-ksize]; - int* D = (int*)dst; - if( haveScale ) - { - int i = 0; -#if CV_SIMD - v_float32 _v_scale = vx_setall_f32((float)_scale); - for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes ) - { - v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); - v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale); - - v_store(D + i, v_s0d); - v_store(SUM + i, v_s0 - vx_load(Sm + i)); - } -#if CV_SIMD_WIDTH > 16 - v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale); - - v_store(D + i, v_s0d); - v_store(SUM + i, v_s0 - v_load(Sm + i)); - } -#endif -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - int i = 0; -#if CV_SIMD - for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes ) - { - v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); - - v_store(D + i, v_s0); - v_store(SUM + i, v_s0 - vx_load(Sm + i)); - } -#if CV_SIMD_WIDTH > 16 - for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - - v_store(D + i, v_s0); - v_store(SUM + i, v_s0 - v_load(Sm + i)); - } -#endif -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = s0; - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } -#if CV_SIMD - vx_cleanup(); -#endif - } - - double scale; - int sumCount; - std::vector sum; -}; - - -template<> -struct ColumnSum : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int* SUM; - bool haveScale = scale != 1; - double _scale = scale; - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(int)); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const int* Sp = (const int*)src[0]; - int i = 0; -#if CV_SIMD - for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes ) - { - v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); - } -#if CV_SIMD_WIDTH > 16 - for( ; i <= width - 
v_int32x4::nlanes; i+=v_int32x4::nlanes ) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } -#endif -#endif - - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const int * Sp = (const int*)src[0]; - const int * Sm = (const int*)src[1-ksize]; - float* D = (float*)dst; - if( haveScale ) - { - int i = 0; - -#if CV_SIMD - v_float32 _v_scale = vx_setall_f32((float)_scale); - for (; i <= width - v_int32::nlanes; i += v_int32::nlanes) - { - v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); - v_store(D + i, v_cvt_f32(v_s0) * _v_scale); - v_store(SUM + i, v_s0 - vx_load(Sm + i)); - } -#if CV_SIMD_WIDTH > 16 - v_float32x4 v_scale = v_setall_f32((float)_scale); - for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_store(D + i, v_cvt_f32(v_s0) * v_scale); - v_store(SUM + i, v_s0 - v_load(Sm + i)); - } -#endif -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = (float)(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - int i = 0; - -#if CV_SIMD - for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes ) - { - v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); - v_store(D + i, v_cvt_f32(v_s0)); - v_store(SUM + i, v_s0 - vx_load(Sm + i)); - } -#if CV_SIMD_WIDTH > 16 - for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_store(D + i, v_cvt_f32(v_s0)); - v_store(SUM + i, v_s0 - v_load(Sm + i)); - } -#endif -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = (float)(s0); - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } -#if CV_SIMD - vx_cleanup(); -#endif - } - - double scale; - int sumCount; - std::vector<int> sum; -}; +namespace cv { #ifdef HAVE_OPENCL @@ -1396,109 +288,34 @@ static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth, #endif +Ptr<BaseRowFilter> getRowSumFilter(int srcType, int sumType, int ksize, int anchor) +{ + CV_INSTRUMENT_REGION(); + + CV_CPU_DISPATCH(getRowSumFilter, (srcType, sumType, ksize, anchor), + CV_CPU_DISPATCH_MODES_ALL); } -cv::Ptr<BaseRowFilter> cv::getRowSumFilter(int srcType, int sumType, int ksize, int anchor) +Ptr<BaseColumnFilter> getColumnSumFilter(int sumType, int dstType, int ksize, int anchor, double scale) { - int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType); - CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) ); + CV_INSTRUMENT_REGION(); - if( anchor < 0 ) - anchor = ksize/2; - - if( sdepth == CV_8U && ddepth == CV_32S ) - return makePtr<RowSum<uchar, int> >(ksize, anchor); - if( sdepth == CV_8U && ddepth == CV_16U ) - return makePtr<RowSum<uchar, ushort> >(ksize, anchor); - if( sdepth == CV_8U && ddepth == CV_64F ) - return makePtr<RowSum<uchar, double> >(ksize, anchor); - if( sdepth == CV_16U && ddepth == CV_32S ) - return makePtr<RowSum<ushort, int> >(ksize, anchor); - if( sdepth == CV_16U && ddepth == CV_64F ) - return makePtr<RowSum<ushort, double> >(ksize, anchor); - if( sdepth == CV_16S && ddepth == CV_32S ) - return makePtr<RowSum<short, int> >(ksize, anchor); - if( sdepth == CV_32S && ddepth == CV_32S ) - return makePtr<RowSum<int, int> >(ksize, anchor); - if( sdepth == CV_16S && ddepth == CV_64F ) - return makePtr<RowSum<short, double> >(ksize, anchor); - if( sdepth == CV_32F && ddepth == CV_64F ) - return makePtr<RowSum<float, double> >(ksize, anchor); - if( sdepth == CV_64F && ddepth == CV_64F ) - return makePtr<RowSum<double, double> >(ksize, anchor); - - CV_Error_( CV_StsNotImplemented, - ("Unsupported combination of source format (=%d), and buffer format (=%d)", - srcType, sumType)); + CV_CPU_DISPATCH(getColumnSumFilter, (sumType, dstType, ksize, anchor, scale), +
CV_CPU_DISPATCH_MODES_ALL); } -cv::Ptr<BaseColumnFilter> cv::getColumnSumFilter(int sumType, int dstType, int ksize, - int anchor, double scale) +Ptr<FilterEngine> createBoxFilter(int srcType, int dstType, Size ksize, + Point anchor, bool normalize, int borderType) { - int sdepth = CV_MAT_DEPTH(sumType), ddepth = CV_MAT_DEPTH(dstType); - CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(dstType) ); + CV_INSTRUMENT_REGION(); - if( anchor < 0 ) - anchor = ksize/2; - - if( ddepth == CV_8U && sdepth == CV_32S ) - return makePtr<ColumnSum<int, uchar> >(ksize, anchor, scale); - if( ddepth == CV_8U && sdepth == CV_16U ) - return makePtr<ColumnSum<ushort, uchar> >(ksize, anchor, scale); - if( ddepth == CV_8U && sdepth == CV_64F ) - return makePtr<ColumnSum<double, uchar> >(ksize, anchor, scale); - if( ddepth == CV_16U && sdepth == CV_32S ) - return makePtr<ColumnSum<int, ushort> >(ksize, anchor, scale); - if( ddepth == CV_16U && sdepth == CV_64F ) - return makePtr<ColumnSum<double, ushort> >(ksize, anchor, scale); - if( ddepth == CV_16S && sdepth == CV_32S ) - return makePtr<ColumnSum<int, short> >(ksize, anchor, scale); - if( ddepth == CV_16S && sdepth == CV_64F ) - return makePtr<ColumnSum<double, short> >(ksize, anchor, scale); - if( ddepth == CV_32S && sdepth == CV_32S ) - return makePtr<ColumnSum<int, int> >(ksize, anchor, scale); - if( ddepth == CV_32F && sdepth == CV_32S ) - return makePtr<ColumnSum<int, float> >(ksize, anchor, scale); - if( ddepth == CV_32F && sdepth == CV_64F ) - return makePtr<ColumnSum<double, float> >(ksize, anchor, scale); - if( ddepth == CV_64F && sdepth == CV_32S ) - return makePtr<ColumnSum<int, double> >(ksize, anchor, scale); - if( ddepth == CV_64F && sdepth == CV_64F ) - return makePtr<ColumnSum<double, double> >(ksize, anchor, scale); - - CV_Error_( CV_StsNotImplemented, - ("Unsupported combination of sum format (=%d), and destination format (=%d)", - sumType, dstType)); -} - - -cv::Ptr<FilterEngine> cv::createBoxFilter( int srcType, int dstType, Size ksize, - Point anchor, bool normalize, int borderType ) -{ - int sdepth = CV_MAT_DEPTH(srcType); - int cn = CV_MAT_CN(srcType), sumType = CV_64F; - if( sdepth == CV_8U && CV_MAT_DEPTH(dstType) == CV_8U && - ksize.width*ksize.height <= 256 ) - sumType = CV_16U; - else if( sdepth <= CV_32S && (!normalize || - ksize.width*ksize.height <= (sdepth == CV_8U ? (1<<23) : - sdepth == CV_16U ? (1 << 15) : (1 << 16))) ) - sumType = CV_32S; - sumType = CV_MAKETYPE( sumType, cn ); - - Ptr<BaseRowFilter> rowFilter = getRowSumFilter(srcType, sumType, ksize.width, anchor.x ); - Ptr<BaseColumnFilter> columnFilter = getColumnSumFilter(sumType, - dstType, ksize.height, anchor.y, normalize ?
1./(ksize.width*ksize.height) : 1); - - return makePtr<FilterEngine>(Ptr<BaseFilter>(), rowFilter, columnFilter, - srcType, dstType, sumType, borderType ); + CV_CPU_DISPATCH(createBoxFilter, (srcType, dstType, ksize, anchor, normalize, borderType), + CV_CPU_DISPATCH_MODES_ALL); } #ifdef HAVE_OPENVX -namespace cv -{ namespace ovx { template <> inline bool skipSmallImages<VX_KERNEL_BOX_3x3>(int w, int h) { return w*h < 640 * 480; } } @@ -1570,12 +387,9 @@ namespace cv return true; } -} #endif #if defined(HAVE_IPP) -namespace cv -{ static bool ipp_boxfilter(Mat &src, Mat &dst, Size ksize, Point anchor, bool normalize, int borderType) { #ifdef HAVE_IPP_IW @@ -1620,13 +434,12 @@ static bool ipp_boxfilter(Mat &src, Mat &dst, Size ksize, Point anchor, bool nor return false; #endif } -} #endif -void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, - bool normalize, int borderType ) +void boxFilter(InputArray _src, OutputArray _dst, int ddepth, + Size ksize, Point anchor, + bool normalize, int borderType) { CV_INSTRUMENT_REGION(); @@ -1674,8 +487,8 @@ void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth, } -void cv::blur( InputArray src, OutputArray dst, - Size ksize, Point anchor, int borderType ) +void blur(InputArray src, OutputArray dst, + Size ksize, Point anchor, int borderType) { CV_INSTRUMENT_REGION(); @@ -1687,77 +500,17 @@ void cv::blur( InputArray src, OutputArray dst, Squared Box Filter \****************************************************************************************/ -namespace cv -{ - -template<typename T, typename ST> -struct SqrRowSum : - public BaseRowFilter -{ - SqrRowSum( int _ksize, int _anchor ) : - BaseRowFilter() - { - ksize = _ksize; - anchor = _anchor; - } - - virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE - { - const T* S = (const T*)src; - ST* D = (ST*)dst; - int i = 0, k, ksz_cn = ksize*cn; - - width = (width - 1)*cn; - for( k = 0; k < cn; k++, S++, D++ ) - { - ST s = 0; - for( i = 0; i < ksz_cn; i += cn ) - { - ST val = (ST)S[i]; - s += val*val; - } - D[0] = s; - for( i = 0; i < width; i += cn ) - { - ST val0 = (ST)S[i], val1 = (ST)S[i + ksz_cn]; - s += val1*val1 - val0*val0; - D[i+cn] = s; - } - } - } -}; - static Ptr<BaseRowFilter> getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor) { - int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType); - CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) ); + CV_INSTRUMENT_REGION(); - if( anchor < 0 ) - anchor = ksize/2; - - if( sdepth == CV_8U && ddepth == CV_32S ) - return makePtr<SqrRowSum<uchar, int> >(ksize, anchor); - if( sdepth == CV_8U && ddepth == CV_64F ) - return makePtr<SqrRowSum<uchar, double> >(ksize, anchor); - if( sdepth == CV_16U && ddepth == CV_64F ) - return makePtr<SqrRowSum<ushort, double> >(ksize, anchor); - if( sdepth == CV_16S && ddepth == CV_64F ) - return makePtr<SqrRowSum<short, double> >(ksize, anchor); - if( sdepth == CV_32F && ddepth == CV_64F ) - return makePtr<SqrRowSum<float, double> >(ksize, anchor); - if( sdepth == CV_64F && ddepth == CV_64F ) - return makePtr<SqrRowSum<double, double> >(ksize, anchor); - - CV_Error_( CV_StsNotImplemented, - ("Unsupported combination of source format (=%d), and buffer format (=%d)", - srcType, sumType)); + CV_CPU_DISPATCH(getSqrRowSumFilter, (srcType, sumType, ksize, anchor), + CV_CPU_DISPATCH_MODES_ALL); } -} - -void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, - bool normalize, int borderType ) +void sqrBoxFilter(InputArray _src, OutputArray _dst, int ddepth, + Size ksize, Point anchor, + bool normalize, int borderType) { CV_INSTRUMENT_REGION(); @@ -1801,4 +554,4 @@ void cv::sqrBoxFilter( InputArray _src,
OutputArray _dst, int ddepth, f->apply( src, dst, wsz, ofs ); } -/* End of file. */ +} // namespace diff --git a/modules/imgproc/src/box_filter.simd.hpp b/modules/imgproc/src/box_filter.simd.hpp index 14f266258f..4eadee8ec5 100644 --- a/modules/imgproc/src/box_filter.simd.hpp +++ b/modules/imgproc/src/box_filter.simd.hpp @@ -42,21 +42,25 @@ //M*/ #include "precomp.hpp" - -#include <vector> - #include "opencv2/core/hal/intrin.hpp" -#include "opencl_kernels_imgproc.hpp" -#include "opencv2/core/openvx/ovx_defs.hpp" +namespace cv { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations +Ptr<BaseRowFilter> getRowSumFilter(int srcType, int sumType, int ksize, int anchor); +Ptr<BaseColumnFilter> getColumnSumFilter(int sumType, int dstType, int ksize, int anchor, double scale); +Ptr<FilterEngine> createBoxFilter(int srcType, int dstType, Size ksize, + Point anchor, bool normalize, int borderType); -namespace cv -{ +Ptr<BaseRowFilter> getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor); + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY /****************************************************************************************\ Box Filter \****************************************************************************************/ +namespace { template<typename T, typename ST> struct RowSum : public BaseRowFilter @@ -70,6 +74,8 @@ struct RowSum : virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + const T* S = (const T*)src; ST* D = (ST*)dst; int i = 0, k, ksz_cn = ksize*cn; @@ -183,6 +189,8 @@ struct ColumnSum : virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int i; ST* SUM; bool haveScale = scale != 1; @@ -281,6 +289,8 @@ struct ColumnSum : virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int* SUM; bool haveScale = scale != 1; double _scale = scale; @@ -408,9 +418,6 @@ struct ColumnSum : } dst += dststep; } -#if CV_SIMD - vx_cleanup(); -#endif } double scale; @@ -452,6 +459,8 @@ public BaseColumnFilter virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + const int ds = divScale; const int dd = divDelta; ushort* SUM; @@ -586,9 +595,6 @@ public BaseColumnFilter } dst += dststep; } -#if CV_SIMD - vx_cleanup(); -#endif } double scale; @@ -616,6 +622,8 @@ struct ColumnSum : virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int i; int* SUM; bool haveScale = scale != 1; @@ -739,9 +747,6 @@ struct ColumnSum : } dst += dststep; } -#if CV_SIMD - vx_cleanup(); -#endif } double scale; @@ -767,6 +772,8 @@ struct ColumnSum : virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int* SUM; bool haveScale = scale != 1; double _scale = scale; @@ -888,9 +895,6 @@ struct ColumnSum : } dst += dststep; } -#if CV_SIMD - vx_cleanup(); -#endif } double scale; @@ -915,6 +919,8 @@ struct ColumnSum : virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int* SUM; bool haveScale = scale != 1; double _scale = scale; @@ -1022,9 +1028,6 @@ struct ColumnSum : } dst += dststep; } -#if CV_SIMD - vx_cleanup(); -#endif } double scale; @@ -1050,6 +1053,8 @@ struct ColumnSum : virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE
{ + CV_INSTRUMENT_REGION(); + int* SUM; bool haveScale = scale != 1; double _scale = scale; @@ -1154,9 +1159,6 @@ struct ColumnSum : } dst += dststep; } -#if CV_SIMD - vx_cleanup(); -#endif } double scale; @@ -1164,243 +1166,13 @@ struct ColumnSum : std::vector sum; }; -#ifdef HAVE_OPENCL +} // namespace anon -static bool ocl_boxFilter3x3_8UC1( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, int borderType, bool normalize ) + +Ptr getRowSumFilter(int srcType, int sumType, int ksize, int anchor) { - const ocl::Device & dev = ocl::Device::getDefault(); - int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + CV_INSTRUMENT_REGION(); - if (ddepth < 0) - ddepth = sdepth; - - if (anchor.x < 0) - anchor.x = ksize.width / 2; - if (anchor.y < 0) - anchor.y = ksize.height / 2; - - if ( !(dev.isIntel() && (type == CV_8UC1) && - (_src.offset() == 0) && (_src.step() % 4 == 0) && - (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0) && - (anchor.x == 1) && (anchor.y == 1) && - (ksize.width == 3) && (ksize.height == 3)) ) - return false; - - float alpha = 1.0f / (ksize.height * ksize.width); - Size size = _src.size(); - size_t globalsize[2] = { 0, 0 }; - size_t localsize[2] = { 0, 0 }; - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; - - globalsize[0] = size.width / 16; - globalsize[1] = size.height / 2; - - char build_opts[1024]; - sprintf(build_opts, "-D %s %s", borderMap[borderType], normalize ? "-D NORMALIZE" : ""); - - ocl::Kernel kernel("boxFilter3x3_8UC1_cols16_rows2", cv::ocl::imgproc::boxFilter3x3_oclsrc, build_opts); - if (kernel.empty()) - return false; - - UMat src = _src.getUMat(); - _dst.create(size, CV_MAKETYPE(ddepth, cn)); - if (!(_dst.offset() == 0 && _dst.step() % 4 == 0)) - return false; - UMat dst = _dst.getUMat(); - - int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); - idxArg = kernel.set(idxArg, (int)src.step); - idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst)); - idxArg = kernel.set(idxArg, (int)dst.step); - idxArg = kernel.set(idxArg, (int)dst.rows); - idxArg = kernel.set(idxArg, (int)dst.cols); - if (normalize) - idxArg = kernel.set(idxArg, (float)alpha); - - return kernel.run(2, globalsize, (localsize[0] == 0) ? 
NULL : localsize, false); -} - -static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, int borderType, bool normalize, bool sqr = false ) -{ - const ocl::Device & dev = ocl::Device::getDefault(); - int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(type); - bool doubleSupport = dev.doubleFPConfig() > 0; - - if (ddepth < 0) - ddepth = sdepth; - - if (cn > 4 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) || - _src.offset() % esz != 0 || _src.step() % esz != 0) - return false; - - if (anchor.x < 0) - anchor.x = ksize.width / 2; - if (anchor.y < 0) - anchor.y = ksize.height / 2; - - int computeUnits = ocl::Device::getDefault().maxComputeUnits(); - float alpha = 1.0f / (ksize.height * ksize.width); - Size size = _src.size(), wholeSize; - bool isolated = (borderType & BORDER_ISOLATED) != 0; - borderType &= ~BORDER_ISOLATED; - int wdepth = std::max(CV_32F, std::max(ddepth, sdepth)), - wtype = CV_MAKE_TYPE(wdepth, cn), dtype = CV_MAKE_TYPE(ddepth, cn); - - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; - size_t globalsize[2] = { (size_t)size.width, (size_t)size.height }; - size_t localsize_general[2] = { 0, 1 }, * localsize = NULL; - - UMat src = _src.getUMat(); - if (!isolated) - { - Point ofs; - src.locateROI(wholeSize, ofs); - } - - int h = isolated ? size.height : wholeSize.height; - int w = isolated ? size.width : wholeSize.width; - - size_t maxWorkItemSizes[32]; - ocl::Device::getDefault().maxWorkItemSizes(maxWorkItemSizes); - int tryWorkItems = (int)maxWorkItemSizes[0]; - - ocl::Kernel kernel; - - if (dev.isIntel() && !(dev.type() & ocl::Device::TYPE_CPU) && - ((ksize.width < 5 && ksize.height < 5 && esz <= 4) || - (ksize.width == 5 && ksize.height == 5 && cn == 1))) - { - if (w < ksize.width || h < ksize.height) - return false; - - // Figure out what vector size to use for loading the pixels. - int pxLoadNumPixels = cn != 1 || size.width % 4 ? 1 : 4; - int pxLoadVecSize = cn * pxLoadNumPixels; - - // Figure out how many pixels per work item to compute in X and Y - // directions. Too many and we run out of registers. - int pxPerWorkItemX = 1, pxPerWorkItemY = 1; - if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4) - { - pxPerWorkItemX = size.width % 8 ? size.width % 4 ? size.width % 2 ? 1 : 2 : 4 : 8; - pxPerWorkItemY = size.height % 2 ? 1 : 2; - } - else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4)) - { - pxPerWorkItemX = size.width % 2 ? 1 : 2; - pxPerWorkItemY = size.height % 2 ? 
1 : 2; - } - globalsize[0] = size.width / pxPerWorkItemX; - globalsize[1] = size.height / pxPerWorkItemY; - - // Need some padding in the private array for pixels - int privDataWidth = roundUp(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels); - - // Make the global size a nice round number so the runtime can pick - // from reasonable choices for the workgroup size - const int wgRound = 256; - globalsize[0] = roundUp(globalsize[0], wgRound); - - char build_options[1024], cvt[2][40]; - sprintf(build_options, "-D cn=%d " - "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " - "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d " - "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s " - "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d " - "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s " - "-D convertToWT=%s -D convertToDstT=%s%s%s -D PX_LOAD_FLOAT_VEC_CONV=convert_%s -D OP_BOX_FILTER", - cn, anchor.x, anchor.y, ksize.width, ksize.height, - pxLoadVecSize, pxLoadNumPixels, - pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType], - isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", - privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1, - ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), - ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), - ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), - normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "", - ocl::typeToStr(CV_MAKE_TYPE(wdepth, pxLoadVecSize)) //PX_LOAD_FLOAT_VEC_CONV - ); - - - if (!kernel.create("filterSmall", cv::ocl::imgproc::filterSmall_oclsrc, build_options)) - return false; - } - else - { - localsize = localsize_general; - for ( ; ; ) - { - int BLOCK_SIZE_X = tryWorkItems, BLOCK_SIZE_Y = std::min(ksize.height * 10, size.height); - - while (BLOCK_SIZE_X > 32 && BLOCK_SIZE_X >= ksize.width * 2 && BLOCK_SIZE_X > size.width * 2) - BLOCK_SIZE_X /= 2; - while (BLOCK_SIZE_Y < BLOCK_SIZE_X / 8 && BLOCK_SIZE_Y * computeUnits * 32 < size.height) - BLOCK_SIZE_Y *= 2; - - if (ksize.width > BLOCK_SIZE_X || w < ksize.width || h < ksize.height) - return false; - - char cvt[2][50]; - String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s" - " -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s%s" - " -D ST1=%s -D DT1=%s -D cn=%d", - BLOCK_SIZE_X, BLOCK_SIZE_Y, ocl::typeToStr(type), ocl::typeToStr(CV_MAKE_TYPE(ddepth, cn)), - ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[0]), - ocl::convertTypeStr(sdepth, wdepth, cn, cvt[1]), - anchor.x, anchor.y, ksize.width, ksize.height, borderMap[borderType], - isolated ? " -D BORDER_ISOLATED" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "", - normalize ? " -D NORMALIZE" : "", sqr ? 
" -D SQR" : "", - ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn); - - localsize[0] = BLOCK_SIZE_X; - globalsize[0] = divUp(size.width, BLOCK_SIZE_X - (ksize.width - 1)) * BLOCK_SIZE_X; - globalsize[1] = divUp(size.height, BLOCK_SIZE_Y); - - kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, opts); - if (kernel.empty()) - return false; - - size_t kernelWorkGroupSize = kernel.workGroupSize(); - if (localsize[0] <= kernelWorkGroupSize) - break; - if (BLOCK_SIZE_X < (int)kernelWorkGroupSize) - return false; - - tryWorkItems = (int)kernelWorkGroupSize; - } - } - - _dst.create(size, CV_MAKETYPE(ddepth, cn)); - UMat dst = _dst.getUMat(); - - int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); - idxArg = kernel.set(idxArg, (int)src.step); - int srcOffsetX = (int)((src.offset % src.step) / src.elemSize()); - int srcOffsetY = (int)(src.offset / src.step); - int srcEndX = isolated ? srcOffsetX + size.width : wholeSize.width; - int srcEndY = isolated ? srcOffsetY + size.height : wholeSize.height; - idxArg = kernel.set(idxArg, srcOffsetX); - idxArg = kernel.set(idxArg, srcOffsetY); - idxArg = kernel.set(idxArg, srcEndX); - idxArg = kernel.set(idxArg, srcEndY); - idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst)); - if (normalize) - idxArg = kernel.set(idxArg, (float)alpha); - - return kernel.run(2, globalsize, localsize, false); -} - -#endif - -} - - -cv::Ptr cv::getRowSumFilter(int srcType, int sumType, int ksize, int anchor) -{ int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType); CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) ); @@ -1434,9 +1206,10 @@ cv::Ptr cv::getRowSumFilter(int srcType, int sumType, int ksi } -cv::Ptr cv::getColumnSumFilter(int sumType, int dstType, int ksize, - int anchor, double scale) +Ptr getColumnSumFilter(int sumType, int dstType, int ksize, int anchor, double scale) { + CV_INSTRUMENT_REGION(); + int sdepth = CV_MAT_DEPTH(sumType), ddepth = CV_MAT_DEPTH(dstType); CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(dstType) ); @@ -1474,9 +1247,11 @@ cv::Ptr cv::getColumnSumFilter(int sumType, int dstType, i } -cv::Ptr cv::createBoxFilter( int srcType, int dstType, Size ksize, - Point anchor, bool normalize, int borderType ) +Ptr createBoxFilter(int srcType, int dstType, Size ksize, + Point anchor, bool normalize, int borderType) { + CV_INSTRUMENT_REGION(); + int sdepth = CV_MAT_DEPTH(srcType); int cn = CV_MAT_CN(srcType), sumType = CV_64F; if( sdepth == CV_8U && CV_MAT_DEPTH(dstType) == CV_8U && @@ -1496,199 +1271,12 @@ cv::Ptr cv::createBoxFilter( int srcType, int dstType, Size ks srcType, dstType, sumType, borderType ); } -#ifdef HAVE_OPENVX -namespace cv -{ - namespace ovx { - template <> inline bool skipSmallImages(int w, int h) { return w*h < 640 * 480; } - } - static bool openvx_boxfilter(InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, - bool normalize, int borderType) - { - if (ddepth < 0) - ddepth = CV_8UC1; - if (_src.type() != CV_8UC1 || ddepth != CV_8U || !normalize || - _src.cols() < 3 || _src.rows() < 3 || - ksize.width != 3 || ksize.height != 3 || - (anchor.x >= 0 && anchor.x != 1) || - (anchor.y >= 0 && anchor.y != 1) || - ovx::skipSmallImages(_src.cols(), _src.rows())) - return false; - - Mat src = _src.getMat(); - - if ((borderType & BORDER_ISOLATED) == 0 && src.isSubmatrix()) - return false; //Process isolated borders only - vx_enum border; - switch (borderType & ~BORDER_ISOLATED) - { - case BORDER_CONSTANT: - border = VX_BORDER_CONSTANT; - break; - case BORDER_REPLICATE: - border = 
VX_BORDER_REPLICATE; - break; - default: - return false; - } - - _dst.create(src.size(), CV_8UC1); - Mat dst = _dst.getMat(); - - try - { - ivx::Context ctx = ovx::getOpenVXContext(); - - Mat a; - if (dst.data != src.data) - a = src; - else - src.copyTo(a); - - ivx::Image - ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, - ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data), - ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, - ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data); - - //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments - //since OpenVX standard says nothing about thread-safety for now - ivx::border_t prevBorder = ctx.immediateBorder(); - ctx.setImmediateBorder(border, (vx_uint8)(0)); - ivx::IVX_CHECK_STATUS(vxuBox3x3(ctx, ia, ib)); - ctx.setImmediateBorder(prevBorder); - } - catch (const ivx::RuntimeError & e) - { - VX_DbgThrow(e.what()); - } - catch (const ivx::WrapperError & e) - { - VX_DbgThrow(e.what()); - } - - return true; - } -} -#endif - -#if defined(HAVE_IPP) -namespace cv -{ -static bool ipp_boxfilter(Mat &src, Mat &dst, Size ksize, Point anchor, bool normalize, int borderType) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP(); - -#if IPP_VERSION_X100 < 201801 - // Problem with SSE42 optimization for 16s and some 8u modes - if(ipp::getIppTopFeatures() == ippCPUID_SSE42 && (((src.depth() == CV_16S || src.depth() == CV_16U) && (src.channels() == 3 || src.channels() == 4)) || (src.depth() == CV_8U && src.channels() == 3 && (ksize.width > 5 || ksize.height > 5)))) - return false; - - // Other optimizations has some degradations too - if((((src.depth() == CV_16S || src.depth() == CV_16U) && (src.channels() == 4)) || (src.depth() == CV_8U && src.channels() == 1 && (ksize.width > 5 || ksize.height > 5)))) - return false; -#endif - - if(!normalize) - return false; - - if(!ippiCheckAnchor(anchor, ksize)) - return false; - - try - { - ::ipp::IwiImage iwSrc = ippiGetImage(src); - ::ipp::IwiImage iwDst = ippiGetImage(dst); - ::ipp::IwiSize iwKSize = ippiGetSize(ksize); - ::ipp::IwiBorderSize borderSize(iwKSize); - ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize)); - if(!ippBorder) - return false; - - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBox, iwSrc, iwDst, iwKSize, ::ipp::IwDefault(), ippBorder); - } - catch (const ::ipp::IwException &) - { - return false; - } - - return true; -#else - CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(ksize); CV_UNUSED(anchor); CV_UNUSED(normalize); CV_UNUSED(borderType); - return false; -#endif -} -} -#endif - - -void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, - bool normalize, int borderType ) -{ - CV_INSTRUMENT_REGION(); - - CV_OCL_RUN(_dst.isUMat() && - (borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || - borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101), - ocl_boxFilter3x3_8UC1(_src, _dst, ddepth, ksize, anchor, borderType, normalize)) - - CV_OCL_RUN(_dst.isUMat(), ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize)) - - Mat src = _src.getMat(); - int stype = src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype); - if( ddepth < 0 ) - ddepth = sdepth; - _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) ); - Mat dst = _dst.getMat(); - if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 ) - { - if( src.rows == 1 ) - ksize.height = 1; - if( 
src.cols == 1 ) - ksize.width = 1; - } - - Point ofs; - Size wsz(src.cols, src.rows); - if(!(borderType&BORDER_ISOLATED)) - src.locateROI( wsz, ofs ); - - CALL_HAL(boxFilter, cv_hal_boxFilter, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, ddepth, cn, - ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height, - anchor.x, anchor.y, normalize, borderType&~BORDER_ISOLATED); - - CV_OVX_RUN(true, - openvx_boxfilter(src, dst, ddepth, ksize, anchor, normalize, borderType)) - - CV_IPP_RUN_FAST(ipp_boxfilter(src, dst, ksize, anchor, normalize, borderType)); - - borderType = (borderType&~BORDER_ISOLATED); - - Ptr f = createBoxFilter( src.type(), dst.type(), - ksize, anchor, normalize, borderType ); - - f->apply( src, dst, wsz, ofs ); -} - - -void cv::blur( InputArray src, OutputArray dst, - Size ksize, Point anchor, int borderType ) -{ - CV_INSTRUMENT_REGION(); - - boxFilter( src, dst, -1, ksize, anchor, true, borderType ); -} /****************************************************************************************\ Squared Box Filter \****************************************************************************************/ - -namespace cv -{ +namespace { template struct SqrRowSum : @@ -1703,6 +1291,8 @@ struct SqrRowSum : virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + const T* S = (const T*)src; ST* D = (ST*)dst; int i = 0, k, ksz_cn = ksize*cn; @@ -1727,7 +1317,9 @@ struct SqrRowSum : } }; -static Ptr getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor) +} // namespace anon + +Ptr getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor) { int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType); CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) ); @@ -1753,52 +1345,6 @@ static Ptr getSqrRowSumFilter(int srcType, int sumType, int ksize srcType, sumType)); } -} - -void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, - bool normalize, int borderType ) -{ - CV_INSTRUMENT_REGION(); - - int srcType = _src.type(), sdepth = CV_MAT_DEPTH(srcType), cn = CV_MAT_CN(srcType); - Size size = _src.size(); - - if( ddepth < 0 ) - ddepth = sdepth < CV_32F ? CV_32F : CV_64F; - - if( borderType != BORDER_CONSTANT && normalize ) - { - if( size.height == 1 ) - ksize.height = 1; - if( size.width == 1 ) - ksize.width = 1; - } - - CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2, - ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize, true)) - - int sumDepth = CV_64F; - if( sdepth == CV_8U ) - sumDepth = CV_32S; - int sumType = CV_MAKETYPE( sumDepth, cn ), dstType = CV_MAKETYPE(ddepth, cn); - - Mat src = _src.getMat(); - _dst.create( size, dstType ); - Mat dst = _dst.getMat(); - - Ptr rowFilter = getSqrRowSumFilter(srcType, sumType, ksize.width, anchor.x ); - Ptr columnFilter = getColumnSumFilter(sumType, - dstType, ksize.height, anchor.y, - normalize ? 1./(ksize.width*ksize.height) : 1); - - Ptr f = makePtr(Ptr(), rowFilter, columnFilter, - srcType, dstType, sumType, borderType ); - Point ofs; - Size wsz(src.cols, src.rows); - src.locateROI( wsz, ofs ); - - f->apply( src, dst, wsz, ofs ); -} - -/* End of file. 
*/ +#endif +CV_CPU_OPTIMIZATION_NAMESPACE_END +} // namespace From 2c07c6718fd48902c23c4012f5d57cdc2c0faa59 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Mon, 11 Mar 2019 12:37:17 +0000 Subject: [PATCH 9/9] imgproc: dispatch morph --- modules/imgproc/CMakeLists.txt | 1 + modules/imgproc/src/morph.dispatch.cpp | 794 +------------- modules/imgproc/src/morph.simd.hpp | 1327 +----------------------- 3 files changed, 68 insertions(+), 2054 deletions(-) diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt index d60fa7c58f..0c7b3268df 100644 --- a/modules/imgproc/CMakeLists.txt +++ b/modules/imgproc/CMakeLists.txt @@ -7,5 +7,6 @@ ocv_add_dispatched_file(color_hsv SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_rgb SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_yuv SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(median_blur SSE2 SSE4_1 AVX2) +ocv_add_dispatched_file(morph SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(smooth SSE2 SSE4_1 AVX2) ocv_define_module(imgproc opencv_core WRAP java python js) diff --git a/modules/imgproc/src/morph.dispatch.cpp b/modules/imgproc/src/morph.dispatch.cpp index c18e5c8066..326bc66593 100644 --- a/modules/imgproc/src/morph.dispatch.cpp +++ b/modules/imgproc/src/morph.dispatch.cpp @@ -48,779 +48,49 @@ #include "opencv2/core/hal/intrin.hpp" #include +#include "morph.simd.hpp" +#include "morph.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + + /****************************************************************************************\ Basic Morphological Operations: Erosion & Dilation \****************************************************************************************/ -using namespace std; - -namespace cv -{ - -template struct MinOp -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator ()(const T a, const T b) const { return std::min(a, b); } -}; - -template struct MaxOp -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator ()(const T a, const T b) const { return std::max(a, b); } -}; - -#undef CV_MIN_8U -#undef CV_MAX_8U -#define CV_MIN_8U(a,b) ((a) - CV_FAST_CAST_8U((a) - (b))) -#define CV_MAX_8U(a,b) ((a) + CV_FAST_CAST_8U((b) - (a))) - -template<> inline uchar MinOp::operator ()(const uchar a, const uchar b) const { return CV_MIN_8U(a, b); } -template<> inline uchar MaxOp::operator ()(const uchar a, const uchar b) const { return CV_MAX_8U(a, b); } - -struct MorphRowNoVec -{ - MorphRowNoVec(int, int) {} - int operator()(const uchar*, uchar*, int, int) const { return 0; } -}; - -struct MorphColumnNoVec -{ - MorphColumnNoVec(int, int) {} - int operator()(const uchar**, uchar*, int, int, int) const { return 0; } -}; - -struct MorphNoVec -{ - int operator()(uchar**, int, uchar*, int) const { return 0; } -}; - -#if CV_SIMD - -template struct MorphRowVec -{ - typedef typename VecUpdate::vtype vtype; - typedef typename vtype::lane_type stype; - MorphRowVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} - int operator()(const uchar* src, uchar* dst, int width, int cn) const - { - int i, k, _ksize = ksize*cn; - width *= cn; - VecUpdate updateOp; - - for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes ) - { - vtype s0 = vx_load((const stype*)src + i); - vtype s1 = vx_load((const stype*)src + i + vtype::nlanes); - vtype s2 = vx_load((const stype*)src + i + 2*vtype::nlanes); - vtype s3 = vx_load((const stype*)src + i + 3*vtype::nlanes); - for (k = cn; k < _ksize; k += cn) - { - s0 = updateOp(s0, vx_load((const stype*)src + 
i + k)); - s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes)); - s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*vtype::nlanes)); - s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*vtype::nlanes)); - } - v_store((stype*)dst + i, s0); - v_store((stype*)dst + i + vtype::nlanes, s1); - v_store((stype*)dst + i + 2*vtype::nlanes, s2); - v_store((stype*)dst + i + 3*vtype::nlanes, s3); - } - if( i <= width - 2*vtype::nlanes ) - { - vtype s0 = vx_load((const stype*)src + i); - vtype s1 = vx_load((const stype*)src + i + vtype::nlanes); - for( k = cn; k < _ksize; k += cn ) - { - s0 = updateOp(s0, vx_load((const stype*)src + i + k)); - s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes)); - } - v_store((stype*)dst + i, s0); - v_store((stype*)dst + i + vtype::nlanes, s1); - i += 2*vtype::nlanes; - } - if( i <= width - vtype::nlanes ) - { - vtype s = vx_load((const stype*)src + i); - for( k = cn; k < _ksize; k += cn ) - s = updateOp(s, vx_load((const stype*)src + i + k)); - v_store((stype*)dst + i, s); - i += vtype::nlanes; - } - if( i <= width - vtype::nlanes/2 ) - { - vtype s = vx_load_low((const stype*)src + i); - for( k = cn; k < _ksize; k += cn ) - s = updateOp(s, vx_load_low((const stype*)src + i + k)); - v_store_low((stype*)dst + i, s); - i += vtype::nlanes/2; - } - - return i - i % cn; - } - - int ksize, anchor; -}; - - -template struct MorphColumnVec -{ - typedef typename VecUpdate::vtype vtype; - typedef typename vtype::lane_type stype; - MorphColumnVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} - int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const - { - int i = 0, k, _ksize = ksize; - VecUpdate updateOp; - - for( i = 0; i < count + ksize - 1; i++ ) - CV_Assert( ((size_t)_src[i] & (CV_SIMD_WIDTH-1)) == 0 ); - - const stype** src = (const stype**)_src; - stype* dst = (stype*)_dst; - dststep /= sizeof(dst[0]); - - for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 ) - { - for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes) - { - const stype* sptr = src[1] + i; - vtype s0 = vx_load_aligned(sptr); - vtype s1 = vx_load_aligned(sptr + vtype::nlanes); - vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes); - vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes); - - for( k = 2; k < _ksize; k++ ) - { - sptr = src[k] + i; - s0 = updateOp(s0, vx_load_aligned(sptr)); - s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); - s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)); - s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)); - } - - sptr = src[0] + i; - v_store(dst + i, updateOp(s0, vx_load_aligned(sptr))); - v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); - v_store(dst + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes))); - v_store(dst + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes))); - - sptr = src[k] + i; - v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr))); - v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); - v_store(dst + dststep + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes))); - v_store(dst + dststep + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes))); - } - if( i <= width - 2*vtype::nlanes ) - { - const stype* sptr = src[1] + i; - vtype s0 = vx_load_aligned(sptr); - vtype s1 = vx_load_aligned(sptr + vtype::nlanes); - - for( k = 2; k < 
_ksize; k++ ) - { - sptr = src[k] + i; - s0 = updateOp(s0, vx_load_aligned(sptr)); - s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); - } - - sptr = src[0] + i; - v_store(dst + i, updateOp(s0, vx_load_aligned(sptr))); - v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); - - sptr = src[k] + i; - v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr))); - v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); - i += 2*vtype::nlanes; - } - if( i <= width - vtype::nlanes ) - { - vtype s0 = vx_load_aligned(src[1] + i); - - for( k = 2; k < _ksize; k++ ) - s0 = updateOp(s0, vx_load_aligned(src[k] + i)); - - v_store(dst + i, updateOp(s0, vx_load_aligned(src[0] + i))); - v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(src[k] + i))); - i += vtype::nlanes; - } - if( i <= width - vtype::nlanes/2 ) - { - vtype s0 = vx_load_low(src[1] + i); - - for( k = 2; k < _ksize; k++ ) - s0 = updateOp(s0, vx_load_low(src[k] + i)); - - v_store_low(dst + i, updateOp(s0, vx_load_low(src[0] + i))); - v_store_low(dst + dststep + i, updateOp(s0, vx_load_low(src[k] + i))); - i += vtype::nlanes/2; - } - } - - for( ; count > 0; count--, dst += dststep, src++ ) - { - for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes) - { - const stype* sptr = src[0] + i; - vtype s0 = vx_load_aligned(sptr); - vtype s1 = vx_load_aligned(sptr + vtype::nlanes); - vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes); - vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes); - - for( k = 1; k < _ksize; k++ ) - { - sptr = src[k] + i; - s0 = updateOp(s0, vx_load_aligned(sptr)); - s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); - s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)); - s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)); - } - v_store(dst + i, s0); - v_store(dst + i + vtype::nlanes, s1); - v_store(dst + i + 2*vtype::nlanes, s2); - v_store(dst + i + 3*vtype::nlanes, s3); - } - if( i <= width - 2*vtype::nlanes ) - { - const stype* sptr = src[0] + i; - vtype s0 = vx_load_aligned(sptr); - vtype s1 = vx_load_aligned(sptr + vtype::nlanes); - - for( k = 1; k < _ksize; k++ ) - { - sptr = src[k] + i; - s0 = updateOp(s0, vx_load_aligned(sptr)); - s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); - } - v_store(dst + i, s0); - v_store(dst + i + vtype::nlanes, s1); - i += 2*vtype::nlanes; - } - if( i <= width - vtype::nlanes ) - { - vtype s0 = vx_load_aligned(src[0] + i); - - for( k = 1; k < _ksize; k++ ) - s0 = updateOp(s0, vx_load_aligned(src[k] + i)); - v_store(dst + i, s0); - i += vtype::nlanes; - } - if( i <= width - vtype::nlanes/2 ) - { - vtype s0 = vx_load_low(src[0] + i); - - for( k = 1; k < _ksize; k++ ) - s0 = updateOp(s0, vx_load_low(src[k] + i)); - v_store_low(dst + i, s0); - i += vtype::nlanes/2; - } - } - - return i; - } - - int ksize, anchor; -}; - - -template struct MorphVec -{ - typedef typename VecUpdate::vtype vtype; - typedef typename vtype::lane_type stype; - int operator()(uchar** _src, int nz, uchar* _dst, int width) const - { - const stype** src = (const stype**)_src; - stype* dst = (stype*)_dst; - int i, k; - VecUpdate updateOp; - - for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes ) - { - const stype* sptr = src[0] + i; - vtype s0 = vx_load(sptr); - vtype s1 = vx_load(sptr + vtype::nlanes); - vtype s2 = vx_load(sptr + 2*vtype::nlanes); - vtype s3 = vx_load(sptr + 3*vtype::nlanes); - for( k = 1; k < nz; k++ ) - { - sptr = src[k] + i; - s0 = updateOp(s0, vx_load(sptr)); - 
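// The wide path keeps four independent vector accumulators (s0..s3) covering
// 4*nlanes pixels per iteration, so successive v_min/v_max updates do not
// serialize into one long dependency chain; the narrower 2*nlanes, nlanes and
// half-vector tail passes below mop up what remains before scalar code takes
// over.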
s1 = updateOp(s1, vx_load(sptr + vtype::nlanes)); - s2 = updateOp(s2, vx_load(sptr + 2*vtype::nlanes)); - s3 = updateOp(s3, vx_load(sptr + 3*vtype::nlanes)); - } - v_store(dst + i, s0); - v_store(dst + i + vtype::nlanes, s1); - v_store(dst + i + 2*vtype::nlanes, s2); - v_store(dst + i + 3*vtype::nlanes, s3); - } - if( i <= width - 2*vtype::nlanes ) - { - const stype* sptr = src[0] + i; - vtype s0 = vx_load(sptr); - vtype s1 = vx_load(sptr + vtype::nlanes); - for( k = 1; k < nz; k++ ) - { - sptr = src[k] + i; - s0 = updateOp(s0, vx_load(sptr)); - s1 = updateOp(s1, vx_load(sptr + vtype::nlanes)); - } - v_store(dst + i, s0); - v_store(dst + i + vtype::nlanes, s1); - i += 2*vtype::nlanes; - } - if( i <= width - vtype::nlanes ) - { - vtype s0 = vx_load(src[0] + i); - for( k = 1; k < nz; k++ ) - s0 = updateOp(s0, vx_load(src[k] + i)); - v_store(dst + i, s0); - i += vtype::nlanes; - } - if( i <= width - vtype::nlanes/2 ) - { - vtype s0 = vx_load_low(src[0] + i); - for( k = 1; k < nz; k++ ) - s0 = updateOp(s0, vx_load_low(src[k] + i)); - v_store_low(dst + i, s0); - i += vtype::nlanes/2; - } - return i; - } -}; - -template struct VMin -{ - typedef T vtype; - vtype operator()(const vtype& a, const vtype& b) const { return v_min(a,b); } -}; -template struct VMax -{ - typedef T vtype; - vtype operator()(const vtype& a, const vtype& b) const { return v_max(a,b); } -}; - -typedef MorphRowVec > ErodeRowVec8u; -typedef MorphRowVec > DilateRowVec8u; -typedef MorphRowVec > ErodeRowVec16u; -typedef MorphRowVec > DilateRowVec16u; -typedef MorphRowVec > ErodeRowVec16s; -typedef MorphRowVec > DilateRowVec16s; -typedef MorphRowVec > ErodeRowVec32f; -typedef MorphRowVec > DilateRowVec32f; - -typedef MorphColumnVec > ErodeColumnVec8u; -typedef MorphColumnVec > DilateColumnVec8u; -typedef MorphColumnVec > ErodeColumnVec16u; -typedef MorphColumnVec > DilateColumnVec16u; -typedef MorphColumnVec > ErodeColumnVec16s; -typedef MorphColumnVec > DilateColumnVec16s; -typedef MorphColumnVec > ErodeColumnVec32f; -typedef MorphColumnVec > DilateColumnVec32f; - -typedef MorphVec > ErodeVec8u; -typedef MorphVec > DilateVec8u; -typedef MorphVec > ErodeVec16u; -typedef MorphVec > DilateVec16u; -typedef MorphVec > ErodeVec16s; -typedef MorphVec > DilateVec16s; -typedef MorphVec > ErodeVec32f; -typedef MorphVec > DilateVec32f; - -#else - -typedef MorphRowNoVec ErodeRowVec8u; -typedef MorphRowNoVec DilateRowVec8u; - -typedef MorphColumnNoVec ErodeColumnVec8u; -typedef MorphColumnNoVec DilateColumnVec8u; - -typedef MorphRowNoVec ErodeRowVec16u; -typedef MorphRowNoVec DilateRowVec16u; -typedef MorphRowNoVec ErodeRowVec16s; -typedef MorphRowNoVec DilateRowVec16s; -typedef MorphRowNoVec ErodeRowVec32f; -typedef MorphRowNoVec DilateRowVec32f; - -typedef MorphColumnNoVec ErodeColumnVec16u; -typedef MorphColumnNoVec DilateColumnVec16u; -typedef MorphColumnNoVec ErodeColumnVec16s; -typedef MorphColumnNoVec DilateColumnVec16s; -typedef MorphColumnNoVec ErodeColumnVec32f; -typedef MorphColumnNoVec DilateColumnVec32f; - -typedef MorphNoVec ErodeVec8u; -typedef MorphNoVec DilateVec8u; -typedef MorphNoVec ErodeVec16u; -typedef MorphNoVec DilateVec16u; -typedef MorphNoVec ErodeVec16s; -typedef MorphNoVec DilateVec16s; -typedef MorphNoVec ErodeVec32f; -typedef MorphNoVec DilateVec32f; - -#endif - -typedef MorphRowNoVec ErodeRowVec64f; -typedef MorphRowNoVec DilateRowVec64f; -typedef MorphColumnNoVec ErodeColumnVec64f; -typedef MorphColumnNoVec DilateColumnVec64f; -typedef MorphNoVec ErodeVec64f; -typedef MorphNoVec DilateVec64f; - - -template 
struct MorphRowFilter : public BaseRowFilter -{ - typedef typename Op::rtype T; - - MorphRowFilter( int _ksize, int _anchor ) : vecOp(_ksize, _anchor) - { - ksize = _ksize; - anchor = _anchor; - } - - void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE - { - int i, j, k, _ksize = ksize*cn; - const T* S = (const T*)src; - Op op; - T* D = (T*)dst; - - if( _ksize == cn ) - { - for( i = 0; i < width*cn; i++ ) - D[i] = S[i]; - return; - } - - int i0 = vecOp(src, dst, width, cn); - width *= cn; - - for( k = 0; k < cn; k++, S++, D++ ) - { - for( i = i0; i <= width - cn*2; i += cn*2 ) - { - const T* s = S + i; - T m = s[cn]; - for( j = cn*2; j < _ksize; j += cn ) - m = op(m, s[j]); - D[i] = op(m, s[0]); - D[i+cn] = op(m, s[j]); - } - - for( ; i < width; i += cn ) - { - const T* s = S + i; - T m = s[0]; - for( j = cn; j < _ksize; j += cn ) - m = op(m, s[j]); - D[i] = m; - } - } - } - - VecOp vecOp; -}; - - -template struct MorphColumnFilter : public BaseColumnFilter -{ - typedef typename Op::rtype T; - - MorphColumnFilter( int _ksize, int _anchor ) : vecOp(_ksize, _anchor) - { - ksize = _ksize; - anchor = _anchor; - } - - void operator()(const uchar** _src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int i, k, _ksize = ksize; - const T** src = (const T**)_src; - T* D = (T*)dst; - Op op; - - int i0 = vecOp(_src, dst, dststep, count, width); - dststep /= sizeof(D[0]); - - for( ; _ksize > 1 && count > 1; count -= 2, D += dststep*2, src += 2 ) - { - i = i0; - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - const T* sptr = src[1] + i; - T s0 = sptr[0], s1 = sptr[1], s2 = sptr[2], s3 = sptr[3]; - - for( k = 2; k < _ksize; k++ ) - { - sptr = src[k] + i; - s0 = op(s0, sptr[0]); s1 = op(s1, sptr[1]); - s2 = op(s2, sptr[2]); s3 = op(s3, sptr[3]); - } - - sptr = src[0] + i; - D[i] = op(s0, sptr[0]); - D[i+1] = op(s1, sptr[1]); - D[i+2] = op(s2, sptr[2]); - D[i+3] = op(s3, sptr[3]); - - sptr = src[k] + i; - D[i+dststep] = op(s0, sptr[0]); - D[i+dststep+1] = op(s1, sptr[1]); - D[i+dststep+2] = op(s2, sptr[2]); - D[i+dststep+3] = op(s3, sptr[3]); - } - #endif - for( ; i < width; i++ ) - { - T s0 = src[1][i]; - - for( k = 2; k < _ksize; k++ ) - s0 = op(s0, src[k][i]); - - D[i] = op(s0, src[0][i]); - D[i+dststep] = op(s0, src[k][i]); - } - } - - for( ; count > 0; count--, D += dststep, src++ ) - { - i = i0; - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - const T* sptr = src[0] + i; - T s0 = sptr[0], s1 = sptr[1], s2 = sptr[2], s3 = sptr[3]; - - for( k = 1; k < _ksize; k++ ) - { - sptr = src[k] + i; - s0 = op(s0, sptr[0]); s1 = op(s1, sptr[1]); - s2 = op(s2, sptr[2]); s3 = op(s3, sptr[3]); - } - - D[i] = s0; D[i+1] = s1; - D[i+2] = s2; D[i+3] = s3; - } - #endif - for( ; i < width; i++ ) - { - T s0 = src[0][i]; - for( k = 1; k < _ksize; k++ ) - s0 = op(s0, src[k][i]); - D[i] = s0; - } - } - } - - VecOp vecOp; -}; - - -template struct MorphFilter : BaseFilter -{ - typedef typename Op::rtype T; - - MorphFilter( const Mat& _kernel, Point _anchor ) - { - anchor = _anchor; - ksize = _kernel.size(); - CV_Assert( _kernel.type() == CV_8U ); - - std::vector coeffs; // we do not really the values of non-zero - // kernel elements, just their locations - preprocess2DKernel( _kernel, coords, coeffs ); - ptrs.resize( coords.size() ); - } - - void operator()(const uchar** src, uchar* dst, int dststep, int count, int width, int cn) CV_OVERRIDE - { - const Point* pt = &coords[0]; - const T** kp = (const T**)&ptrs[0]; - int i, k, nz = (int)coords.size(); - 
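// MorphFilter handles arbitrary (non-rectangular) structuring elements:
// preprocess2DKernel() stored the coordinates of every non-zero kernel cell
// in 'coords', and for each output row the code materializes one source
// pointer per non-zero cell in 'kp' and reduces them with the min/max
// functor, so zero cells in the kernel are never touched.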
Op op; - - width *= cn; - for( ; count > 0; count--, dst += dststep, src++ ) - { - T* D = (T*)dst; - - for( k = 0; k < nz; k++ ) - kp[k] = (const T*)src[pt[k].y] + pt[k].x*cn; - - i = vecOp(&ptrs[0], nz, dst, width); - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - const T* sptr = kp[0] + i; - T s0 = sptr[0], s1 = sptr[1], s2 = sptr[2], s3 = sptr[3]; - - for( k = 1; k < nz; k++ ) - { - sptr = kp[k] + i; - s0 = op(s0, sptr[0]); s1 = op(s1, sptr[1]); - s2 = op(s2, sptr[2]); s3 = op(s3, sptr[3]); - } - - D[i] = s0; D[i+1] = s1; - D[i+2] = s2; D[i+3] = s3; - } - #endif - for( ; i < width; i++ ) - { - T s0 = kp[0][i]; - for( k = 1; k < nz; k++ ) - s0 = op(s0, kp[k][i]); - D[i] = s0; - } - } - } - - std::vector coords; - std::vector ptrs; - VecOp vecOp; -}; - -} +namespace cv { /////////////////////////////////// External Interface ///////////////////////////////////// -cv::Ptr cv::getMorphologyRowFilter(int op, int type, int ksize, int anchor) +Ptr getMorphologyRowFilter(int op, int type, int ksize, int anchor) { - int depth = CV_MAT_DEPTH(type); - if( anchor < 0 ) - anchor = ksize/2; - CV_Assert( op == MORPH_ERODE || op == MORPH_DILATE ); - if( op == MORPH_ERODE ) - { - if( depth == CV_8U ) - return makePtr, - ErodeRowVec8u> >(ksize, anchor); - if( depth == CV_16U ) - return makePtr, - ErodeRowVec16u> >(ksize, anchor); - if( depth == CV_16S ) - return makePtr, - ErodeRowVec16s> >(ksize, anchor); - if( depth == CV_32F ) - return makePtr, - ErodeRowVec32f> >(ksize, anchor); - if( depth == CV_64F ) - return makePtr, - ErodeRowVec64f> >(ksize, anchor); - } - else - { - if( depth == CV_8U ) - return makePtr, - DilateRowVec8u> >(ksize, anchor); - if( depth == CV_16U ) - return makePtr, - DilateRowVec16u> >(ksize, anchor); - if( depth == CV_16S ) - return makePtr, - DilateRowVec16s> >(ksize, anchor); - if( depth == CV_32F ) - return makePtr, - DilateRowVec32f> >(ksize, anchor); - if( depth == CV_64F ) - return makePtr, - DilateRowVec64f> >(ksize, anchor); - } + CV_INSTRUMENT_REGION(); - CV_Error_( CV_StsNotImplemented, ("Unsupported data type (=%d)", type)); + CV_CPU_DISPATCH(getMorphologyRowFilter, (op, type, ksize, anchor), + CV_CPU_DISPATCH_MODES_ALL); } -cv::Ptr cv::getMorphologyColumnFilter(int op, int type, int ksize, int anchor) +Ptr getMorphologyColumnFilter(int op, int type, int ksize, int anchor) { - int depth = CV_MAT_DEPTH(type); - if( anchor < 0 ) - anchor = ksize/2; - CV_Assert( op == MORPH_ERODE || op == MORPH_DILATE ); - if( op == MORPH_ERODE ) - { - if( depth == CV_8U ) - return makePtr, - ErodeColumnVec8u> >(ksize, anchor); - if( depth == CV_16U ) - return makePtr, - ErodeColumnVec16u> >(ksize, anchor); - if( depth == CV_16S ) - return makePtr, - ErodeColumnVec16s> >(ksize, anchor); - if( depth == CV_32F ) - return makePtr, - ErodeColumnVec32f> >(ksize, anchor); - if( depth == CV_64F ) - return makePtr, - ErodeColumnVec64f> >(ksize, anchor); - } - else - { - if( depth == CV_8U ) - return makePtr, - DilateColumnVec8u> >(ksize, anchor); - if( depth == CV_16U ) - return makePtr, - DilateColumnVec16u> >(ksize, anchor); - if( depth == CV_16S ) - return makePtr, - DilateColumnVec16s> >(ksize, anchor); - if( depth == CV_32F ) - return makePtr, - DilateColumnVec32f> >(ksize, anchor); - if( depth == CV_64F ) - return makePtr, - DilateColumnVec64f> >(ksize, anchor); - } + CV_INSTRUMENT_REGION(); - CV_Error_( CV_StsNotImplemented, ("Unsupported data type (=%d)", type)); + CV_CPU_DISPATCH(getMorphologyColumnFilter, (op, type, ksize, anchor), + CV_CPU_DISPATCH_MODES_ALL); } 
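The wrappers above are all that remains in the dispatch layer: each public entry point forwards through CV_CPU_DISPATCH, and the per-ISA bodies live in morph.simd.hpp, which CMake compiles once per entry in ocv_add_dispatched_file(morph SSE2 SSE4_1 AVX2). Conceptually the generated code boils down to a run-time feature check that picks the best compiled variant. The sketch below is a minimal hand-written analogue of that control flow, not the generated code: the namespace names, the feature probes and filterRow() are illustrative stand-ins; the real chain is produced by CMake and consults OpenCV's run-time CPU feature detection (cv::checkHardwareSupport() is the public face of it).

    // Minimal sketch of the CV_CPU_DISPATCH pattern (illustrative only;
    // all names below are stand-ins, not the CMake-generated ones).
    #include <cstdio>

    namespace opt_avx2     { inline void filterRow() { std::puts("AVX2 body");     } }
    namespace opt_sse41    { inline void filterRow() { std::puts("SSE4.1 body");   } }
    namespace opt_baseline { inline void filterRow() { std::puts("baseline body"); } }

    // Stand-ins for the real CPU feature queries.
    static bool haveAVX2()  { return false; } // assume the host lacks AVX2
    static bool haveSSE41() { return true;  } // assume SSE4.1 is present

    void filterRow() // plays the role of the *.dispatch.cpp wrapper
    {
        if (haveAVX2())  { opt_avx2::filterRow();  return; } // best ISA first
        if (haveSSE41()) { opt_sse41::filterRow(); return; }
        opt_baseline::filterRow(); // always-available fallback
    }

    int main() { filterRow(); } // prints "SSE4.1 body" under these assumptions

One such wrapper per public function lets the binary carry every compiled variant while executing only the one the host CPU supports. It is also why the patch strips the OpenCL/IPP/OpenVX includes out of the .simd.hpp files: those are alternative backends chosen once at the dispatch level, not per-ISA variants that should be recompiled for every instruction set.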
-cv::Ptr cv::getMorphologyFilter(int op, int type, InputArray _kernel, Point anchor) +Ptr getMorphologyFilter(int op, int type, InputArray _kernel, Point anchor) { + CV_INSTRUMENT_REGION(); + Mat kernel = _kernel.getMat(); - int depth = CV_MAT_DEPTH(type); - anchor = normalizeAnchor(anchor, kernel.size()); - CV_Assert( op == MORPH_ERODE || op == MORPH_DILATE ); - if( op == MORPH_ERODE ) - { - if( depth == CV_8U ) - return makePtr, ErodeVec8u> >(kernel, anchor); - if( depth == CV_16U ) - return makePtr, ErodeVec16u> >(kernel, anchor); - if( depth == CV_16S ) - return makePtr, ErodeVec16s> >(kernel, anchor); - if( depth == CV_32F ) - return makePtr, ErodeVec32f> >(kernel, anchor); - if( depth == CV_64F ) - return makePtr, ErodeVec64f> >(kernel, anchor); - } - else - { - if( depth == CV_8U ) - return makePtr, DilateVec8u> >(kernel, anchor); - if( depth == CV_16U ) - return makePtr, DilateVec16u> >(kernel, anchor); - if( depth == CV_16S ) - return makePtr, DilateVec16s> >(kernel, anchor); - if( depth == CV_32F ) - return makePtr, DilateVec32f> >(kernel, anchor); - if( depth == CV_64F ) - return makePtr, DilateVec64f> >(kernel, anchor); - } - - CV_Error_( CV_StsNotImplemented, ("Unsupported data type (=%d)", type)); + CV_CPU_DISPATCH(getMorphologyFilter, (op, type, kernel, anchor), + CV_CPU_DISPATCH_MODES_ALL); } -cv::Ptr cv::createMorphologyFilter( int op, int type, InputArray _kernel, - Point anchor, int _rowBorderType, int _columnBorderType, - const Scalar& _borderValue ) +Ptr createMorphologyFilter( + int op, int type, InputArray _kernel, + Point anchor, int _rowBorderType, int _columnBorderType, + const Scalar& _borderValue) { Mat kernel = _kernel.getMat(); anchor = normalizeAnchor(anchor, kernel.size()); @@ -862,7 +132,7 @@ cv::Ptr cv::createMorphologyFilter( int op, int type, InputArr } -cv::Mat cv::getStructuringElement(int shape, Size ksize, Point anchor) +Mat getStructuringElement(int shape, Size ksize, Point anchor) { int i, j; int r = 0, c = 0; @@ -915,9 +185,6 @@ cv::Mat cv::getStructuringElement(int shape, Size ksize, Point anchor) return elem; } -namespace cv -{ - // ===== 1. 
replacement implementation static bool halMorph(int op, int src_type, int dst_type, @@ -1732,9 +999,7 @@ static void morphOp( int op, InputArray _src, OutputArray _dst, (src.isSubmatrix() && !isolated)); } -} - -void cv::erode( InputArray src, OutputArray dst, InputArray kernel, +void erode( InputArray src, OutputArray dst, InputArray kernel, Point anchor, int iterations, int borderType, const Scalar& borderValue ) { @@ -1744,7 +1009,7 @@ void cv::erode( InputArray src, OutputArray dst, InputArray kernel, } -void cv::dilate( InputArray src, OutputArray dst, InputArray kernel, +void dilate( InputArray src, OutputArray dst, InputArray kernel, Point anchor, int iterations, int borderType, const Scalar& borderValue ) { @@ -1755,8 +1020,6 @@ void cv::dilate( InputArray src, OutputArray dst, InputArray kernel, #ifdef HAVE_OPENCL -namespace cv { - static bool ocl_morphologyEx(InputArray _src, OutputArray _dst, int op, InputArray kernel, Point anchor, int iterations, int borderType, const Scalar& borderValue) @@ -1813,13 +1076,11 @@ static bool ocl_morphologyEx(InputArray _src, OutputArray _dst, int op, return true; } -} #endif #define IPP_DISABLE_MORPH_ADV 1 #ifdef HAVE_IPP #if !IPP_DISABLE_MORPH_ADV -namespace cv { static bool ipp_morphologyEx(int op, InputArray _src, OutputArray _dst, InputArray _kernel, Point anchor, int iterations, @@ -1884,11 +1145,10 @@ static bool ipp_morphologyEx(int op, InputArray _src, OutputArray _dst, return false; #endif } -} #endif #endif -void cv::morphologyEx( InputArray _src, OutputArray _dst, int op, +void morphologyEx( InputArray _src, OutputArray _dst, int op, InputArray _kernel, Point anchor, int iterations, int borderType, const Scalar& borderValue ) { @@ -1985,6 +1245,8 @@ void cv::morphologyEx( InputArray _src, OutputArray _dst, int op, } } +} // namespace cv + CV_IMPL IplConvKernel * cvCreateStructuringElementEx( int cols, int rows, int anchorX, int anchorY, diff --git a/modules/imgproc/src/morph.simd.hpp b/modules/imgproc/src/morph.simd.hpp index c18e5c8066..9b3023f8f0 100644 --- a/modules/imgproc/src/morph.simd.hpp +++ b/modules/imgproc/src/morph.simd.hpp @@ -42,21 +42,22 @@ #include "precomp.hpp" #include -#include "opencl_kernels_imgproc.hpp" -#include -#include "hal_replacement.hpp" #include "opencv2/core/hal/intrin.hpp" -#include /****************************************************************************************\ Basic Morphological Operations: Erosion & Dilation \****************************************************************************************/ -using namespace std; +namespace cv { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations +Ptr getMorphologyRowFilter(int op, int type, int ksize, int anchor); +Ptr getMorphologyColumnFilter(int op, int type, int ksize, int anchor); +Ptr getMorphologyFilter(int op, int type, const Mat& kernel, Point anchor); -namespace cv -{ +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY +namespace { template struct MinOp { typedef T type1; @@ -73,6 +74,9 @@ template struct MaxOp T operator ()(const T a, const T b) const { return std::max(a, b); } }; + +#if !defined(CV_SIMD) // min/max operation are usually fast enough (without using of control flow 'if' statements) + #undef CV_MIN_8U #undef CV_MAX_8U #define CV_MIN_8U(a,b) ((a) - CV_FAST_CAST_8U((a) - (b))) @@ -81,6 +85,10 @@ template struct MaxOp template<> inline uchar MinOp::operator ()(const uchar a, const uchar b) const { return CV_MIN_8U(a, b); } template<> inline uchar MaxOp::operator ()(const uchar a, const uchar b) const { return CV_MAX_8U(a, 
b); } +#endif + + + struct MorphRowNoVec { MorphRowNoVec(int, int) {} @@ -107,6 +115,8 @@ template struct MorphRowVec MorphRowVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} int operator()(const uchar* src, uchar* dst, int width, int cn) const { + CV_INSTRUMENT_REGION(); + int i, k, _ksize = ksize*cn; width *= cn; VecUpdate updateOp; @@ -173,6 +183,8 @@ template struct MorphColumnVec MorphColumnVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const { + CV_INSTRUMENT_REGION(); + int i = 0, k, _ksize = ksize; VecUpdate updateOp; @@ -332,6 +344,8 @@ template struct MorphVec typedef typename vtype::lane_type stype; int operator()(uchar** _src, int nz, uchar* _dst, int width) const { + CV_INSTRUMENT_REGION(); + const stype** src = (const stype**)_src; stype* dst = (stype*)_dst; int i, k; @@ -483,6 +497,8 @@ template struct MorphRowFilter : public BaseRowFilter void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int i, j, k, _ksize = ksize*cn; const T* S = (const T*)src; Op op; @@ -537,6 +553,8 @@ template struct MorphColumnFilter : public BaseColumnFilt void operator()(const uchar** _src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int i, k, _ksize = ksize; const T** src = (const T**)_src; T* D = (T*)dst; @@ -638,6 +656,8 @@ template struct MorphFilter : BaseFilter void operator()(const uchar** src, uchar* dst, int dststep, int count, int width, int cn) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + const Point* pt = &coords[0]; const T** kp = (const T**)&ptrs[0]; int i, k, nz = (int)coords.size(); @@ -684,12 +704,14 @@ template struct MorphFilter : BaseFilter VecOp vecOp; }; -} +} // namespace anon /////////////////////////////////// External Interface ///////////////////////////////////// -cv::Ptr cv::getMorphologyRowFilter(int op, int type, int ksize, int anchor) +Ptr getMorphologyRowFilter(int op, int type, int ksize, int anchor) { + CV_INSTRUMENT_REGION(); + int depth = CV_MAT_DEPTH(type); if( anchor < 0 ) anchor = ksize/2; @@ -734,8 +756,10 @@ cv::Ptr cv::getMorphologyRowFilter(int op, int type, int ksiz CV_Error_( CV_StsNotImplemented, ("Unsupported data type (=%d)", type)); } -cv::Ptr cv::getMorphologyColumnFilter(int op, int type, int ksize, int anchor) +Ptr getMorphologyColumnFilter(int op, int type, int ksize, int anchor) { + CV_INSTRUMENT_REGION(); + int depth = CV_MAT_DEPTH(type); if( anchor < 0 ) anchor = ksize/2; @@ -780,10 +804,10 @@ cv::Ptr cv::getMorphologyColumnFilter(int op, int type, in CV_Error_( CV_StsNotImplemented, ("Unsupported data type (=%d)", type)); } - -cv::Ptr cv::getMorphologyFilter(int op, int type, InputArray _kernel, Point anchor) +Ptr getMorphologyFilter(int op, int type, const Mat& kernel, Point anchor) { - Mat kernel = _kernel.getMat(); + CV_INSTRUMENT_REGION(); + int depth = CV_MAT_DEPTH(type); anchor = normalizeAnchor(anchor, kernel.size()); CV_Assert( op == MORPH_ERODE || op == MORPH_DILATE ); @@ -817,1279 +841,6 @@ cv::Ptr cv::getMorphologyFilter(int op, int type, InputArray _ke CV_Error_( CV_StsNotImplemented, ("Unsupported data type (=%d)", type)); } - -cv::Ptr cv::createMorphologyFilter( int op, int type, InputArray _kernel, - Point anchor, int _rowBorderType, int _columnBorderType, - const Scalar& _borderValue ) -{ - Mat kernel = _kernel.getMat(); - anchor = normalizeAnchor(anchor, kernel.size()); - - Ptr rowFilter; - Ptr columnFilter; - Ptr 
filter2D; - - if( countNonZero(kernel) == kernel.rows*kernel.cols ) - { - // rectangular structuring element - rowFilter = getMorphologyRowFilter(op, type, kernel.cols, anchor.x); - columnFilter = getMorphologyColumnFilter(op, type, kernel.rows, anchor.y); - } - else - filter2D = getMorphologyFilter(op, type, kernel, anchor); - - Scalar borderValue = _borderValue; - if( (_rowBorderType == BORDER_CONSTANT || _columnBorderType == BORDER_CONSTANT) && - borderValue == morphologyDefaultBorderValue() ) - { - int depth = CV_MAT_DEPTH(type); - CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_16S || - depth == CV_32F || depth == CV_64F ); - if( op == MORPH_ERODE ) - borderValue = Scalar::all( depth == CV_8U ? (double)UCHAR_MAX : - depth == CV_16U ? (double)USHRT_MAX : - depth == CV_16S ? (double)SHRT_MAX : - depth == CV_32F ? (double)FLT_MAX : DBL_MAX); - else - borderValue = Scalar::all( depth == CV_8U || depth == CV_16U ? - 0. : - depth == CV_16S ? (double)SHRT_MIN : - depth == CV_32F ? (double)-FLT_MAX : -DBL_MAX); - } - - return makePtr(filter2D, rowFilter, columnFilter, - type, type, type, _rowBorderType, _columnBorderType, borderValue ); -} - - -cv::Mat cv::getStructuringElement(int shape, Size ksize, Point anchor) -{ - int i, j; - int r = 0, c = 0; - double inv_r2 = 0; - - CV_Assert( shape == MORPH_RECT || shape == MORPH_CROSS || shape == MORPH_ELLIPSE ); - - anchor = normalizeAnchor(anchor, ksize); - - if( ksize == Size(1,1) ) - shape = MORPH_RECT; - - if( shape == MORPH_ELLIPSE ) - { - r = ksize.height/2; - c = ksize.width/2; - inv_r2 = r ? 1./((double)r*r) : 0; - } - - Mat elem(ksize, CV_8U); - - for( i = 0; i < ksize.height; i++ ) - { - uchar* ptr = elem.ptr(i); - int j1 = 0, j2 = 0; - - if( shape == MORPH_RECT || (shape == MORPH_CROSS && i == anchor.y) ) - j2 = ksize.width; - else if( shape == MORPH_CROSS ) - j1 = anchor.x, j2 = j1 + 1; - else - { - int dy = i - r; - if( std::abs(dy) <= r ) - { - int dx = saturate_cast(c*std::sqrt((r*r - dy*dy)*inv_r2)); - j1 = std::max( c - dx, 0 ); - j2 = std::min( c + dx + 1, ksize.width ); - } - } - - for( j = 0; j < j1; j++ ) - ptr[j] = 0; - for( ; j < j2; j++ ) - ptr[j] = 1; - for( ; j < ksize.width; j++ ) - ptr[j] = 0; - } - - return elem; -} - -namespace cv -{ - -// ===== 1. replacement implementation - -static bool halMorph(int op, int src_type, int dst_type, - uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int roi_width, int roi_height, int roi_x, int roi_y, - int roi_width2, int roi_height2, int roi_x2, int roi_y2, - int kernel_type, uchar * kernel_data, size_t kernel_step, - int kernel_width, int kernel_height, int anchor_x, int anchor_y, - int borderType, const double borderValue[4], int iterations, bool isSubmatrix) -{ - cvhalFilter2D * ctx; - int res = cv_hal_morphInit(&ctx, op, src_type, dst_type, width, height, - kernel_type, kernel_data, kernel_step, kernel_width, kernel_height, - anchor_x, anchor_y, - borderType, borderValue, - iterations, isSubmatrix, src_data == dst_data); - if (res != CV_HAL_ERROR_OK) - return false; - - res = cv_hal_morph(ctx, src_data, src_step, dst_data, dst_step, width, height, - roi_width, roi_height, - roi_x, roi_y, - roi_width2, roi_height2, - roi_x2, roi_y2); - bool success = (res == CV_HAL_ERROR_OK); - - res = cv_hal_morphFree(ctx); - if (res != CV_HAL_ERROR_OK) - return false; - - return success; -} - -// ===== 2. 
IPP implementation -#ifdef HAVE_IPP -#ifdef HAVE_IPP_IW -static inline IwiMorphologyType ippiGetMorphologyType(int morphOp) -{ - return morphOp == MORPH_ERODE ? iwiMorphErode : - morphOp == MORPH_DILATE ? iwiMorphDilate : - morphOp == MORPH_OPEN ? iwiMorphOpen : - morphOp == MORPH_CLOSE ? iwiMorphClose : - morphOp == MORPH_GRADIENT ? iwiMorphGradient : - morphOp == MORPH_TOPHAT ? iwiMorphTophat : - morphOp == MORPH_BLACKHAT ? iwiMorphBlackhat : (IwiMorphologyType)-1; -} #endif - -static bool ippMorph(int op, int src_type, int dst_type, - const uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int roi_width, int roi_height, int roi_x, int roi_y, - int roi_width2, int roi_height2, int roi_x2, int roi_y2, - int kernel_type, uchar * kernel_data, size_t kernel_step, - int kernel_width, int kernel_height, int anchor_x, int anchor_y, - int borderType, const double borderValue[4], int iterations, bool isSubmatrix) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP(); - -#if IPP_VERSION_X100 < 201800 - // Problem with SSE42 optimizations performance - if(cv::ipp::getIppTopFeatures() == ippCPUID_SSE42) - return false; - - // Different mask flipping - if(op == MORPH_GRADIENT) - return false; - - // Integer overflow bug - if(src_step >= IPP_MAX_32S || - src_step*height >= IPP_MAX_32S) - return false; -#endif - -#if IPP_VERSION_X100 < 201801 - // Problem with AVX512 optimizations performance - if(cv::ipp::getIppTopFeatures()&ippCPUID_AVX512F) - return false; - - // Multiple iterations on small mask is not effective in current integration - // Implace imitation for 3x3 kernel is not efficient - // Advanced morphology for small mask introduces degradations - if((iterations > 1 || src_data == dst_data || (op != MORPH_ERODE && op != MORPH_DILATE)) && kernel_width*kernel_height < 25) - return false; - - // Skip even mask sizes for advanced morphology since they can produce out of spec writes - if((op != MORPH_ERODE && op != MORPH_DILATE) && (!(kernel_width&1) || !(kernel_height&1))) - return false; -#endif - - IppAutoBuffer kernelTempBuffer; - ::ipp::IwiBorderSize iwBorderSize; - ::ipp::IwiBorderSize iwBorderSize2; - ::ipp::IwiBorderType iwBorderType; - ::ipp::IwiBorderType iwBorderType2; - ::ipp::IwiImage iwMask; - ::ipp::IwiImage iwInter; - ::ipp::IwiSize initSize(width, height); - ::ipp::IwiSize kernelSize(kernel_width, kernel_height); - IppDataType type = ippiGetDataType(CV_MAT_DEPTH(src_type)); - int channels = CV_MAT_CN(src_type); - IwiMorphologyType morphType = ippiGetMorphologyType(op); - - CV_UNUSED(isSubmatrix); - - if((int)morphType < 0) - return false; - - if(iterations > 1 && morphType != iwiMorphErode && morphType != iwiMorphDilate) - return false; - - if(src_type != dst_type) - return false; - - if(!ippiCheckAnchor(anchor_x, anchor_y, kernel_width, kernel_height)) - return false; - - try - { - ::ipp::IwiImage iwSrc(initSize, type, channels, ::ipp::IwiBorderSize(roi_x, roi_y, roi_width-roi_x-width, roi_height-roi_y-height), (void*)src_data, src_step); - ::ipp::IwiImage iwDst(initSize, type, channels, ::ipp::IwiBorderSize(roi_x2, roi_y2, roi_width2-roi_x2-width, roi_height2-roi_y2-height), (void*)dst_data, dst_step); - - iwBorderSize = ::ipp::iwiSizeToBorderSize(kernelSize); - iwBorderType = ippiGetBorder(iwSrc, borderType, iwBorderSize); - if(!iwBorderType) - return false; - if(iterations > 1) - { - // Check dst border for second and later iterations - iwBorderSize2 = ::ipp::iwiSizeToBorderSize(kernelSize); - iwBorderType2 = ippiGetBorder(iwDst, 
borderType, iwBorderSize2); - if(!iwBorderType2) - return false; - } - - if(morphType != iwiMorphErode && morphType != iwiMorphDilate && morphType != iwiMorphGradient) - { - // For now complex morphology support only InMem around all sides. This will be improved later. - if((iwBorderType&ippBorderInMem) && (iwBorderType&ippBorderInMem) != ippBorderInMem) - return false; - - if((iwBorderType&ippBorderInMem) == ippBorderInMem) - { - iwBorderType &= ~ippBorderInMem; - iwBorderType &= ippBorderFirstStageInMem; - } - } - - if(iwBorderType.StripFlags() == ippBorderConst) - { - if(Vec(borderValue) == morphologyDefaultBorderValue()) - iwBorderType.SetType(ippBorderDefault); - else - iwBorderType.m_value = ::ipp::IwValueFloat(borderValue[0], borderValue[1], borderValue[2], borderValue[3]); - } - - iwMask.Init(ippiSize(kernel_width, kernel_height), ippiGetDataType(CV_MAT_DEPTH(kernel_type)), CV_MAT_CN(kernel_type), 0, kernel_data, kernel_step); - - ::ipp::IwiImage iwMaskLoc = iwMask; - if(morphType == iwiMorphDilate) - { - iwMaskLoc.Alloc(iwMask.m_size, iwMask.m_dataType, iwMask.m_channels); - ::ipp::iwiMirror(iwMask, iwMaskLoc, ippAxsBoth); - iwMask = iwMaskLoc; - } - - if(iterations > 1) - { - // OpenCV uses in mem border from dst for two and more iterations, so we need to keep this border in intermediate image - iwInter.Alloc(initSize, type, channels, iwBorderSize2); - - ::ipp::IwiImage *pSwap[2] = {&iwInter, &iwDst}; - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterMorphology, iwSrc, iwInter, morphType, iwMask, ::ipp::IwDefault(), iwBorderType); - - // Copy border only - { - if(iwBorderSize2.top) - { - ::ipp::IwiRoi borderRoi(-iwBorderSize2.left, -iwBorderSize2.top, iwDst.m_size.width+iwBorderSize2.left+iwBorderSize2.right, iwBorderSize2.top); - ::ipp::IwiImage iwInterRoi = iwInter.GetRoiImage(borderRoi); - ::ipp::iwiCopy(iwDst.GetRoiImage(borderRoi), iwInterRoi); - } - if(iwBorderSize2.bottom) - { - ::ipp::IwiRoi borderRoi(-iwBorderSize2.left, iwDst.m_size.height, iwDst.m_size.width+iwBorderSize2.left+iwBorderSize2.right, iwBorderSize2.bottom); - ::ipp::IwiImage iwInterRoi = iwInter.GetRoiImage(borderRoi); - ::ipp::iwiCopy(iwDst.GetRoiImage(borderRoi), iwInterRoi); - } - if(iwBorderSize2.left) - { - ::ipp::IwiRoi borderRoi(-iwBorderSize2.left, 0, iwBorderSize2.left, iwDst.m_size.height); - ::ipp::IwiImage iwInterRoi = iwInter.GetRoiImage(borderRoi); - ::ipp::iwiCopy(iwDst.GetRoiImage(borderRoi), iwInterRoi); - } - if(iwBorderSize2.right) - { - ::ipp::IwiRoi borderRoi(iwDst.m_size.width, 0, iwBorderSize2.left, iwDst.m_size.height); - ::ipp::IwiImage iwInterRoi = iwInter.GetRoiImage(borderRoi); - ::ipp::iwiCopy(iwDst.GetRoiImage(borderRoi), iwInterRoi); - } - } - - iwBorderType2.SetType(iwBorderType); - for(int i = 0; i < iterations-1; i++) - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterMorphology, *pSwap[i&0x1], *pSwap[(i+1)&0x1], morphType, iwMask, ::ipp::IwDefault(), iwBorderType2); - if(iterations&0x1) - CV_INSTRUMENT_FUN_IPP(::ipp::iwiCopy, iwInter, iwDst); - } - else - { - if(src_data == dst_data) - { - iwInter.Alloc(initSize, type, channels); - - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterMorphology, iwSrc, iwInter, morphType, iwMask, ::ipp::IwDefault(), iwBorderType); - CV_INSTRUMENT_FUN_IPP(::ipp::iwiCopy, iwInter, iwDst); - } - else - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterMorphology, iwSrc, iwDst, morphType, iwMask, ::ipp::IwDefault(), iwBorderType); - } - } - catch(const ::ipp::IwException &) - { - return false; - } - - return true; -#else - CV_UNUSED(op); CV_UNUSED(src_type); CV_UNUSED(dst_type); 
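// Without IPP IW this branch compiles to a stub: CV_UNUSED() silences the
// unused-parameter warnings and the 'return false' sends the caller down the
// portable ocvMorph() path instead.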
-    CV_UNUSED(dst_step); CV_UNUSED(width); CV_UNUSED(height); CV_UNUSED(roi_width); CV_UNUSED(roi_height);
-    CV_UNUSED(roi_x); CV_UNUSED(roi_y); CV_UNUSED(roi_width2); CV_UNUSED(roi_height2); CV_UNUSED(roi_x2); CV_UNUSED(roi_y2);
-    CV_UNUSED(kernel_type); CV_UNUSED(kernel_data); CV_UNUSED(kernel_step); CV_UNUSED(kernel_width); CV_UNUSED(kernel_height);
-    CV_UNUSED(anchor_x); CV_UNUSED(anchor_y); CV_UNUSED(borderType); CV_UNUSED(borderValue); CV_UNUSED(iterations);
-    CV_UNUSED(isSubmatrix);
-    return false;
-#endif
-}
-
-#endif // HAVE_IPP
-
-// ===== 3. Fallback implementation
-
-static void ocvMorph(int op, int src_type, int dst_type,
-                     uchar * src_data, size_t src_step,
-                     uchar * dst_data, size_t dst_step,
-                     int width, int height,
-                     int roi_width, int roi_height, int roi_x, int roi_y,
-                     int roi_width2, int roi_height2, int roi_x2, int roi_y2,
-                     int kernel_type, uchar * kernel_data, size_t kernel_step,
-                     int kernel_width, int kernel_height, int anchor_x, int anchor_y,
-                     int borderType, const double borderValue[4], int iterations)
-{
-    Mat kernel(Size(kernel_width, kernel_height), kernel_type, kernel_data, kernel_step);
-    Point anchor(anchor_x, anchor_y);
-    Vec<double, 4> borderVal(borderValue);
-    Ptr<FilterEngine> f = createMorphologyFilter(op, src_type, kernel, anchor, borderType, borderType, borderVal);
-    Mat src(Size(width, height), src_type, src_data, src_step);
-    Mat dst(Size(width, height), dst_type, dst_data, dst_step);
-    {
-        Point ofs(roi_x, roi_y);
-        Size wsz(roi_width, roi_height);
-        f->apply( src, dst, wsz, ofs );
-    }
-    {
-        Point ofs(roi_x2, roi_y2);
-        Size wsz(roi_width2, roi_height2);
-        for( int i = 1; i < iterations; i++ )
-            f->apply( dst, dst, wsz, ofs );
-    }
-}
-
-
-// ===== HAL interface implementation
-
-namespace hal {
-
-
-CV_DEPRECATED Ptr<Morph> Morph::create(int , int , int , int , int ,
-                                       int , uchar * , size_t ,
-                                       int , int ,
-                                       int , int ,
-                                       int , const double *,
-                                       int , bool , bool ) { return Ptr<Morph>(); }
-
-
-void morph(int op, int src_type, int dst_type,
-           uchar * src_data, size_t src_step,
-           uchar * dst_data, size_t dst_step,
-           int width, int height,
-           int roi_width, int roi_height, int roi_x, int roi_y,
-           int roi_width2, int roi_height2, int roi_x2, int roi_y2,
-           int kernel_type, uchar * kernel_data, size_t kernel_step,
-           int kernel_width, int kernel_height, int anchor_x, int anchor_y,
-           int borderType, const double borderValue[4], int iterations, bool isSubmatrix)
-{
-    {
-        bool res = halMorph(op, src_type, dst_type, src_data, src_step, dst_data, dst_step, width, height,
-                            roi_width, roi_height, roi_x, roi_y,
-                            roi_width2, roi_height2, roi_x2, roi_y2,
-                            kernel_type, kernel_data, kernel_step,
-                            kernel_width, kernel_height, anchor_x, anchor_y,
-                            borderType, borderValue, iterations, isSubmatrix);
-        if (res)
-            return;
-    }
-
-    CV_IPP_RUN_FAST(ippMorph(op, src_type, dst_type, src_data, src_step, dst_data, dst_step, width, height,
-                             roi_width, roi_height, roi_x, roi_y,
-                             roi_width2, roi_height2, roi_x2, roi_y2,
-                             kernel_type, kernel_data, kernel_step,
-                             kernel_width, kernel_height, anchor_x, anchor_y,
-                             borderType, borderValue, iterations, isSubmatrix));
-
-    ocvMorph(op, src_type, dst_type, src_data, src_step, dst_data, dst_step, width, height,
-             roi_width, roi_height, roi_x, roi_y,
-             roi_width2, roi_height2, roi_x2, roi_y2,
-             kernel_type, kernel_data, kernel_step,
-             kernel_width, kernel_height, anchor_x, anchor_y,
-             borderType, borderValue, iterations);
-}
-
-} // cv::hal
-
-#ifdef HAVE_OPENCL
-
-#define ROUNDUP(sz, n) ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n)))
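The ROUNDUP macro rounds its first argument up to the next multiple of the second. A minimal compile-time check of that behaviour (illustrative C++ only, not part of the patch):

    static_assert(ROUNDUP(37, 16) == 48,   "37 rounds up to the next multiple of 16");
    static_assert(ROUNDUP(48, 16) == 48,   "exact multiples are unchanged");
    static_assert(ROUNDUP(80, 256) == 256, "a global size of 80 is padded to one full 256-wide group");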
-
-static bool ocl_morph3x3_8UC1( InputArray _src, OutputArray _dst, InputArray _kernel, Point anchor,
-                               int op, int actual_op = -1, InputArray _extraMat = noArray())
-{
-    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
-    Size ksize = _kernel.size();
-
-    Mat kernel8u;
-    String processing;
-
-    bool haveExtraMat = !_extraMat.empty();
-    CV_Assert(actual_op <= 3 || haveExtraMat);
-
-    _kernel.getMat().convertTo(kernel8u, CV_8U);
-    for (int y = 0; y < kernel8u.rows; ++y)
-        for (int x = 0; x < kernel8u.cols; ++x)
-            if (kernel8u.at<uchar>(y, x) != 0)
-                processing += format("PROCESS(%d,%d)", y, x);
-
-    if (anchor.x < 0)
-        anchor.x = ksize.width / 2;
-    if (anchor.y < 0)
-        anchor.y = ksize.height / 2;
-
-    if (actual_op < 0)
-        actual_op = op;
-
-    if (type != CV_8UC1 ||
-        !((_src.offset() == 0) && (_src.step() % 4 == 0)) ||
-        !((_src.cols() % 16 == 0) && (_src.rows() % 2 == 0)) ||
-        !(anchor.x == 1 && anchor.y == 1) ||
-        !(ksize.width == 3 && ksize.height == 3))
-        return false;
-
-    Size size = _src.size();
-    size_t globalsize[2] = { 0, 0 };
-    size_t localsize[2] = { 0, 0 };
-
-    globalsize[0] = size.width / 16;
-    globalsize[1] = size.height / 2;
-
-    static const char * const op2str[] = { "OP_ERODE", "OP_DILATE", NULL, NULL, "OP_GRADIENT", "OP_TOPHAT", "OP_BLACKHAT" };
-    String opts = format("-D PROCESS_ELEM_=%s -D %s%s", processing.c_str(), op2str[op],
-                         actual_op == op ? "" : cv::format(" -D %s", op2str[actual_op]).c_str());
-
-    ocl::Kernel k;
-    k.create("morph3x3_8UC1_cols16_rows2", cv::ocl::imgproc::morph3x3_oclsrc, opts);
-
-    if (k.empty())
-        return false;
-
-    UMat src = _src.getUMat();
-    _dst.create(size, CV_MAKETYPE(depth, cn));
-    if (!(_dst.offset() == 0 && _dst.step() % 4 == 0))
-        return false;
-    UMat dst = _dst.getUMat();
-    UMat extraMat = _extraMat.getUMat();
-
-    int idxArg = k.set(0, ocl::KernelArg::PtrReadOnly(src));
-    idxArg = k.set(idxArg, (int)src.step);
-    idxArg = k.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst));
-    idxArg = k.set(idxArg, (int)dst.step);
-    idxArg = k.set(idxArg, (int)dst.rows);
-    idxArg = k.set(idxArg, (int)dst.cols);
-
-    if (haveExtraMat)
-    {
-        idxArg = k.set(idxArg, ocl::KernelArg::ReadOnlyNoSize(extraMat));
-    }
-
-    return k.run(2, globalsize, (localsize[0] == 0) ? NULL : localsize, false);
-}
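ocl_morph3x3_8UC1 only accepts CV_8UC1 images whose width is a multiple of 16 and height a multiple of 2, with a 3x3 kernel anchored at (1,1); each work item then produces a 16x2 pixel tile. A sketch of the resulting NDRange for an assumed 640x480 input:

    // 640x480 CV_8UC1, 3x3 kernel, anchor (1,1)  (hypothetical input)
    size_t globalsize[2] = { 640 / 16, 480 / 2 };  // 40 x 240 work items
    // 40 * 240 work items * (16 * 2) pixels each == 640 * 480: one write per pixel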
-
-static bool ocl_morphSmall( InputArray _src, OutputArray _dst, InputArray _kernel, Point anchor, int borderType,
-                            int op, int actual_op = -1, InputArray _extraMat = noArray())
-{
-    const ocl::Device & dev = ocl::Device::getDefault();
-    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(type);
-    bool doubleSupport = dev.doubleFPConfig() > 0;
-
-    if (cn > 4 || (!doubleSupport && depth == CV_64F) ||
-        _src.offset() % esz != 0 || _src.step() % esz != 0)
-        return false;
-
-    bool haveExtraMat = !_extraMat.empty();
-    CV_Assert(actual_op <= 3 || haveExtraMat);
-
-    Size ksize = _kernel.size();
-    if (anchor.x < 0)
-        anchor.x = ksize.width / 2;
-    if (anchor.y < 0)
-        anchor.y = ksize.height / 2;
-
-    Size size = _src.size(), wholeSize;
-    bool isolated = (borderType & BORDER_ISOLATED) != 0;
-    borderType &= ~BORDER_ISOLATED;
-    int wdepth = depth, wtype = type;
-    if (depth == CV_8U)
-    {
-        wdepth = CV_32S;
-        wtype = CV_MAKETYPE(wdepth, cn);
-    }
-    char cvt[2][40];
-
-    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE",
-                                       "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
-    size_t globalsize[2] = { (size_t)size.width, (size_t)size.height };
-
-    UMat src = _src.getUMat();
-    if (!isolated)
-    {
-        Point ofs;
-        src.locateROI(wholeSize, ofs);
-    }
-
-    int h = isolated ? size.height : wholeSize.height;
-    int w = isolated ? size.width : wholeSize.width;
-    if (w < ksize.width || h < ksize.height)
-        return false;
-
-    // Figure out what vector size to use for loading the pixels.
-    int pxLoadNumPixels = cn != 1 || size.width % 4 ? 1 : 4;
-    int pxLoadVecSize = cn * pxLoadNumPixels;
-
-    // Figure out how many pixels per work item to compute in X and Y
-    // directions. Too many and we run out of registers.
-    int pxPerWorkItemX = 1, pxPerWorkItemY = 1;
-    if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4)
-    {
-        pxPerWorkItemX = size.width % 8 ? size.width % 4 ? size.width % 2 ? 1 : 2 : 4 : 8;
-        pxPerWorkItemY = size.height % 2 ? 1 : 2;
-    }
-    else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4))
-    {
-        pxPerWorkItemX = size.width % 2 ? 1 : 2;
-        pxPerWorkItemY = size.height % 2 ? 1 : 2;
-    }
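The load-vector width and per-work-item tile above depend only on the channel count, kernel size, and image size; a worked example under assumed parameters (cn = 1, 3x3 kernel, 640x480 image):

    int pxLoadNumPixels = 4;      // cn == 1 and 640 % 4 == 0
    int pxLoadVecSize   = 1 * 4;  // cn * pxLoadNumPixels
    int pxPerWorkItemX  = 8;      // cn <= 2, ksize <= 4x4, 640 % 8 == 0
    int pxPerWorkItemY  = 2;      // 480 % 2 == 0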
-    globalsize[0] = size.width / pxPerWorkItemX;
-    globalsize[1] = size.height / pxPerWorkItemY;
-
-    // Need some padding in the private array for pixels
-    int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels);
-
-    // Make the global size a nice round number so the runtime can pick
-    // from reasonable choices for the workgroup size
-    const int wgRound = 256;
-    globalsize[0] = ROUNDUP(globalsize[0], wgRound);
-
-    if (actual_op < 0)
-        actual_op = op;
-
-    // build processing
-    String processing;
-    Mat kernel8u;
-    _kernel.getMat().convertTo(kernel8u, CV_8U);
-    for (int y = 0; y < kernel8u.rows; ++y)
-        for (int x = 0; x < kernel8u.cols; ++x)
-            if (kernel8u.at<uchar>(y, x) != 0)
-                processing += format("PROCESS(%d,%d)", y, x);
-
-
-    static const char * const op2str[] = { "OP_ERODE", "OP_DILATE", NULL, NULL, "OP_GRADIENT", "OP_TOPHAT", "OP_BLACKHAT" };
-    String opts = format("-D cn=%d "
-                         "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
-                         "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d -D DEPTH_%d "
-                         "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s "
-                         "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d "
-                         "-D srcT=%s -D srcT1=%s -D dstT=srcT -D dstT1=srcT1 -D WT=%s -D WT1=%s "
-                         "-D convertToWT=%s -D convertToDstT=%s -D PX_LOAD_FLOAT_VEC_CONV=convert_%s -D PROCESS_ELEM_=%s -D %s%s",
-                         cn, anchor.x, anchor.y, ksize.width, ksize.height,
-                         pxLoadVecSize, pxLoadNumPixels, depth,
-                         pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType],
-                         isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
-                         privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1,
-                         ocl::typeToStr(type), ocl::typeToStr(depth),
-                         haveExtraMat ? ocl::typeToStr(wtype):"srcT",//to prevent overflow - WT
-                         haveExtraMat ? ocl::typeToStr(wdepth):"srcT1",//to prevent overflow - WT1
-                         haveExtraMat ? ocl::convertTypeStr(depth, wdepth, cn, cvt[0]) : "noconvert",//to prevent overflow - src to WT
-                         haveExtraMat ? ocl::convertTypeStr(wdepth, depth, cn, cvt[1]) : "noconvert",//to prevent overflow - WT to dst
-                         ocl::typeToStr(CV_MAKE_TYPE(haveExtraMat ? wdepth : depth, pxLoadVecSize)), //PX_LOAD_FLOAT_VEC_CONV
-                         processing.c_str(), op2str[op],
-                         actual_op == op ? "" : cv::format(" -D %s", op2str[actual_op]).c_str());
-
-    ocl::Kernel kernel("filterSmall", cv::ocl::imgproc::filterSmall_oclsrc, opts);
-    if (kernel.empty())
-        return false;
-
-    _dst.create(size, type);
-    UMat dst = _dst.getUMat();
-
-    UMat source;
-    if(src.u != dst.u)
-        source = src;
-    else
-    {
-        Point ofs;
-        int cols = src.cols, rows = src.rows;
-        src.locateROI(wholeSize, ofs);
-        src.adjustROI(ofs.y, wholeSize.height - rows - ofs.y, ofs.x, wholeSize.width - cols - ofs.x);
-        src.copyTo(source);
-
-        src.adjustROI(-ofs.y, -wholeSize.height + rows + ofs.y, -ofs.x, -wholeSize.width + cols + ofs.x);
-        source.adjustROI(-ofs.y, -wholeSize.height + rows + ofs.y, -ofs.x, -wholeSize.width + cols + ofs.x);
-        source.locateROI(wholeSize, ofs);
-    }
-
-    UMat extraMat = _extraMat.getUMat();
-
-    int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(source));
-    idxArg = kernel.set(idxArg, (int)source.step);
-    int srcOffsetX = (int)((source.offset % source.step) / source.elemSize());
-    int srcOffsetY = (int)(source.offset / source.step);
-    int srcEndX = isolated ? srcOffsetX + size.width : wholeSize.width;
-    int srcEndY = isolated ? srcOffsetY + size.height : wholeSize.height;
-    idxArg = kernel.set(idxArg, srcOffsetX);
-    idxArg = kernel.set(idxArg, srcOffsetY);
-    idxArg = kernel.set(idxArg, srcEndX);
-    idxArg = kernel.set(idxArg, srcEndY);
-    idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst));
-
-    if (haveExtraMat)
-    {
-        idxArg = kernel.set(idxArg, ocl::KernelArg::ReadOnlyNoSize(extraMat));
-    }
-
-    return kernel.run(2, globalsize, NULL, false);
-}
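The ROI origin is passed to the kernel in pixels, recovered from the UMat byte offset. For an assumed CV_8UC1 ROI starting at (3, 2) inside a parent image with a 640-byte step:

    size_t offset = 2 * 640 + 3;                 // 1283 bytes, elemSize() == 1
    int srcOffsetX = (int)((offset % 640) / 1);  // == 3
    int srcOffsetY = (int)(offset / 640);        // == 2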
-
-static bool ocl_morphOp(InputArray _src, OutputArray _dst, InputArray _kernel,
-                        Point anchor, int iterations, int op, int borderType,
-                        const Scalar &, int actual_op = -1, InputArray _extraMat = noArray())
-{
-    const ocl::Device & dev = ocl::Device::getDefault();
-    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
-    Mat kernel = _kernel.getMat();
-    Size ksize = !kernel.empty() ? kernel.size() : Size(3, 3), ssize = _src.size();
-
-    bool doubleSupport = dev.doubleFPConfig() > 0;
-    if ((depth == CV_64F && !doubleSupport) || borderType != BORDER_CONSTANT)
-        return false;
-
-    bool haveExtraMat = !_extraMat.empty();
-    CV_Assert(actual_op <= 3 || haveExtraMat);
-
-    if (kernel.empty())
-    {
-        ksize = Size(1+iterations*2,1+iterations*2);
-        kernel = getStructuringElement(MORPH_RECT, ksize);
-        anchor = Point(iterations, iterations);
-        iterations = 1;
-        CV_DbgAssert(ksize == kernel.size());
-    }
-    else if( iterations > 1 && countNonZero(kernel) == kernel.rows*kernel.cols )
-    {
-        ksize = Size(ksize.width + (iterations-1)*(ksize.width-1),
-                     ksize.height + (iterations-1)*(ksize.height-1));
-        anchor = Point(anchor.x*iterations, anchor.y*iterations);
-        kernel = getStructuringElement(MORPH_RECT, ksize, anchor);
-        iterations = 1;
-        CV_DbgAssert(ksize == kernel.size());
-    }
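Eroding or dilating N times with a fully filled rectangular kernel equals one pass with a correspondingly grown kernel, which is what the branch above exploits. Plugging a 3x3 rect kernel with iterations == 2 into the formula:

    // ksize:  3 + (2-1)*(3-1) = 5  ->  a single 5x5 kernel
    // anchor: (1,1) * 2 = (2,2), and iterations collapses to 1
    Mat merged = getStructuringElement(MORPH_RECT, Size(5, 5), Point(2, 2));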
-
-    static bool param_use_morph_special_kernels = utils::getConfigurationParameterBool("OPENCV_OPENCL_IMGPROC_MORPH_SPECIAL_KERNEL",
-#ifndef __APPLE__
-                true
-#else
-                false
-#endif
-                );
-
-    int esz = CV_ELEM_SIZE(type);
-    // try to use OpenCL kernel adopted for small morph kernel
-    if (param_use_morph_special_kernels && dev.isIntel() &&
-        ((ksize.width < 5 && ksize.height < 5 && esz <= 4) ||
-         (ksize.width == 5 && ksize.height == 5 && cn == 1)) &&
-        (iterations == 1)
-        )
-    {
-        if (ocl_morph3x3_8UC1(_src, _dst, kernel, anchor, op, actual_op, _extraMat))
-            return true;
-
-        if (ocl_morphSmall(_src, _dst, kernel, anchor, borderType, op, actual_op, _extraMat))
-            return true;
-    }
-
-    if (iterations == 0 || kernel.rows*kernel.cols == 1)
-    {
-        _src.copyTo(_dst);
-        return true;
-    }
-
-#ifdef __ANDROID__
-    size_t localThreads[2] = { 16, 8 };
-#else
-    size_t localThreads[2] = { 16, 16 };
-#endif
-    size_t globalThreads[2] = { (size_t)ssize.width, (size_t)ssize.height };
-
-#ifdef __APPLE__
-    if( actual_op != MORPH_ERODE && actual_op != MORPH_DILATE )
-        localThreads[0] = localThreads[1] = 4;
-#endif
-
-    if (localThreads[0]*localThreads[1] * 2 < (localThreads[0] + ksize.width - 1) * (localThreads[1] + ksize.height - 1))
-        return false;
-
-#ifdef __ANDROID__
-    if (dev.isNVidia())
-        return false;
-#endif
-
-    // build processing
-    String processing;
-    Mat kernel8u;
-    kernel.convertTo(kernel8u, CV_8U);
-    for (int y = 0; y < kernel8u.rows; ++y)
-        for (int x = 0; x < kernel8u.cols; ++x)
-            if (kernel8u.at<uchar>(y, x) != 0)
-                processing += format("PROCESS(%d,%d)", y, x);
-
-    static const char * const op2str[] = { "OP_ERODE", "OP_DILATE", NULL, NULL, "OP_GRADIENT", "OP_TOPHAT", "OP_BLACKHAT" };
-
-    char cvt[2][50];
-    int wdepth = std::max(depth, CV_32F), scalarcn = cn == 3 ? 4 : cn;
-
-    if (actual_op < 0)
-        actual_op = op;
-
-    std::vector<ocl::Kernel> kernels(iterations);
-    for (int i = 0; i < iterations; i++)
-    {
-        int current_op = iterations == i + 1 ? actual_op : op;
-        String buildOptions = format("-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D %s%s"
-                                     " -D PROCESS_ELEMS=%s -D T=%s -D DEPTH_%d -D cn=%d -D T1=%s"
-                                     " -D convertToWT=%s -D convertToT=%s -D ST=%s%s",
-                                     anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1], op2str[op],
-                                     doubleSupport ? " -D DOUBLE_SUPPORT" : "", processing.c_str(),
-                                     ocl::typeToStr(type), depth, cn, ocl::typeToStr(depth),
-                                     ocl::convertTypeStr(depth, wdepth, cn, cvt[0]),
-                                     ocl::convertTypeStr(wdepth, depth, cn, cvt[1]),
-                                     ocl::typeToStr(CV_MAKE_TYPE(depth, scalarcn)),
-                                     current_op == op ? "" : cv::format(" -D %s", op2str[current_op]).c_str());
-
-        kernels[i].create("morph", ocl::imgproc::morph_oclsrc, buildOptions);
-        if (kernels[i].empty())
-            return false;
-    }
-
-    UMat src = _src.getUMat(), extraMat = _extraMat.getUMat();
-    _dst.create(src.size(), src.type());
-    UMat dst = _dst.getUMat();
-
-    if (iterations == 1 && src.u != dst.u)
-    {
-        Size wholesize;
-        Point ofs;
-        src.locateROI(wholesize, ofs);
-        int wholecols = wholesize.width, wholerows = wholesize.height;
-
-        if (haveExtraMat)
-            kernels[0].args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnlyNoSize(dst),
-                            ofs.x, ofs.y, src.cols, src.rows, wholecols, wholerows,
-                            ocl::KernelArg::ReadOnlyNoSize(extraMat));
-        else
-            kernels[0].args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnlyNoSize(dst),
-                            ofs.x, ofs.y, src.cols, src.rows, wholecols, wholerows);
-
-        return kernels[0].run(2, globalThreads, localThreads, false);
-    }
-
-    for (int i = 0; i < iterations; i++)
-    {
-        UMat source;
-        Size wholesize;
-        Point ofs;
-
-        if (i == 0)
-        {
-            int cols = src.cols, rows = src.rows;
-            src.locateROI(wholesize, ofs);
-            src.adjustROI(ofs.y, wholesize.height - rows - ofs.y, ofs.x, wholesize.width - cols - ofs.x);
-            if(src.u != dst.u)
-                source = src;
-            else
-                src.copyTo(source);
-
-            src.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x);
-            source.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x);
-        }
-        else
-        {
-            int cols = dst.cols, rows = dst.rows;
-            dst.locateROI(wholesize, ofs);
-            dst.adjustROI(ofs.y, wholesize.height - rows - ofs.y, ofs.x, wholesize.width - cols - ofs.x);
-            dst.copyTo(source);
-            dst.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x);
-            source.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x);
-        }
-        source.locateROI(wholesize, ofs);
-
-        if (haveExtraMat && iterations == i + 1)
-            kernels[i].args(ocl::KernelArg::ReadOnlyNoSize(source), ocl::KernelArg::WriteOnlyNoSize(dst),
-                            ofs.x, ofs.y, source.cols, source.rows, wholesize.width, wholesize.height,
-                            ocl::KernelArg::ReadOnlyNoSize(extraMat));
-        else
-            kernels[i].args(ocl::KernelArg::ReadOnlyNoSize(source), ocl::KernelArg::WriteOnlyNoSize(dst),
-                            ofs.x, ofs.y, source.cols, source.rows, wholesize.width, wholesize.height);
-
-        if (!kernels[i].run(2, globalThreads, localThreads, false))
-            return false;
-    }
-
-    return true;
-}
-
-#endif
-
-static void morphOp( int op, InputArray _src, OutputArray _dst,
-                     InputArray _kernel,
-                     Point anchor, int iterations,
-                     int borderType, const Scalar& borderValue )
-{
-    CV_INSTRUMENT_REGION();
-
-    Mat kernel = _kernel.getMat();
-    Size ksize = !kernel.empty() ? kernel.size() : Size(3,3);
-    anchor = normalizeAnchor(anchor, ksize);
-
-    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && _src.channels() <= 4 &&
-               borderType == cv::BORDER_CONSTANT && borderValue == morphologyDefaultBorderValue() &&
-               (op == MORPH_ERODE || op == MORPH_DILATE) &&
-               anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1,
-               ocl_morphOp(_src, _dst, kernel, anchor, iterations, op, borderType, borderValue) )
-
-    if (iterations == 0 || kernel.rows*kernel.cols == 1)
-    {
-        _src.copyTo(_dst);
-        return;
-    }
-
-    if (kernel.empty())
-    {
-        kernel = getStructuringElement(MORPH_RECT, Size(1+iterations*2,1+iterations*2));
-        anchor = Point(iterations, iterations);
-        iterations = 1;
-    }
-    else if( iterations > 1 && countNonZero(kernel) == kernel.rows*kernel.cols )
-    {
-        anchor = Point(anchor.x*iterations, anchor.y*iterations);
-        kernel = getStructuringElement(MORPH_RECT,
-                                       Size(ksize.width + (iterations-1)*(ksize.width-1),
-                                            ksize.height + (iterations-1)*(ksize.height-1)),
-                                       anchor);
-        iterations = 1;
-    }
-
-    Mat src = _src.getMat();
-    _dst.create( src.size(), src.type() );
-    Mat dst = _dst.getMat();
-
-    Point s_ofs;
-    Size s_wsz(src.cols, src.rows);
-    Point d_ofs;
-    Size d_wsz(dst.cols, dst.rows);
-    bool isolated = (borderType&BORDER_ISOLATED)?true:false;
-    borderType = (borderType&~BORDER_ISOLATED);
-
-    if(!isolated)
-    {
-        src.locateROI(s_wsz, s_ofs);
-        dst.locateROI(d_wsz, d_ofs);
-    }
-
-    hal::morph(op, src.type(), dst.type(),
-               src.data, src.step,
-               dst.data, dst.step,
-               src.cols, src.rows,
-               s_wsz.width, s_wsz.height, s_ofs.x, s_ofs.y,
-               d_wsz.width, d_wsz.height, d_ofs.x, d_ofs.y,
-               kernel.type(), kernel.data, kernel.step, kernel.cols, kernel.rows, anchor.x, anchor.y,
-               borderType, borderValue.val, iterations,
-               (src.isSubmatrix() && !isolated));
-}
-
-}
-
-void cv::erode( InputArray src, OutputArray dst, InputArray kernel,
-                Point anchor, int iterations,
-                int borderType, const Scalar& borderValue )
-{
-    CV_INSTRUMENT_REGION();
-
-    morphOp( MORPH_ERODE, src, dst, kernel, anchor, iterations, borderType, borderValue );
-}
-
-
-void cv::dilate( InputArray src, OutputArray dst, InputArray kernel,
-                 Point anchor, int iterations,
-                 int borderType, const Scalar& borderValue )
-{
-    CV_INSTRUMENT_REGION();
-
-    morphOp( MORPH_DILATE, src, dst, kernel, anchor, iterations, borderType, borderValue );
-}
-
-#ifdef HAVE_OPENCL
-
-namespace cv {
-
-static bool ocl_morphologyEx(InputArray _src, OutputArray _dst, int op,
-                             InputArray kernel, Point anchor, int iterations,
-                             int borderType, const Scalar& borderValue)
-{
-    _dst.createSameSize(_src, _src.type());
-    bool submat = _dst.isSubmatrix();
-    UMat temp;
-    _OutputArray _temp = submat ? _dst : _OutputArray(temp);
-
-    switch( op )
-    {
-    case MORPH_ERODE:
-        if (!ocl_morphOp( _src, _dst, kernel, anchor, iterations, MORPH_ERODE, borderType, borderValue ))
-            return false;
-        break;
-    case MORPH_DILATE:
-        if (!ocl_morphOp( _src, _dst, kernel, anchor, iterations, MORPH_DILATE, borderType, borderValue ))
-            return false;
-        break;
-    case MORPH_OPEN:
-        if (!ocl_morphOp( _src, _temp, kernel, anchor, iterations, MORPH_ERODE, borderType, borderValue ))
-            return false;
-        if (!ocl_morphOp( _temp, _dst, kernel, anchor, iterations, MORPH_DILATE, borderType, borderValue ))
-            return false;
-        break;
-    case MORPH_CLOSE:
-        if (!ocl_morphOp( _src, _temp, kernel, anchor, iterations, MORPH_DILATE, borderType, borderValue ))
-            return false;
-        if (!ocl_morphOp( _temp, _dst, kernel, anchor, iterations, MORPH_ERODE, borderType, borderValue ))
-            return false;
-        break;
-    case MORPH_GRADIENT:
-        if (!ocl_morphOp( _src, temp, kernel, anchor, iterations, MORPH_ERODE, borderType, borderValue ))
-            return false;
-        if (!ocl_morphOp( _src, _dst, kernel, anchor, iterations, MORPH_DILATE, borderType, borderValue, MORPH_GRADIENT, temp ))
-            return false;
-        break;
-    case MORPH_TOPHAT:
-        if (!ocl_morphOp( _src, _temp, kernel, anchor, iterations, MORPH_ERODE, borderType, borderValue ))
-            return false;
-        if (!ocl_morphOp( _temp, _dst, kernel, anchor, iterations, MORPH_DILATE, borderType, borderValue, MORPH_TOPHAT, _src ))
-            return false;
-        break;
-    case MORPH_BLACKHAT:
-        if (!ocl_morphOp( _src, _temp, kernel, anchor, iterations, MORPH_DILATE, borderType, borderValue ))
-            return false;
-        if (!ocl_morphOp( _temp, _dst, kernel, anchor, iterations, MORPH_ERODE, borderType, borderValue, MORPH_BLACKHAT, _src ))
-            return false;
-        break;
-    default:
-        CV_Error( CV_StsBadArg, "unknown morphological operation" );
-    }
-
-    return true;
-}
-
-}
-#endif
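The switch above composes every advanced operation from erode/dilate and, where needed, a subtraction against an extra input; the same identities in conventional notation:

    // open(src)     = dilate(erode(src))
    // close(src)    = erode(dilate(src))
    // gradient(src) = dilate(src) - erode(src)
    // tophat(src)   = src - open(src)
    // blackhat(src) = close(src) - src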
-
-#define IPP_DISABLE_MORPH_ADV 1
-#ifdef HAVE_IPP
-#if !IPP_DISABLE_MORPH_ADV
-namespace cv {
-static bool ipp_morphologyEx(int op, InputArray _src, OutputArray _dst,
-                             InputArray _kernel,
-                             Point anchor, int iterations,
-                             int borderType, const Scalar& borderValue)
-{
-#if defined HAVE_IPP_IW
-    Mat kernel = _kernel.getMat();
-    Size ksize = !kernel.empty() ? kernel.size() : Size(3,3);
-    anchor = normalizeAnchor(anchor, ksize);
-
-    if (iterations == 0 || kernel.rows*kernel.cols == 1)
-    {
-        _src.copyTo(_dst);
-        return true;
-    }
-
-    if (kernel.empty())
-    {
-        kernel = getStructuringElement(MORPH_RECT, Size(1+iterations*2,1+iterations*2));
-        anchor = Point(iterations, iterations);
-        iterations = 1;
-    }
-    else if( iterations > 1 && countNonZero(kernel) == kernel.rows*kernel.cols )
-    {
-        anchor = Point(anchor.x*iterations, anchor.y*iterations);
-        kernel = getStructuringElement(MORPH_RECT,
-                                       Size(ksize.width + (iterations-1)*(ksize.width-1),
-                                            ksize.height + (iterations-1)*(ksize.height-1)),
-                                       anchor);
-        iterations = 1;
-    }
-
-    Mat src = _src.getMat();
-    _dst.create( src.size(), src.type() );
-    Mat dst = _dst.getMat();
-
-    Point s_ofs;
-    Size s_wsz(src.cols, src.rows);
-    Point d_ofs;
-    Size d_wsz(dst.cols, dst.rows);
-    bool isolated = (borderType&BORDER_ISOLATED)?true:false;
-    borderType = (borderType&~BORDER_ISOLATED);
-
-    if(!isolated)
-    {
-        src.locateROI(s_wsz, s_ofs);
-        dst.locateROI(d_wsz, d_ofs);
-    }
-
-    return ippMorph(op, src.type(), dst.type(),
-                    src.data, src.step,
-                    dst.data, dst.step,
-                    src.cols, src.rows,
-                    s_wsz.width, s_wsz.height, s_ofs.x, s_ofs.y,
-                    d_wsz.width, d_wsz.height, d_ofs.x, d_ofs.y,
-                    kernel.type(), kernel.data, kernel.step, kernel.cols, kernel.rows, anchor.x, anchor.y,
-                    borderType, borderValue.val, iterations,
-                    (src.isSubmatrix() && !isolated));
-#else
-    CV_UNUSED(op); CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(_kernel); CV_UNUSED(anchor);
-    CV_UNUSED(iterations); CV_UNUSED(borderType); CV_UNUSED(borderValue);
-    return false;
-#endif
-}
-}
-#endif
-#endif
-
-void cv::morphologyEx( InputArray _src, OutputArray _dst, int op,
-                       InputArray _kernel, Point anchor, int iterations,
-                       int borderType, const Scalar& borderValue )
-{
-    CV_INSTRUMENT_REGION();
-
-    Mat kernel = _kernel.getMat();
-    if (kernel.empty())
-    {
-        kernel = getStructuringElement(MORPH_RECT, Size(3,3), Point(1,1));
-    }
-#ifdef HAVE_OPENCL
-    Size ksize = kernel.size();
-    anchor = normalizeAnchor(anchor, ksize);
-
-    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && _src.channels() <= 4 &&
-               anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1 &&
-               borderType == cv::BORDER_CONSTANT && borderValue == morphologyDefaultBorderValue(),
-               ocl_morphologyEx(_src, _dst, op, kernel, anchor, iterations, borderType, borderValue))
-#endif
-
-    Mat src = _src.getMat(), temp;
-    _dst.create(src.size(), src.type());
-    Mat dst = _dst.getMat();
-
-#if !IPP_DISABLE_MORPH_ADV
-    CV_IPP_RUN_FAST(ipp_morphologyEx(op, src, dst, kernel, anchor, iterations, borderType, borderValue));
-#endif
-
-    switch( op )
-    {
-    case MORPH_ERODE:
-        erode( src, dst, kernel, anchor, iterations, borderType, borderValue );
-        break;
-    case MORPH_DILATE:
-        dilate( src, dst, kernel, anchor, iterations, borderType, borderValue );
-        break;
-    case MORPH_OPEN:
-        erode( src, dst, kernel, anchor, iterations, borderType, borderValue );
-        dilate( dst, dst, kernel, anchor, iterations, borderType, borderValue );
-        break;
-    case MORPH_CLOSE:
-        dilate( src, dst, kernel, anchor, iterations, borderType, borderValue );
-        erode( dst, dst, kernel, anchor, iterations, borderType, borderValue );
-        break;
-    case MORPH_GRADIENT:
-        erode( src, temp, kernel, anchor, iterations, borderType, borderValue );
-        dilate( src, dst, kernel, anchor, iterations, borderType, borderValue );
-        dst -= temp;
-        break;
-    case MORPH_TOPHAT:
-        if( src.data != dst.data )
-            temp = dst;
-        erode( src, temp, kernel, anchor, iterations, borderType, borderValue );
-        dilate( temp, temp, kernel, anchor, iterations, borderType, borderValue );
-        dst = src - temp;
-        break;
-    case MORPH_BLACKHAT:
-        if( src.data != dst.data )
-            temp = dst;
-        dilate( src, temp, kernel, anchor, iterations, borderType, borderValue );
-        erode( temp, temp, kernel, anchor, iterations, borderType, borderValue );
-        dst = temp - src;
-        break;
-    case MORPH_HITMISS:
-        CV_Assert(src.type() == CV_8UC1);
-        if(countNonZero(kernel) <= 0)
-        {
-            src.copyTo(dst);
-            break;
-        }
-        {
-            Mat k1, k2, e1, e2;
-            k1 = (kernel == 1);
-            k2 = (kernel == -1);
-
-            if (countNonZero(k1) <= 0)
-                e1 = Mat(src.size(), src.type(), Scalar(255));
-            else
-                erode(src, e1, k1, anchor, iterations, borderType, borderValue);
-
-            if (countNonZero(k2) <= 0)
-                e2 = Mat(src.size(), src.type(), Scalar(255));
-            else
-            {
-                Mat src_complement;
-                bitwise_not(src, src_complement);
-                erode(src_complement, e2, k2, anchor, iterations, borderType, borderValue);
-            }
-            dst = e1 & e2;
-        }
-        break;
-    default:
-        CV_Error( CV_StsBadArg, "unknown morphological operation" );
-    }
-}
-
-CV_IMPL IplConvKernel *
-cvCreateStructuringElementEx( int cols, int rows,
-                              int anchorX, int anchorY,
-                              int shape, int *values )
-{
-    cv::Size ksize = cv::Size(cols, rows);
-    cv::Point anchor = cv::Point(anchorX, anchorY);
-    CV_Assert( cols > 0 && rows > 0 && anchor.inside(cv::Rect(0,0,cols,rows)) &&
-               (shape != CV_SHAPE_CUSTOM || values != 0));
-
-    int i, size = rows * cols;
-    int element_size = sizeof(IplConvKernel) + size*sizeof(int);
-    IplConvKernel *element = (IplConvKernel*)cvAlloc(element_size + 32);
-
-    element->nCols = cols;
-    element->nRows = rows;
-    element->anchorX = anchorX;
-    element->anchorY = anchorY;
-    element->nShiftR = shape < CV_SHAPE_ELLIPSE ? shape : CV_SHAPE_CUSTOM;
-    element->values = (int*)(element + 1);
-
-    if( shape == CV_SHAPE_CUSTOM )
-    {
-        for( i = 0; i < size; i++ )
-            element->values[i] = values[i];
-    }
-    else
-    {
-        cv::Mat elem = cv::getStructuringElement(shape, ksize, anchor);
-        for( i = 0; i < size; i++ )
-            element->values[i] = elem.ptr()[i];
-    }
-
-    return element;
-}
-
-
-CV_IMPL void
-cvReleaseStructuringElement( IplConvKernel ** element )
-{
-    if( !element )
-        CV_Error( CV_StsNullPtr, "" );
-    cvFree( element );
-}
-
-
-static void convertConvKernel( const IplConvKernel* src, cv::Mat& dst, cv::Point& anchor )
-{
-    if(!src)
-    {
-        anchor = cv::Point(1,1);
-        dst.release();
-        return;
-    }
-    anchor = cv::Point(src->anchorX, src->anchorY);
-    dst.create(src->nRows, src->nCols, CV_8U);
-
-    int i, size = src->nRows*src->nCols;
-    for( i = 0; i < size; i++ )
-        dst.ptr()[i] = (uchar)(src->values[i] != 0);
-}
-
-
-CV_IMPL void
-cvErode( const CvArr* srcarr, CvArr* dstarr, IplConvKernel* element, int iterations )
-{
-    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), kernel;
-    CV_Assert( src.size() == dst.size() && src.type() == dst.type() );
-    cv::Point anchor;
-    convertConvKernel( element, kernel, anchor );
-    cv::erode( src, dst, kernel, anchor, iterations, cv::BORDER_REPLICATE );
-}
-
-
-CV_IMPL void
-cvDilate( const CvArr* srcarr, CvArr* dstarr, IplConvKernel* element, int iterations )
-{
-    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), kernel;
-    CV_Assert( src.size() == dst.size() && src.type() == dst.type() );
-    cv::Point anchor;
-    convertConvKernel( element, kernel, anchor );
-    cv::dilate( src, dst, kernel, anchor, iterations, cv::BORDER_REPLICATE );
-}
-
-
-CV_IMPL void
-cvMorphologyEx( const void* srcarr, void* dstarr, void*,
-                IplConvKernel* element, int op, int iterations )
-{
-    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), kernel;
-    CV_Assert( src.size() == dst.size() && src.type() == dst.type() );
-    cv::Point anchor;
-    IplConvKernel* temp_element = NULL;
-    if (!element)
-    {
-        temp_element = cvCreateStructuringElementEx(3, 3, 1, 1, CV_SHAPE_RECT);
-    } else {
-        temp_element = element;
-    }
-    convertConvKernel( temp_element, kernel, anchor );
-    if (!element)
-    {
-        cvReleaseStructuringElement(&temp_element);
-    }
-    cv::morphologyEx( src, dst, op, kernel, anchor, iterations, cv::BORDER_REPLICATE );
-}
-
-/* End of file. */
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+} // namespace
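For reference, a minimal caller that exercises the dispatch chain kept in morph.dispatch.cpp (OpenCL when the arguments are UMats, then custom HAL, then IPP, then the plain implementation) — a hypothetical snippet, not part of the patch:

    #include <opencv2/imgproc.hpp>

    void gradientDemo(const cv::Mat& src, cv::Mat& dst)
    {
        // 5x5 elliptical structuring element, anchor defaulting to the centre
        cv::Mat se = cv::getStructuringElement(cv::MORPH_ELLIPSE, cv::Size(5, 5));
        // MORPH_GRADIENT is computed as dilate(src) - erode(src)
        cv::morphologyEx(src, dst, cv::MORPH_GRADIENT, se);
    }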