// This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. /* StackBlur - a fast almost Gaussian Blur Theory: http://underdestruction.com/2004/02/25/stackblur-2004 The code has been borrowed from (https://github.com/flozz/StackBlur) and adapted for OpenCV by Zihao Mu. Below is the original copyright */ /* Copyright (c) 2010 Mario Klingemann Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "precomp.hpp" #include "opencv2/core/hal/intrin.hpp" #include using namespace std; #define STACKBLUR_MAX_RADIUS 254 static unsigned short const stackblurMul[255] = { 512,512,456,512,328,456,335,512,405,328,271,456,388,335,292,512, 454,405,364,328,298,271,496,456,420,388,360,335,312,292,273,512, 482,454,428,405,383,364,345,328,312,298,284,271,259,496,475,456, 437,420,404,388,374,360,347,335,323,312,302,292,282,273,265,512, 497,482,468,454,441,428,417,405,394,383,373,364,354,345,337,328, 320,312,305,298,291,284,278,271,265,259,507,496,485,475,465,456, 446,437,428,420,412,404,396,388,381,374,367,360,354,347,341,335, 329,323,318,312,307,302,297,292,287,282,278,273,269,265,261,512, 505,497,489,482,475,468,461,454,447,441,435,428,422,417,411,405, 399,394,389,383,378,373,368,364,359,354,350,345,341,337,332,328, 324,320,316,312,309,305,301,298,294,291,287,284,281,278,274,271, 268,265,262,259,257,507,501,496,491,485,480,475,470,465,460,456, 451,446,442,437,433,428,424,420,416,412,408,404,400,396,392,388, 385,381,377,374,370,367,363,360,357,354,350,347,344,341,338,335, 332,329,326,323,320,318,315,312,310,307,304,302,299,297,294,292, 289,287,285,282,280,278,275,273,271,269,267,265,263,261,259 }; static unsigned char const stackblurShr[255] = { 9, 11, 12, 13, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24 }; namespace cv{ #if CV_SIMD template inline int opRow(const T* , T* , const std::vector& , const float , const int radius, const int CN, const int ) { return radius * CN; } template<> inline int opRow(const uchar* srcPtr, uchar* dstPtr, const std::vector& kVec, const float , const int radius, const int CN, const int widthCN) { int kernelSize = (int)kVec.size(); int i = radius * CN; if (radius > STACKBLUR_MAX_RADIUS) return i; const int mulValTab= stackblurMul[radius]; const int shrValTab= stackblurShr[radius]; const int VEC_LINE = v_uint8::nlanes; if (kernelSize == 3) { v_uint32 v_mulVal = vx_setall_u32(mulValTab); for (; i <= widthCN - VEC_LINE; i += VEC_LINE) { v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; v_expand(vx_load(srcPtr + i - CN), x0l, x0h); v_expand(vx_load(srcPtr + i), x1l, x1h); v_expand(vx_load(srcPtr + i + CN), x2l, x2h); x1l = v_add_wrap(v_add_wrap(x1l, x1l), v_add_wrap(x0l, x2l)); x1h = v_add_wrap(v_add_wrap(x1h, x1h), v_add_wrap(x0h, x2h)); v_uint32 y00, y01, y10, y11; v_expand(x1l, y00, y01); v_expand(x1h, y10, y11); y00 = (y00 * v_mulVal)>>shrValTab; y01 = (y01 * v_mulVal)>>shrValTab; y10 = (y10 * v_mulVal)>>shrValTab; y11 = (y11 * v_mulVal)>>shrValTab; v_store(dstPtr + i, v_pack(v_pack(y00, y01), v_pack(y10, y11))); } } else { const ushort * kx = kVec.data() + kernelSize/2; v_int32 v_mulVal = vx_setall_s32(mulValTab); v_int16 k0 = vx_setall_s16((short)(kx[0])); srcPtr += i; for( ; i <= widthCN - VEC_LINE; i += VEC_LINE, srcPtr += VEC_LINE) { v_uint8 v_src = vx_load(srcPtr); v_int32 s0, s1, s2, s3; v_mul_expand(v_reinterpret_as_s16(v_expand_low(v_src)), k0, s0, s1); v_mul_expand(v_reinterpret_as_s16(v_expand_high(v_src)), k0, s2, s3); int k = 1, j = CN; for (; k <= kernelSize / 2 - 1; k += 2, j += 2 * CN) { v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32(((int)kx[k] & 0xFFFF) | ((int)kx[k + 1] << 16))); v_uint8 v_src0 = vx_load(srcPtr - j); v_uint8 v_src1 = vx_load(srcPtr - j - CN); v_uint8 v_src2 = vx_load(srcPtr + j); v_uint8 v_src3 = vx_load(srcPtr + j + CN); v_int16 xl, xh; v_zip(v_reinterpret_as_s16(v_expand_low(v_src0) + v_expand_low(v_src2)), v_reinterpret_as_s16(v_expand_low(v_src1) + v_expand_low(v_src3)), xl, xh); s0 += v_dotprod(xl, k12); s1 += v_dotprod(xh, k12); v_zip(v_reinterpret_as_s16(v_expand_high(v_src0) + v_expand_high(v_src2)), v_reinterpret_as_s16(v_expand_high(v_src1) + v_expand_high(v_src3)), xl, xh); s2 += v_dotprod(xl, k12); s3 += v_dotprod(xh, k12); } if( k < kernelSize / 2 + 1 ) { v_int16 k1 = vx_setall_s16((short)(kx[k])); v_uint8 v_src0 = vx_load(srcPtr - j); v_uint8 v_src1 = vx_load(srcPtr + j); v_int16 xl, xh; v_zip(v_reinterpret_as_s16(v_expand_low(v_src0)), v_reinterpret_as_s16(v_expand_low(v_src1)), xl, xh); s0 += v_dotprod(xl, k1); s1 += v_dotprod(xh, k1); v_zip(v_reinterpret_as_s16(v_expand_high(v_src0)), v_reinterpret_as_s16(v_expand_high(v_src1)), xl, xh); s2 += v_dotprod(xl, k1); s3 += v_dotprod(xh, k1); } s0 = (s0 * v_mulVal)>>shrValTab; s1 = (s1 * v_mulVal)>>shrValTab; s2 = (s2 * v_mulVal)>>shrValTab; s3 = (s3 * v_mulVal)>>shrValTab; v_store(dstPtr + i, v_pack(v_reinterpret_as_u16(v_pack(s0, s1)), v_reinterpret_as_u16(v_pack(s2, s3)))); } } return i; } template<> inline int opRow(const ushort* srcPtr, ushort* dstPtr, const std::vector& kVec, const float , const int radius, const int CN, const int widthCN) { int kernelSize = (int)kVec.size(); int i = radius * CN; if (radius > STACKBLUR_MAX_RADIUS) return i; const int mulValTab= stackblurMul[radius]; const int shrValTab= stackblurShr[radius]; const int VEC_LINE = v_uint16::nlanes; v_uint32 v_mulVal = vx_setall_u32(mulValTab); if (kernelSize == 3) { for (; i <= widthCN - VEC_LINE; i += VEC_LINE) { v_uint32 x0l, x0h, x1l, x1h, x2l, x2h; v_expand(vx_load(srcPtr + i - CN), x0l, x0h); v_expand(vx_load(srcPtr + i), x1l, x1h); v_expand(vx_load(srcPtr + i + CN), x2l, x2h); x1l = v_add(v_add(x1l, x1l), v_add(x0l, x2l)); x1h = v_add(v_add(x1h, x1h), v_add(x0h, x2h)); v_store(dstPtr + i, v_pack((x1l * v_mulVal)>>shrValTab, (x1h * v_mulVal)>>shrValTab)); } } else { const ushort * kx = kVec.data() + kernelSize/2; v_uint16 k0 = vx_setall_u16(kx[0]); srcPtr += i; for( ; i <= widthCN - VEC_LINE; i += VEC_LINE, srcPtr += VEC_LINE) { v_uint16 v_src = vx_load(srcPtr); v_uint32 s0, s1; v_mul_expand(v_src, k0, s0, s1); int k = 1, j = CN; for (; k <= kernelSize / 2 - 1; k += 2, j += 2*CN) { v_uint16 k1 = vx_setall_u16(kx[k]); v_uint16 k2 = vx_setall_u16(kx[k + 1]); v_uint32 y0, y1; v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1); s0 += y0; s1 += y1; v_mul_expand(vx_load(srcPtr - j - CN) + vx_load(srcPtr + j + CN), k2, y0, y1); s0 += y0; s1 += y1; } if( k < kernelSize / 2 + 1 ) { v_uint16 k1 = vx_setall_u16(kx[k]); v_uint32 y0, y1; v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1); s0 += y0; s1 += y1; } s0 = (s0 * v_mulVal)>>shrValTab; s1 = (s1 * v_mulVal)>>shrValTab; v_store(dstPtr + i, v_pack(s0, s1)); } } return i; } template<> inline int opRow(const short* srcPtr, short* dstPtr, const std::vector& kVec, const float , const int radius, const int CN, const int widthCN) { int kernelSize = (int)kVec.size(); int i = radius * CN; if (radius > STACKBLUR_MAX_RADIUS) return i; const int mulValTab= stackblurMul[radius]; const int shrValTab= stackblurShr[radius]; const int VEC_LINE = v_int16::nlanes; v_int32 v_mulVal = vx_setall_s32(mulValTab); if (kernelSize == 3) { for (; i <= widthCN - VEC_LINE; i += VEC_LINE) { v_int32 x0l, x0h, x1l, x1h, x2l, x2h; v_expand(vx_load(srcPtr + i - CN), x0l, x0h); v_expand(vx_load(srcPtr + i), x1l, x1h); v_expand(vx_load(srcPtr + i + CN), x2l, x2h); x1l = v_add(v_add(x1l, x1l), v_add(x0l, x2l)); x1h = v_add(v_add(x1h, x1h), v_add(x0h, x2h)); v_store(dstPtr + i, v_pack((x1l * v_mulVal)>>shrValTab, (x1h * v_mulVal)>>shrValTab)); } } else { const ushort * kx = kVec.data() + kernelSize/2; v_int16 k0 = vx_setall_s16((short)(kx[0])); srcPtr += i; for( ; i <= widthCN - VEC_LINE; i += VEC_LINE, srcPtr += VEC_LINE) { v_int16 v_src = vx_load(srcPtr); v_int32 s0, s1; v_mul_expand(v_src, k0, s0, s1); int k = 1, j = CN; for (; k <= kernelSize / 2 - 1; k += 2, j += 2 * CN) { v_int16 k1 = vx_setall_s16((short)kx[k]); v_int16 k2 = vx_setall_s16((short)kx[k + 1]); v_int32 y0, y1; v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1); s0 += y0; s1 += y1; v_mul_expand(vx_load(srcPtr - j - CN) + vx_load(srcPtr + j + CN), k2, y0, y1); s0 += y0; s1 += y1; } if( k < kernelSize / 2 + 1 ) { v_int16 k1 = vx_setall_s16((short)kx[k]); v_int32 y0, y1; v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1); s0 += y0; s1 += y1; } s0 = (s0 * v_mulVal)>>shrValTab; s1 = (s1 * v_mulVal)>>shrValTab; v_store(dstPtr + i, v_pack(s0, s1)); } } return i; } template<> inline int opRow(const float* srcPtr, float* dstPtr, const std::vector& kVec, const float mulVal, const int radius, const int CN, const int widthCN) { int kernelSize = (int)kVec.size(); int i = radius * CN; v_float32 v_mulVal = vx_setall_f32(mulVal); const int VEC_LINE = v_float32::nlanes; const int VEC_LINE4 = VEC_LINE * 4; if (kernelSize == 3) { for (; i <= widthCN - VEC_LINE4; i += VEC_LINE4) { v_float32 v_srcPtr0 = vx_load(srcPtr + i); v_float32 v_srcPtr1 = vx_load(srcPtr + VEC_LINE + i) ; v_float32 v_srcPtr2 = vx_load(srcPtr + VEC_LINE * 2 + i); v_float32 v_srcPtr3 = vx_load(srcPtr + VEC_LINE * 3 + i); v_float32 v_sumVal0 = v_srcPtr0 + v_srcPtr0 + vx_load(srcPtr + i - CN) + vx_load(srcPtr + i + CN); v_float32 v_sumVal1 = v_srcPtr1 + v_srcPtr1 + vx_load(srcPtr + VEC_LINE + i - CN) + vx_load(srcPtr + VEC_LINE + i + CN); v_float32 v_sumVal2 = v_srcPtr2 + v_srcPtr2 + vx_load(srcPtr + VEC_LINE * 2 + i - CN) + vx_load(srcPtr + VEC_LINE * 2 + i + CN); v_float32 v_sumVal3 = v_srcPtr3 + v_srcPtr3 + vx_load(srcPtr + VEC_LINE * 3 + i - CN) + vx_load(srcPtr + VEC_LINE * 3 + i + CN); v_store(dstPtr + i, v_sumVal0 * v_mulVal); v_store(dstPtr + i + VEC_LINE, v_sumVal1 * v_mulVal); v_store(dstPtr + i + VEC_LINE * 2, v_sumVal2 * v_mulVal); v_store(dstPtr + i + VEC_LINE * 3, v_sumVal3 * v_mulVal); } for (; i <= widthCN - VEC_LINE; i += VEC_LINE) { v_float32 v_srcPtr = vx_load(srcPtr + i); v_float32 v_sumVal = v_srcPtr + v_srcPtr + vx_load(srcPtr + i - CN) + vx_load(srcPtr + i + CN); v_store(dstPtr + i, v_sumVal * v_mulVal); } } else { const ushort * kx = kVec.data() + kernelSize/2; v_float32 k0 = vx_setall_f32((float)(kx[0])); srcPtr += i; for( ; i <= widthCN - VEC_LINE; i += VEC_LINE, srcPtr += VEC_LINE) { v_float32 v_src = vx_load(srcPtr); v_float32 s0; s0 = v_src * k0; int k = 1, j = CN; for (; k <= kernelSize / 2 - 1; k += 2, j += 2 * CN) { v_float32 k1 = vx_setall_f32((float)kx[k]); v_float32 k2 = vx_setall_f32((float)kx[k + 1]); s0 += (vx_load(srcPtr - j) + vx_load(srcPtr + j)) * k1; s0 += (vx_load(srcPtr - j - CN) + vx_load(srcPtr + j + CN)) * k2; } if( k < kernelSize / 2 + 1 ) { v_float32 k1 = vx_setall_f32((float)kx[k]); s0 += (vx_load(srcPtr - j) + vx_load(srcPtr + j)) * k1; } v_store(dstPtr + i, s0 * v_mulVal); } } return i; } template inline int opComputeDiff(const T*& , TBuf*& , const int , const int) { return 0; } template<> inline int opComputeDiff(const uchar*& srcPtr, int*& diff0, const int w, const int CNR1) { int index = 0; const int VEC_LINE_8 = v_uint8::nlanes; const int VEC_LINE_32 = v_int32::nlanes; for (; index <= w - VEC_LINE_8; index += VEC_LINE_8, diff0+=VEC_LINE_8, srcPtr+=VEC_LINE_8) { v_uint16 x0l, x0h, x1l, x1h; v_expand(vx_load(srcPtr + CNR1), x0l, x0h); v_expand(vx_load(srcPtr), x1l, x1h); v_int32 y0, y1, y2, y3; v_expand(v_reinterpret_as_s16(x0l) - v_reinterpret_as_s16(x1l), y0, y1); v_expand(v_reinterpret_as_s16(x0h) - v_reinterpret_as_s16(x1h), y2, y3); v_store(diff0, y0); v_store(diff0 + VEC_LINE_32, y1); v_store(diff0 + VEC_LINE_32 * 2, y2); v_store(diff0 + VEC_LINE_32 * 3, y3); } return index; } #endif template class ParallelStackBlurRow : public ParallelLoopBody { public: ParallelStackBlurRow (const Mat &_src, Mat &_dst, int _radius): src(_src), dst(_dst) ,radius(_radius) { width= dst.cols; wm = width - 1; mulVal = 1.0f / ((radius + 1) * (radius + 1)); CN = src.channels(); } ~ParallelStackBlurRow() {} ParallelStackBlurRow& operator=(const ParallelStackBlurRow &) { return *this; } /* * The idea is as follows: * The stack can be understood as a sliding window of length kernel size. * The sliding window moves one element at a time from left to right. * The sumIn stores the elements added to the stack each time, * and sumOut stores the subtracted elements. Every time stack moves, stack, sumIn and sumOut are updated. * The dst will be calculated using the following formula: * dst[i] = (stack + sumIn - sumOut) / stack_num * In the Row direction, in order to avoid redundant computation, * we save the sumIn - sumOut as a diff vector. * So the new formula is: * dst[i] = (stack + diff[i]) / stack_num. * In practice, we use multiplication and bit shift right to simulate integer division: * dst[i] = ((stack + diff[i]) * mulVal) >> shrVal. * */ virtual void operator ()(const Range& range) const CV_OVERRIDE { const int kernelSize = 2 * radius + 1; if (kernelSize <= 9 && width > kernelSize) // Special branch for small kernel { std::vector kVec; for (int i = 0; i < kernelSize; i++) { if (i <= radius) kVec.push_back(ushort(i + 1)); else kVec.push_back(ushort(2 * radius - i + 1)); } const ushort * kx = kVec.data() + kernelSize/2; for (int row = range.start; row < range.end; row++) { const T* srcPtr = src.ptr(row); T* dstPtr = dst.ptr(row); TBuf sumVal; // init for (int i = 0; i < radius; i++) { for (int ci = 0; ci < CN; ci++) { sumVal = 0; for (int k = 0; k < kernelSize; k++) { int index = std::max(k - radius + i, 0); sumVal += (TBuf)srcPtr[index * CN + ci] * (TBuf)kVec[k]; } dstPtr[i*CN + ci] = (T)(sumVal * mulVal); } } int widthCN = (width - radius) * CN; // middle int wc = radius * CN; #if CV_SIMD wc = opRow(srcPtr, dstPtr, kVec, mulVal, radius, CN, widthCN); #endif for (; wc < widthCN; wc++) { sumVal = srcPtr[wc] * kx[0]; for (int k = 1; k <= radius; k++) sumVal += ((TBuf)(srcPtr[wc + k * CN])+(TBuf)(srcPtr[wc - k * CN])) * (TBuf)kx[k]; dstPtr[wc] = (T)(sumVal * mulVal); } // tail for (int i = wc / CN; i < width; i++) { for (int ci = 0; ci < CN; ci++) { sumVal = 0; for (int k = 0; k < kernelSize; k++) { int index = std::min(k - radius + i, wm); sumVal += (TBuf)srcPtr[index * CN + ci] * (TBuf)kVec[k]; } dstPtr[i*CN + ci] = (T)(sumVal * mulVal); } } } } else { size_t bufSize = CN * (width + radius) * sizeof(TBuf) + 2 * CN * sizeof(TBuf); AutoBuffer _buf(bufSize + 16); uchar* bufptr = alignPtr(_buf.data(), 16); TBuf* diffVal = (TBuf*)bufptr; TBuf* sum = diffVal+CN; TBuf* diff = sum + CN; const int CNR1 = CN * (radius + 1); const int widthCN = (width - radius - 1) * CN; for (int row = range.start; row < range.end; row++) { memset(bufptr, 0, bufSize); const T* srcPtr = src.ptr(row); T* dstPtr = dst.ptr(row); int radiusMul = (radius + 2) * (radius + 1) / 2; for (int ci = 0; ci < CN; ci++) sum[ci] += (TBuf)srcPtr[ci] * radiusMul; // compute diff const T* srcPtr0 = srcPtr; // init for (int i = 0; i < radius; i++) { if (i < wm) srcPtr0 += CN; for (int ci = 0; ci < CN; ci++) { diff[i*CN + ci] = (TBuf)srcPtr0[ci] - (TBuf)srcPtr[ci]; diffVal[ci] += diff[i*CN + ci]; sum[ci] += srcPtr0[ci] * (radius - i); } } // middle auto diff0 = diff + radius * CN; int index = 0; #if CV_SIMD index = opComputeDiff(srcPtr, diff0, widthCN, CNR1); #endif for (; index < widthCN; index++, diff0++, srcPtr++) diff0[0] = (TBuf)(srcPtr[CNR1]) - (TBuf)(srcPtr[0]); // tails srcPtr0 = src.ptr(row) + index; const T* srcPtr1 = src.ptr(row) + (width - 1) * CN; int dist = width - index/CN; for (int r = 0; r < radius; r++, diff0 += CN) { for (int ci = 0; ci < CN; ci++) diff0[ci] = (TBuf)(srcPtr1[ci]) - (TBuf)(srcPtr0[ci]); if (dist >= r) { srcPtr0 += CN; dist--; } } srcPtr = src.ptr(row); diff0 = diff + radius * CN; for (int ci = 0; ci < CN; ci++) diffVal[ci] += diff0[ci]; diff0 += CN; if (CN == 1) { for (int i = 0; i < width; i++, diff0 ++, dstPtr ++, srcPtr ++) { *(dstPtr) = saturate_cast((sum[0] * mulVal)); sum[0] += diffVal[0]; diffVal[0] += (diff0[0] - diff0[-CNR1]); } } else if (CN == 3) { for (int i = 0; i < width; i++, diff0 += CN, dstPtr += CN, srcPtr += CN) { *(dstPtr + 0) = saturate_cast((sum[0] * mulVal)); *(dstPtr + 1) = saturate_cast((sum[1] * mulVal)); *(dstPtr + 2) = saturate_cast((sum[2] * mulVal)); sum[0] += diffVal[0]; sum[1] += diffVal[1]; sum[2] += diffVal[2]; diffVal[0] += (diff0[0] - diff0[0 - CNR1]); diffVal[1] += (diff0[1] - diff0[1 - CNR1]); diffVal[2] += (diff0[2] - diff0[2 - CNR1]); } } else if (CN == 4) { for (int i = 0; i < width; i++, diff0 += CN, dstPtr += CN, srcPtr += CN) { *(dstPtr + 0) = saturate_cast((sum[0] * mulVal)); *(dstPtr + 1) = saturate_cast((sum[1] * mulVal)); *(dstPtr + 2) = saturate_cast((sum[2] * mulVal)); *(dstPtr + 3) = saturate_cast((sum[3] * mulVal)); sum[0] += diffVal[0]; sum[1] += diffVal[1]; sum[2] += diffVal[2]; sum[3] += diffVal[3]; diffVal[0] += (diff0[0] - diff0[0 - CNR1]); diffVal[1] += (diff0[1] - diff0[1 - CNR1]); diffVal[2] += (diff0[2] - diff0[2 - CNR1]); diffVal[3] += (diff0[3] - diff0[3 - CNR1]); } } else { int i = 0; for (; i < width; i++, diff0 += CN, dstPtr += CN, srcPtr += CN) { for (int ci = 0; ci < CN; ci++) { *(dstPtr + ci) = saturate_cast((sum[ci] * mulVal)); sum[ci] += diffVal[ci]; diffVal[ci] += (diff0[ci] - diff0[ci - CNR1]); } } } } } } private: const Mat &src; Mat &dst; int radius; int width; int wm; int CN; float mulVal; }; #if CV_SIMD template inline int opColumn(const T* , T* , T* , TBuf* , TBuf* , TBuf* , const float , const int , const int , const int , const int , const int ) { return 0; } template<> inline int opColumn(const float* srcPtr, float* dstPtr, float* stack, float* sum, float* sumIn, float* sumOut, const float mulVal, const int , const int , const int widthLen, const int ss, const int sp1) { int k = 0; v_float32 v_mulVal = vx_setall_f32(mulVal); const int VEC_LINE = v_float32::nlanes; const int VEC_LINE4 = 4 * VEC_LINE; auto stackStartPtr = stack + ss * widthLen; auto stackSp1Ptr = stack + sp1 * widthLen; for (;k <= widthLen - VEC_LINE4; k += VEC_LINE4) { v_float32 v_sum0 = vx_load(sum + k); v_float32 v_sum1 = vx_load(sum + VEC_LINE + k); v_float32 v_sum2 = vx_load(sum + VEC_LINE * 2 + k); v_float32 v_sum3 = vx_load(sum + VEC_LINE * 3 + k); v_float32 v_sumOut0 = vx_load(sumOut + k); v_float32 v_sumOut1 = vx_load(sumOut + VEC_LINE + k); v_float32 v_sumOut2 = vx_load(sumOut + VEC_LINE * 2 + k); v_float32 v_sumOut3 = vx_load(sumOut + VEC_LINE * 3 + k); v_float32 v_sumIn0 = vx_load(sumIn + k); v_float32 v_sumIn1 = vx_load(sumIn + VEC_LINE + k); v_float32 v_sumIn2 = vx_load(sumIn + VEC_LINE * 2 + k); v_float32 v_sumIn3 = vx_load(sumIn + VEC_LINE * 3+ k); v_store(dstPtr + k, v_sum0 * v_mulVal); v_store(dstPtr + VEC_LINE + k, v_sum1 * v_mulVal); v_store(dstPtr + VEC_LINE * 2 + k, v_sum2 * v_mulVal); v_store(dstPtr + VEC_LINE * 3 + k, v_sum3 * v_mulVal); v_sum0 -= v_sumOut0; v_sum1 -= v_sumOut1; v_sum2 -= v_sumOut2; v_sum3 -= v_sumOut3; v_sumOut0 -= vx_load(stackStartPtr + k); v_sumOut1 -= vx_load(stackStartPtr + VEC_LINE + k); v_sumOut2 -= vx_load(stackStartPtr + VEC_LINE * 2 + k); v_sumOut3 -= vx_load(stackStartPtr + VEC_LINE * 3 + k); v_float32 v_srcPtr0 = vx_load(srcPtr + k); v_float32 v_srcPtr1 = vx_load(srcPtr + VEC_LINE + k); v_float32 v_srcPtr2 = vx_load(srcPtr + VEC_LINE * 2 + k); v_float32 v_srcPtr3 = vx_load(srcPtr + VEC_LINE * 3 + k); v_store(stackStartPtr + k, v_srcPtr0); v_store(stackStartPtr + VEC_LINE + k, v_srcPtr1); v_store(stackStartPtr + VEC_LINE * 2 + k, v_srcPtr2); v_store(stackStartPtr + VEC_LINE * 3 + k, v_srcPtr3); v_sumIn0 += v_srcPtr0; v_sumIn1 += v_srcPtr1; v_sumIn2 += v_srcPtr2; v_sumIn3 += v_srcPtr3; v_store(sum + k, v_sum0 + v_sumIn0); v_store(sum + VEC_LINE + k, v_sum1 + v_sumIn1); v_store(sum + VEC_LINE * 2 + k, v_sum2 + v_sumIn2); v_store(sum + VEC_LINE * 3 + k, v_sum3 + v_sumIn3); v_srcPtr0 = vx_load(stackSp1Ptr + k); v_srcPtr1 = vx_load(stackSp1Ptr + VEC_LINE + k); v_srcPtr2 = vx_load(stackSp1Ptr + VEC_LINE * 2 + k); v_srcPtr3 = vx_load(stackSp1Ptr + VEC_LINE * 3 + k); v_sumOut0 += v_srcPtr0; v_sumOut1 += v_srcPtr1; v_sumOut2 += v_srcPtr2; v_sumOut3 += v_srcPtr3; v_store(sumOut + k, v_sumOut0); v_store(sumOut + VEC_LINE + k, v_sumOut1); v_store(sumOut + VEC_LINE * 2 + k, v_sumOut2); v_store(sumOut + VEC_LINE * 3 + k, v_sumOut3); v_sumIn0 -= v_srcPtr0; v_sumIn1 -= v_srcPtr1; v_sumIn2 -= v_srcPtr2; v_sumIn3 -= v_srcPtr3; v_store(sumIn + k, v_sumIn0); v_store(sumIn + VEC_LINE + k, v_sumIn1); v_store(sumIn + VEC_LINE * 2 + k, v_sumIn2); v_store(sumIn + VEC_LINE * 3 + k, v_sumIn3); } for (;k <= widthLen - VEC_LINE; k += VEC_LINE) { v_float32 v_sum = vx_load(sum + k); v_float32 v_sumOut = vx_load(sumOut + k); v_float32 v_sumIn = vx_load(sumIn + k); v_store(dstPtr + k, v_sum * v_mulVal); v_sum -= v_sumOut; v_sumOut -= vx_load(stackStartPtr + k); v_float32 v_srcPtr = vx_load(srcPtr + k); v_store(stackStartPtr + k, v_srcPtr); v_sumIn += v_srcPtr; v_store(sum + k, v_sum + v_sumIn); v_srcPtr = vx_load(stackSp1Ptr + k); v_sumOut += v_srcPtr; v_store(sumOut + k, v_sumOut); v_sumIn -= v_srcPtr; v_store(sumIn + k, v_sumIn); } return k; } template<> inline int opColumn(const uchar* srcPtr, uchar* dstPtr, uchar* stack, int* sum, int* sumIn, int* sumOut, const float , const int mulValTab, const int shrValTab, const int widthLen, const int ss, const int sp1) { int k = 0; if (mulValTab != 0 && shrValTab != 0) { const int VEC_LINE_8 = v_uint8::nlanes; const int VEC_LINE_32 = v_int32::nlanes; v_int32 v_mulVal = vx_setall_s32(mulValTab); auto stackStartPtr = stack + ss * widthLen; auto stackSp1Ptr = stack + sp1 * widthLen; for (;k <= widthLen - VEC_LINE_8; k += VEC_LINE_8) { v_int32 v_sum0, v_sum1, v_sum2, v_sum3; v_int32 v_sumIn0, v_sumIn1, v_sumIn2, v_sumIn3; v_int32 v_sumOut0, v_sumOut1, v_sumOut2, v_sumOut3; v_sum0 = vx_load(sum + k); v_sum1 = vx_load(sum + k + VEC_LINE_32); v_sum2 = vx_load(sum + k + VEC_LINE_32 * 2); v_sum3 = vx_load(sum + k + VEC_LINE_32 * 3); v_sumIn0 = vx_load(sumIn + k); v_sumIn1 = vx_load(sumIn + k + VEC_LINE_32); v_sumIn2 = vx_load(sumIn + k + VEC_LINE_32 * 2); v_sumIn3 = vx_load(sumIn + k + VEC_LINE_32 * 3); v_sumOut0 = vx_load(sumOut + k); v_sumOut1 = vx_load(sumOut + k + VEC_LINE_32); v_sumOut2 = vx_load(sumOut + k + VEC_LINE_32 * 2); v_sumOut3 = vx_load(sumOut + k + VEC_LINE_32 * 3); v_store(dstPtr + k, v_pack( v_reinterpret_as_u16(v_pack((v_sum0 * v_mulVal)>>shrValTab, (v_sum1 * v_mulVal)>>shrValTab)), v_reinterpret_as_u16(v_pack((v_sum2 * v_mulVal)>>shrValTab, (v_sum3 * v_mulVal)>>shrValTab)))); v_sum0 -= v_sumOut0; v_sum1 -= v_sumOut1; v_sum2 -= v_sumOut2; v_sum3 -= v_sumOut3; v_uint16 x0l, x0h; v_int32 v_ss0, v_ss1, v_ss2, v_ss3; v_expand(vx_load(stackStartPtr + k), x0l, x0h); v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1); v_expand(v_reinterpret_as_s16(x0h), v_ss2, v_ss3); v_sumOut0 -= v_ss0; v_sumOut1 -= v_ss1; v_sumOut2 -= v_ss2; v_sumOut3 -= v_ss3; v_expand(vx_load(srcPtr + k), x0l, x0h); v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1); v_expand(v_reinterpret_as_s16(x0h), v_ss2, v_ss3); memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_8 * sizeof (uchar)); v_sumIn0 += v_ss0; v_sumIn1 += v_ss1; v_sumIn2 += v_ss2; v_sumIn3 += v_ss3; v_store(sum + k, v_sum0 + v_sumIn0); v_store(sum + VEC_LINE_32 + k, v_sum1 + v_sumIn1); v_store(sum + VEC_LINE_32 * 2 + k, v_sum2 + v_sumIn2); v_store(sum + VEC_LINE_32 * 3 + k, v_sum3 + v_sumIn3); v_expand(vx_load(stackSp1Ptr + k), x0l, x0h); v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1); v_expand(v_reinterpret_as_s16(x0h), v_ss2, v_ss3); v_sumOut0 += v_ss0; v_sumOut1 += v_ss1; v_sumOut2 += v_ss2; v_sumOut3 += v_ss3; v_store(sumOut + k, v_sumOut0); v_store(sumOut + VEC_LINE_32 + k, v_sumOut1); v_store(sumOut + VEC_LINE_32 * 2 + k, v_sumOut2); v_store(sumOut + VEC_LINE_32 * 3 + k, v_sumOut3); v_sumIn0 -= v_ss0; v_sumIn1 -= v_ss1; v_sumIn2 -= v_ss2; v_sumIn3 -= v_ss3; v_store(sumIn + k, v_sumIn0); v_store(sumIn + VEC_LINE_32 + k, v_sumIn1); v_store(sumIn + VEC_LINE_32 * 2 + k, v_sumIn2); v_store(sumIn + VEC_LINE_32 * 3 + k, v_sumIn3); } } return k; } template<> inline int opColumn(const short* srcPtr, short* dstPtr, short* stack, int* sum, int* sumIn, int* sumOut, const float , const int mulValTab, const int shrValTab, const int widthLen, const int ss, const int sp1) { int k = 0; if (mulValTab != 0 && shrValTab != 0) { const int VEC_LINE_16 = v_int16::nlanes; const int VEC_LINE_32 = v_int32::nlanes; v_int32 v_mulVal = vx_setall_s32(mulValTab); auto stackStartPtr = stack + ss * widthLen; auto stackSp1Ptr = stack + sp1 * widthLen; for (;k <= widthLen - VEC_LINE_16; k += VEC_LINE_16) { v_int32 v_sum0, v_sum1; v_int32 v_sumIn0, v_sumIn1; v_int32 v_sumOut0, v_sumOut1; v_sum0 = vx_load(sum + k); v_sum1 = vx_load(sum + k + VEC_LINE_32); v_sumIn0 = vx_load(sumIn + k); v_sumIn1 = vx_load(sumIn + k + VEC_LINE_32); v_sumOut0 = vx_load(sumOut + k); v_sumOut1 = vx_load(sumOut + k + VEC_LINE_32); v_store(dstPtr + k,v_pack((v_sum0 * v_mulVal)>>shrValTab, (v_sum1 * v_mulVal)>>shrValTab)); v_sum0 -= v_sumOut0; v_sum1 -= v_sumOut1; v_int32 v_ss0, v_ss1; v_expand(vx_load(stackStartPtr + k), v_ss0, v_ss1); v_sumOut0 -= v_ss0; v_sumOut1 -= v_ss1; v_expand(vx_load(srcPtr + k), v_ss0, v_ss1); memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_16 * sizeof (short)); v_sumIn0 += v_ss0; v_sumIn1 += v_ss1; v_sum0 += v_sumIn0; v_sum1 += v_sumIn1; v_store(sum + k, v_sum0); v_store(sum + VEC_LINE_32 + k, v_sum1); v_expand(vx_load(stackSp1Ptr + k), v_ss0, v_ss1); v_sumOut0 += v_ss0; v_sumOut1 += v_ss1; v_store(sumOut + k, v_sumOut0); v_store(sumOut + VEC_LINE_32 + k, v_sumOut1); v_sumIn0 -= v_ss0; v_sumIn1 -= v_ss1; v_store(sumIn + k, v_sumIn0); v_store(sumIn + VEC_LINE_32 + k, v_sumIn1); } } return k; } template<> inline int opColumn(const ushort* srcPtr, ushort* dstPtr, ushort* stack, int* sum, int* sumIn, int* sumOut, const float , const int mulValTab, const int shrValTab, const int widthLen, const int ss, const int sp1) { int k = 0; if (mulValTab != 0 && shrValTab != 0) { const int VEC_LINE_16 = v_uint16::nlanes; const int VEC_LINE_32 = v_int32::nlanes; v_uint32 v_mulVal = vx_setall_u32((uint32_t)mulValTab); auto stackStartPtr = stack + ss * widthLen; auto stackSp1Ptr = stack + sp1 * widthLen; for (;k <= widthLen - VEC_LINE_16; k += VEC_LINE_16) { v_int32 v_sum0, v_sum1; v_int32 v_sumIn0, v_sumIn1; v_int32 v_sumOut0, v_sumOut1; v_sum0 = vx_load(sum + k); v_sum1 = vx_load(sum + k + VEC_LINE_32); v_sumIn0 = vx_load(sumIn + k); v_sumIn1 = vx_load(sumIn + k + VEC_LINE_32); v_sumOut0 = vx_load(sumOut + k); v_sumOut1 = vx_load(sumOut + k + VEC_LINE_32); v_store(dstPtr + k, v_pack((v_reinterpret_as_u32(v_sum0) * v_mulVal)>>shrValTab, (v_reinterpret_as_u32(v_sum1) * v_mulVal)>>shrValTab)); v_sum0 -= v_sumOut0; v_sum1 -= v_sumOut1; v_uint32 v_ss0, v_ss1; v_expand(vx_load(stackStartPtr + k), v_ss0, v_ss1); v_sumOut0 -= v_reinterpret_as_s32(v_ss0); v_sumOut1 -= v_reinterpret_as_s32(v_ss1); v_expand(vx_load(srcPtr + k), v_ss0, v_ss1); memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_16 * sizeof (ushort)); v_sumIn0 += v_reinterpret_as_s32(v_ss0); v_sumIn1 += v_reinterpret_as_s32(v_ss1); v_sum0 += v_sumIn0; v_sum1 += v_sumIn1; v_store(sum + k, v_sum0); v_store(sum + VEC_LINE_32 + k, v_sum1); v_expand(vx_load(stackSp1Ptr + k), v_ss0, v_ss1); v_sumOut0 += v_reinterpret_as_s32(v_ss0); v_sumOut1 += v_reinterpret_as_s32(v_ss1); v_store(sumOut + k, v_sumOut0); v_store(sumOut + VEC_LINE_32 + k, v_sumOut1); v_sumIn0 -= v_reinterpret_as_s32(v_ss0); v_sumIn1 -= v_reinterpret_as_s32(v_ss1); v_store(sumIn + k, v_sumIn0); v_store(sumIn + VEC_LINE_32 + k, v_sumIn1); } } return k; } #endif template class ParallelStackBlurColumn: public ParallelLoopBody { public: ParallelStackBlurColumn (const Mat & _src, Mat &_dst, int _radius):src(_src), dst(_dst) ,radius(_radius) { CN = src.channels(); widthElem = CN * src.cols; height = src.rows; hm = src.rows - 1; mulVal = 1.0f / ((radius + 1)*(radius + 1)); if (radius <= STACKBLUR_MAX_RADIUS) { shrValTab = stackblurShr[radius]; mulValTab = stackblurMul[radius]; } else { shrValTab = 0; mulValTab = 0; } } ~ParallelStackBlurColumn() {} ParallelStackBlurColumn& operator=(const ParallelStackBlurColumn &) { return *this; } virtual void operator ()(const Range& range) const CV_OVERRIDE { if (radius == 0) return; const int kernelSize = 2 * radius + 1; int widthImg = std::min(range.end, src.cols * CN); int widthLen = widthImg - range.start; size_t bufSize = 3 * widthLen * sizeof(TBuf) + kernelSize * widthLen * sizeof(T); AutoBuffer _buf(bufSize + 16); uchar* bufptr = alignPtr(_buf.data(), 16); TBuf* sum = (TBuf *)bufptr; TBuf* sumIn = sum + widthLen; TBuf* sumOut = sumIn + widthLen; T* stack = (T* )(sumOut + widthLen); memset(bufptr, 0, bufSize); const T* srcPtr =dst.ptr() + range.start; for (int i = 0; i <= radius; i++) { for (int k = 0; k < widthLen; k++) { stack[i * widthLen + k] = *(srcPtr + k); sum[k] += *(srcPtr + k) * (i + 1); sumOut[k] += *(srcPtr + k); } } for (int i = 1; i <= radius; i++) { if (i <= hm) srcPtr += widthElem; for (int k = 0; k < widthLen; k++) { T tmp = *(srcPtr + k); stack[(i + radius) * widthLen + k] = tmp; sum[k] += tmp * (radius - i + 1); sumIn[k] += tmp; } } int sp = radius; int yp = radius; if (yp > hm) yp = hm; T* dstPtr = dst.ptr() + range.start; srcPtr = dst.ptr(yp) + range.start; int stackStart = 0; for(int i = 0; i < height; i++) { stackStart = sp + kernelSize - radius; if (stackStart >= kernelSize) stackStart -= kernelSize; int sp1 = sp + 1; sp1 &= -(sp1 < kernelSize); if (yp < hm) { yp++; srcPtr += widthElem; } int k = 0; #if CV_SIMD k = opColumn(srcPtr, dstPtr, stack, sum, sumIn, sumOut, mulVal, mulValTab, shrValTab, widthLen, stackStart, sp1); #endif for (; k < widthLen; k++) { *(dstPtr + k) = static_cast(sum[k] * mulVal); sum[k] -= sumOut[k]; sumOut[k] -= stack[stackStart * widthLen + k]; stack[stackStart * widthLen + k] = *(srcPtr + k); sumIn[k] += *(srcPtr + k); sum[k] += sumIn[k]; sumOut[k] += stack[sp1 * widthLen + k]; sumIn[k] -= stack[sp1 * widthLen + k]; } dstPtr += widthElem; ++sp; if (sp >= kernelSize) sp = 0; } } private: const Mat &src; Mat &dst; int radius; int CN; int height; int widthElem; int hm; float mulVal; int mulValTab; int shrValTab; }; void stackBlur(InputArray _src, OutputArray _dst, Size ksize) { CV_INSTRUMENT_REGION(); CV_Assert(!_src.empty()); CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1 ); int radiusH = ksize.height / 2; int radiusW = ksize.width / 2; int stype = _src.type(), sdepth = _src.depth(); Mat src = _src.getMat(); if (ksize.width == 1) { _src.copyTo(_dst); if (ksize.height == 1) return; } else { _dst.create( src.size(), stype); } Mat dst = _dst.getMat(); int numOfThreads = getNumThreads(); int widthElem = src.cols * src.channels(); if (dst.rows / numOfThreads < 3) numOfThreads = std::max(1, dst.rows / 3); if (sdepth == CV_8U) { if (ksize.width != 1) parallel_for_(Range(0, src.rows), ParallelStackBlurRow(src, dst, radiusW), numOfThreads); if (ksize.height != 1) parallel_for_(Range(0, widthElem), ParallelStackBlurColumn(dst, dst, radiusH), numOfThreads); } else if (sdepth == CV_16S) { if (ksize.width != 1) parallel_for_(Range(0, src.rows), ParallelStackBlurRow(src, dst, radiusW), numOfThreads); if (ksize.height != 1) parallel_for_(Range(0, widthElem), ParallelStackBlurColumn(dst, dst, radiusH), numOfThreads); } else if (sdepth == CV_16U) { if (ksize.width != 1) parallel_for_(Range(0, src.rows), ParallelStackBlurRow(src, dst, radiusW), numOfThreads); if (ksize.height != 1) parallel_for_(Range(0, widthElem), ParallelStackBlurColumn(dst, dst, radiusH), numOfThreads); } else if (sdepth == CV_32F) { if (ksize.width != 1) parallel_for_(Range(0, src.rows), ParallelStackBlurRow(src, dst, radiusW), numOfThreads); if (ksize.height != 1) parallel_for_(Range(0, widthElem), ParallelStackBlurColumn(dst, dst, radiusH), numOfThreads); } else CV_Error_( CV_StsNotImplemented, ("Unsupported input format in StackBlur, the supported formats are: CV_8U, CV_16U, CV_16S and CV_32F.")); } } //namespace