diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
index e4d13af1a2..bda1d8558f 100644
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -346,11 +346,37 @@ OPENCV_HAL_IMPL_VSX_EXPAND(v_int16x8, v_int32x4, short, vec_unpackl, vec_unpackh
 OPENCV_HAL_IMPL_VSX_EXPAND(v_uint32x4, v_uint64x2, uint, vec_unpacklu, vec_unpackhu)
 OPENCV_HAL_IMPL_VSX_EXPAND(v_int32x4, v_int64x2, int, vec_unpackl, vec_unpackh)
 
+/* Load a 4-byte value and zero-extend it into the second doubleword; the first doubleword is don't-care. */
+#if !defined(CV_COMPILER_VSX_BROKEN_ASM)
+    #define _LXSIWZX(out, ptr, T) __asm__ ("lxsiwzx %x0, 0, %1\r\n" : "=wa"(out) : "r" (ptr) : "memory");
+#else
+    /* This fallback is compiler-agnostic, but introduces an unneeded splat on the critical path. */
+    #define _LXSIWZX(out, ptr, T) out = (T)vec_udword2_sp(*(uint32_t*)(ptr));
+#endif
+
 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
-{ return v_uint32x4(vec_uint4_set(ptr[0], ptr[1], ptr[2], ptr[3])); }
+{
+    // Zero-extend the four bytes with a single permute instead of unpacking; usually faster in small kernels.
+    // The load itself is zero-extended, so byte 12 of 'out' is known to be zero and serves as the permute's zero filler.
+    vec_uchar16 pmu = {8, 12, 12, 12, 9, 12, 12, 12, 10, 12, 12, 12, 11, 12, 12, 12};
+    vec_uchar16 out;
+
+    _LXSIWZX(out, ptr, vec_uchar16);
+    out = vec_perm(out, out, pmu);
+    return v_uint32x4((vec_uint4)out);
+}
 
 inline v_int32x4 v_load_expand_q(const schar* ptr)
-{ return v_int32x4(vec_int4_set(ptr[0], ptr[1], ptr[2], ptr[3])); }
+{
+    vec_char16 out;
+    vec_short8 outs;
+    vec_int4 outw;
+
+    _LXSIWZX(out, ptr, vec_char16);
+    outs = vec_unpackl(out);
+    outw = vec_unpackh(outs);
+    return v_int32x4(outw);
+}
 
 /* pack */
 #define OPENCV_HAL_IMPL_VSX_PACK(_Tpvec, _Tp, _Tpwvec, _Tpvn, _Tpdel, sfnc, pkfnc, addfnc, pack) \
diff --git a/modules/imgproc/perf/perf_resize.cpp b/modules/imgproc/perf/perf_resize.cpp
index 0705108503..236955bd66 100644
--- a/modules/imgproc/perf/perf_resize.cpp
+++ b/modules/imgproc/perf/perf_resize.cpp
@@ -7,6 +7,31 @@ namespace opencv_test {
 
 typedef tuple<MatType, Size, Size> MatInfo_Size_Size_t;
 typedef TestBaseWithParam<MatInfo_Size_Size_t> MatInfo_Size_Size;
+typedef tuple<Size, Size> Size_Size_t;
+typedef tuple<MatType, Size_Size_t> MatInfo_SizePair_t;
+typedef TestBaseWithParam<MatInfo_SizePair_t> MatInfo_SizePair;
+
+#define MATTYPE_NE_VALUES CV_8UC1,  CV_8UC2,  CV_8UC3,  CV_8UC4, \
+                          CV_16UC1, CV_16UC2, CV_16UC3, CV_16UC4, \
+                          CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4
+
+// For gradient-ish testing of the other matrix formats
+template <typename T>
+static void fillFPGradient(Mat& img)
+{
+    const int ch = img.channels();
+
+    int r, c, i;
+    for(r=0; r<img.rows; r++)
+    {
+        for(c=0; c<img.cols; c++)
+        {
+            T *pix = (T*)img.ptr(r, c);
+            for(i=0; i<ch; i++)
+                pix[i] = (T)(r + c + i);
+        }
+    }
+}
 
 PERF_TEST_P(MatInfo_Size_Size, resizeUpLinear,
             testing::Values(
@@ -33,6 +58,33 @@ PERF_TEST_P(MatInfo_Size_Size, resizeUpLinear,
 #endif
 }
 
+PERF_TEST_P(MatInfo_SizePair, resizeUpLinearNonExact,
+            testing::Combine
+            (
+                testing::Values( MATTYPE_NE_VALUES ),
+                testing::Values( Size_Size_t(szVGA, szqHD), Size_Size_t(szVGA, sz720p) )
+            )
+            )
+{
+    int matType = get<0>(GetParam());
+    Size_Size_t sizes = get<1>(GetParam());
+    Size from = get<0>(sizes);
+    Size to = get<1>(sizes);
+
+    cv::Mat src(from, matType), dst(to, matType);
+    switch(src.depth())
+    {
+        case CV_8U: cvtest::fillGradient(src); break;
+        case CV_16U: fillFPGradient<ushort>(src); break;
+        case CV_32F: fillFPGradient<float>(src); break;
+    }
+    declare.in(src).out(dst);
+
+    TEST_CYCLE_MULTIRUN(10) resize(src, dst, to, 0, 0, INTER_LINEAR);
+
+    SANITY_CHECK_NOTHING();
+}
+
 PERF_TEST_P(MatInfo_Size_Size, resizeDownLinear,
             testing::Values(
                 MatInfo_Size_Size_t(CV_8UC1, szVGA, szQVGA),
@@ -80,6 +132,40 @@ PERF_TEST_P(MatInfo_Size_Size, resizeDownLinear,
 #endif
 }
 
+PERF_TEST_P(MatInfo_SizePair, resizeDownLinearNonExact,
+            testing::Combine
+            (
+                testing::Values( MATTYPE_NE_VALUES ),
+                testing::Values
+                (
+                    Size_Size_t(szVGA, szQVGA),
+                    Size_Size_t(szqHD, szVGA),
+                    Size_Size_t(sz720p, Size(120 * sz720p.width / sz720p.height, 120)),
+                    Size_Size_t(sz720p, szVGA),
+                    Size_Size_t(sz720p, szQVGA)
+                )
+            )
+            )
+{
+    int matType = get<0>(GetParam());
+    Size_Size_t sizes = get<1>(GetParam());
+    Size from = get<0>(sizes);
+    Size to = get<1>(sizes);
+
+    cv::Mat src(from, matType), dst(to, matType);
+    switch(src.depth())
+    {
+        case CV_8U: cvtest::fillGradient(src); break;
+        case CV_16U: fillFPGradient<ushort>(src); break;
+        case CV_32F: fillFPGradient<float>(src); break;
+    }
+    declare.in(src).out(dst);
+
+    TEST_CYCLE_MULTIRUN(10) resize(src, dst, to, 0, 0, INTER_LINEAR);
+
+    SANITY_CHECK_NOTHING();
+}
+
 typedef tuple<MatType, Size, double> MatInfo_Size_Scale_t;
 typedef TestBaseWithParam<MatInfo_Size_Scale_t> MatInfo_Size_Scale;
 
diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp
index b0283e5ca4..56f06ac4b7 100644
--- a/modules/imgproc/src/resize.cpp
+++ b/modules/imgproc/src/resize.cpp
@@ -1481,10 +1481,320 @@ typedef VResizeNoVec VResizeLanczos4Vec_32f;
 
 #endif
 
+#if CV_SIMD128
+
+// 4-lane horizontal linear pass for float/ushort/short sources: gather four source
+// pairs per iteration and apply the two-tap filter with deinterleaved coefficients.
+template <typename ST, typename DT, typename AT, typename DVT>
+struct HResizeLinearVec_X4
+{
+    int operator()(const uchar** _src, uchar** _dst, int count, const int* xofs,
+        const uchar* _alpha, int, int, int cn, int, int xmax) const
+    {
+        const ST **src = (const ST**)_src;
+        const AT *alpha = (const AT*)_alpha;
+        DT **dst = (DT**)_dst;
+        const int nlanes = 4;
+        const int len0 = xmax & -nlanes;
+        int dx = 0, k = 0;
+
+        for( ; k <= (count - 2); k+=2 )
+        {
+            const ST *S0 = src[k];
+            DT *D0 = dst[k];
+            const ST *S1 = src[k+1];
+            DT *D1 = dst[k+1];
+
+            for( dx = 0; dx < len0; dx += nlanes )
+            {
+                int sx0 = xofs[dx+0];
+                int sx1 = xofs[dx+1];
+                int sx2 = xofs[dx+2];
+                int sx3 = xofs[dx+3];
+                DVT a_even;
+                DVT a_odd;
+
+                v_load_deinterleave(&alpha[dx*2], a_even, a_odd);
+                DVT s0(S0[sx0], S0[sx1], S0[sx2], S0[sx3]);
+                DVT s1(S0[sx0+cn], S0[sx1+cn], S0[sx2+cn], S0[sx3+cn]);
+                DVT s0_u(S1[sx0], S1[sx1], S1[sx2], S1[sx3]);
+                DVT s1_u(S1[sx0+cn], S1[sx1+cn], S1[sx2+cn], S1[sx3+cn]);
+                v_store(&D1[dx], s0_u * a_even + s1_u * a_odd);
+                v_store(&D0[dx], s0 * a_even + s1 * a_odd);
+            }
+        }
+        for( ; k < count; k++ )
+        {
+            const ST *S = src[k];
+            DT *D = dst[k];
+            for( dx = 0; dx < len0; dx += nlanes )
+            {
+                int sx0 = xofs[dx+0];
+                int sx1 = xofs[dx+1];
+                int sx2 = xofs[dx+2];
+                int sx3 = xofs[dx+3];
+                DVT a_even;
+                DVT a_odd;
+
+                v_load_deinterleave(&alpha[dx*2], a_even, a_odd);
+                DVT s0(S[sx0], S[sx1], S[sx2], S[sx3]);
+                DVT s1(S[sx0+cn], S[sx1+cn], S[sx2+cn], S[sx3+cn]);
+                v_store(&D[dx], s0 * a_even + s1 * a_odd);
+            }
+        }
+        return dx;
+    }
+};
+
+// 8-bit specialization: 16-bit dot products compute both filter taps at once,
+// with a separate gather strategy per channel count.
+struct HResizeLinearVecU8_X4
+{
+    int operator()(const uchar** src, uchar** _dst, int count, const int* xofs,
+        const uchar* _alpha, int, int, int cn, int, int xmax) const
+    {
+        const short *alpha = (const short*)_alpha;
+        int **dst = (int**)_dst;
+        int dx = 0, k = 0;
+
+        if(cn == 1)
+        {
+            const int step = 8;
+            const int len0 = xmax & -step;
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 al = v_load(alpha+dx*2);
+                    v_int16x8 ah = v_load(alpha+dx*2+8);
+                    v_uint16x8 sl, sh;
+                    v_expand(v_lut_pairs(S0, xofs+dx), sl, sh);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                    v_expand(v_lut_pairs(S1, xofs+dx), sl, sh);
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 al = v_load(alpha+dx*2);
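+                    // 'al' holds interleaved (left, right) coefficient pairs for outputs
+                    // dx..dx+3; 'ah' below covers dx+4..dx+7. Each v_dotprod then yields
+                    // S[xofs[dx+i]]*alpha[2*(dx+i)] + S[xofs[dx+i]+1]*alpha[2*(dx+i)+1] per 32-bit lane.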
+                    v_int16x8 ah = v_load(alpha+dx*2+8);
+                    v_uint16x8 sl, sh;
+                    v_expand(v_lut_pairs(S, xofs+dx), sl, sh);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                }
+            }
+        }
+        else if(cn == 2)
+        {
+            const int step = 8;
+            const int len0 = xmax & -step;
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 al = v_load(alpha+dx*2);
+                    v_int16x8 ah = v_load(alpha+dx*2+8);
+                    v_uint16x8 sl, sh;
+                    v_expand(v_interleave_pairs(v_lut_quads(S0, xofs+dx)), sl, sh);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                    v_expand(v_interleave_pairs(v_lut_quads(S1, xofs+dx)), sl, sh);
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 al = v_load(alpha+dx*2);
+                    v_int16x8 ah = v_load(alpha+dx*2+8);
+                    v_uint16x8 sl, sh;
+                    v_expand(v_interleave_pairs(v_lut_quads(S, xofs+dx)), sl, sh);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                }
+            }
+        }
+        else if(cn == 3)
+        {
+            /* Each iteration yields one 3-channel pixel (3 valid lanes); the 4-lane store
+               overlaps the next pixel, so len0 stops a full step short of xmax. */
+            const int step = 4;
+            const int len0 = xmax - step;
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += 3*step/4 )
+                {
+                    v_int16x8 a = v_load(alpha+dx*2);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S0+xofs[dx]) | (v_load_expand_q(S0+xofs[dx]+cn)<<16)), a));
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S1+xofs[dx]) | (v_load_expand_q(S1+xofs[dx]+cn)<<16)), a));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += 3*step/4 )
+                {
+                    v_int16x8 a = v_load(alpha+dx*2);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S+xofs[dx]) | (v_load_expand_q(S+xofs[dx]+cn)<<16)), a));
+                }
+            }
+        }
+        else if(cn == 4)
+        {
+            const int step = 4;
+            const int len0 = xmax & -step;
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 a = v_load(alpha+dx*2);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_interleave_quads(v_load_expand(S0+xofs[dx]))), a));
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_interleave_quads(v_load_expand(S1+xofs[dx]))), a));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 a = v_load(alpha+dx*2);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_interleave_quads(v_load_expand(S+xofs[dx]))), a));
+                }
+            }
+        }
+        else if(cn < 9)
+        {
+            const int step = 8;
+            const int len0 = xmax & -step;
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                /* Only the first cn of the 8 computed lanes are valid, so advance by cn
+                   and let the next iteration overwrite the remainder. */
+                for( dx = 0; dx < len0; dx += cn )
+                {
+                    v_int16x8 a0 = v_load(alpha+dx*2);
+                    v_int16x8 a1 = v_load(alpha+dx*2 + 8);
+                    v_uint16x8 s0, s1;
+                    v_zip(v_load_expand(S0+xofs[dx]), v_load_expand(S0+xofs[dx]+cn), s0, s1);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(s0), a0));
+                    v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(s1), a1));
+                    v_zip(v_load_expand(S1+xofs[dx]), v_load_expand(S1+xofs[dx]+cn), s0, s1);
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(s0), a0));
+                    v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(s1), a1));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += cn )
+                {
+                    v_int16x8 a0 = v_load(alpha+dx*2);
+                    v_int16x8 a1 = v_load(alpha+dx*2 + 8);
+                    v_uint16x8 s0, s1;
+                    v_zip(v_load_expand(S+xofs[dx]), v_load_expand(S+xofs[dx]+cn), s0, s1);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(s0), a0));
+                    v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(s1), a1));
+                }
+            }
+        }
+        else
+        {
+            const int step = 16;
+            const int len0 = (xmax - cn) & -step;
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 a0 = v_load(alpha+dx*2);
+                    v_int16x8 a1 = v_load(alpha+dx*2 + 8);
+                    v_int16x8 a2 = v_load(alpha+dx*2 + 16);
+                    v_int16x8 a3 = v_load(alpha+dx*2 + 24);
+                    v_uint8x16 s01, s23;
+                    v_zip(v_lut(S0, xofs+dx), v_lut(S0+cn, xofs+dx), s01, s23);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_expand_low(s01)), a0));
+                    v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(v_expand_high(s01)), a1));
+                    v_store(&D0[dx+8], v_dotprod(v_reinterpret_as_s16(v_expand_low(s23)), a2));
+                    v_store(&D0[dx+12], v_dotprod(v_reinterpret_as_s16(v_expand_high(s23)), a3));
+                    v_zip(v_lut(S1, xofs+dx), v_lut(S1+cn, xofs+dx), s01, s23);
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_expand_low(s01)), a0));
+                    v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(v_expand_high(s01)), a1));
+                    v_store(&D1[dx+8], v_dotprod(v_reinterpret_as_s16(v_expand_low(s23)), a2));
+                    v_store(&D1[dx+12], v_dotprod(v_reinterpret_as_s16(v_expand_high(s23)), a3));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 a0 = v_load(alpha+dx*2);
+                    v_int16x8 a1 = v_load(alpha+dx*2 + 8);
+                    v_int16x8 a2 = v_load(alpha+dx*2 + 16);
+                    v_int16x8 a3 = v_load(alpha+dx*2 + 24);
+                    v_uint8x16 s01, s23;
+                    v_zip(v_lut(S, xofs+dx), v_lut(S+cn, xofs+dx), s01, s23);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_expand_low(s01)), a0));
+                    v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(v_expand_high(s01)), a1));
+                    v_store(&D[dx+8], v_dotprod(v_reinterpret_as_s16(v_expand_low(s23)), a2));
+                    v_store(&D[dx+12], v_dotprod(v_reinterpret_as_s16(v_expand_high(s23)), a3));
+                }
+            }
+        }
+        return dx;
+    }
+};
+
+typedef HResizeLinearVec_X4<float, float, float, v_float32x4> HResizeLinearVec_32f;
+typedef HResizeLinearVec_X4<ushort, float, float, v_float32x4> HResizeLinearVec_16u32f;
+typedef HResizeLinearVec_X4<short, float, float, v_float32x4> HResizeLinearVec_16s32f;
+typedef HResizeLinearVecU8_X4 HResizeLinearVec_8u32s;
+
+#else
+
 typedef HResizeNoVec HResizeLinearVec_8u32s;
 typedef HResizeNoVec HResizeLinearVec_16u32f;
 typedef HResizeNoVec HResizeLinearVec_16s32f;
 typedef HResizeNoVec HResizeLinearVec_32f;
+
+#endif
+
 typedef HResizeNoVec HResizeLinearVec_64f;
 
@@ -1505,7 +1815,7 @@ struct HResizeLinear
         int dx0 = vecOp((const uchar**)src, (uchar**)dst, count,
             xofs, (const uchar*)alpha, swidth, dwidth, cn, xmin, xmax );
 
-        for( k = 0; k <= count - 2; k++ )
+        for( k = 0; k <= count - 2; k+=2 )
         {
             const T *S0 = src[k], *S1 = src[k+1];
             WT *D0 = dst[k], *D1 = dst[k+1];
@@ -1529,7 +1839,7 @@
         {
             const T *S = src[k];
             WT *D = dst[k];
-            for( dx = 0; dx < xmax; dx++ )
+            for( dx = dx0; dx < xmax; dx++ )
             {
                 int sx = xofs[dx];
                 D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1];
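
Note on the last two hunks: the two-row loop body consumes src[k] and src[k+1], so stepping
k by one recomputed every interior row twice; stepping by two removes that duplicated work.
Likewise, the scalar tail loop previously restarted at dx = 0 and redid columns the
vectorized vecOp had already produced, so it now resumes at dx0, the value vecOp returns.
A minimal scalar model of that handoff contract (hypothetical names, single row,
illustration only, not the real OpenCV code):

    // Stand-in for vecOp (e.g. HResizeLinearVec_8u32s): returns 'dx0', the first
    // output column it did NOT produce; the scalar tail resumes there.
    static int hresize_vec_stub(const float*, float*, const int*, const float*, int)
    {
        return 0; // pretend no columns were vectorized
    }

    static void hresize_row(const float* S, float* D, const int* xofs,
                            const float* alpha, int xmax, int cn)
    {
        int dx0 = hresize_vec_stub(S, D, xofs, alpha, xmax);
        for (int dx = dx0; dx < xmax; dx++) // resume where the vector pass stopped
        {
            int sx = xofs[dx];
            // two-tap linear filter: blend each source pixel with its right neighbour
            D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1];
        }
    }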