diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp index 683e4dee5c..5436a78ab5 100644 --- a/modules/imgproc/src/resize.cpp +++ b/modules/imgproc/src/resize.cpp @@ -1181,583 +1181,34 @@ struct HResizeNoVec const uchar*, int, int, int, int, int) const { return 0; } }; -#if CV_SSE2 +#if CV_SIMD struct VResizeLinearVec_32s8u { int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - const int** src = (const int**)_src; const short* beta = (const short*)_beta; const int *S0 = src[0], *S1 = src[1]; int x = 0; - __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]); - __m128i delta = _mm_set1_epi16(2); + v_int16 b0 = vx_setall_s16(beta[0]), b1 = vx_setall_s16(beta[1]); - if( (((size_t)S0|(size_t)S1)&15) == 0 ) - for( ; x <= width - 16; x += 16 ) - { - __m128i x0, x1, x2, y0, y1, y2; - x0 = _mm_load_si128((const __m128i*)(S0 + x)); - x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_load_si128((const __m128i*)(S1 + x)); - y1 = _mm_load_si128((const __m128i*)(S1 + x + 4)); - x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); - y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); - - x1 = _mm_load_si128((const __m128i*)(S0 + x + 8)); - x2 = _mm_load_si128((const __m128i*)(S0 + x + 12)); - y1 = _mm_load_si128((const __m128i*)(S1 + x + 8)); - y2 = _mm_load_si128((const __m128i*)(S1 + x + 12)); - x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); - y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); - - x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 )); - x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); - - x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); - x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); - _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); - } + if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 ) + for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) + v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load_aligned(S0 + x ) >> 4, vx_load_aligned(S0 + x + v_int32::nlanes) >> 4), b0) + + v_mul_hi(v_pack(vx_load_aligned(S1 + x ) >> 4, vx_load_aligned(S1 + x + v_int32::nlanes) >> 4), b1), + v_mul_hi(v_pack(vx_load_aligned(S0 + x + 2 * v_int32::nlanes) >> 4, vx_load_aligned(S0 + x + 3 * v_int32::nlanes) >> 4), b0) + + v_mul_hi(v_pack(vx_load_aligned(S1 + x + 2 * v_int32::nlanes) >> 4, vx_load_aligned(S1 + x + 3 * v_int32::nlanes) >> 4), b1))); else - for( ; x <= width - 16; x += 16 ) - { - __m128i x0, x1, x2, y0, y1, y2; - x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); - x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_loadu_si128((const __m128i*)(S1 + x)); - y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4)); - x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); - y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); - - x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8)); - x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12)); - y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8)); - y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12)); - x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); - y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); - - x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 )); - x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); - - x0 = 
_mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); - x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); - _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); - } - - for( ; x < width - 4; x += 4 ) - { - __m128i x0, y0; - x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4); - y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4); - x0 = _mm_packs_epi32(x0, x0); - y0 = _mm_packs_epi32(y0, y0); - x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1)); - x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); - x0 = _mm_packus_epi16(x0, x0); - *(int*)(dst + x) = _mm_cvtsi128_si32(x0); - } - - return x; - } -}; - - -template struct VResizeLinearVec_32f16 -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1]; - ushort* dst = (ushort*)_dst; - int x = 0; - - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); - __m128i preshift = _mm_set1_epi32(shiftval); - __m128i postshift = _mm_set1_epi16((short)shiftval); - - if( (((size_t)S0|(size_t)S1)&15) == 0 ) - for( ; x <= width - 16; x += 16 ) - { - __m128 x0, x1, y0, y1; - __m128i t0, t1, t2; - x0 = _mm_load_ps(S0 + x); - x1 = _mm_load_ps(S0 + x + 4); - y0 = _mm_load_ps(S1 + x); - y1 = _mm_load_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); - - x0 = _mm_load_ps(S0 + x + 8); - x1 = _mm_load_ps(S0 + x + 12); - y0 = _mm_load_ps(S1 + x + 8); - y1 = _mm_load_ps(S1 + x + 12); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); - - _mm_storeu_si128( (__m128i*)(dst + x), t0); - _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); - } - else - for( ; x <= width - 16; x += 16 ) - { - __m128 x0, x1, y0, y1; - __m128i t0, t1, t2; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); - - x0 = _mm_loadu_ps(S0 + x + 8); - x1 = _mm_loadu_ps(S0 + x + 12); - y0 = _mm_loadu_ps(S1 + x + 8); - y1 = _mm_loadu_ps(S1 + x + 12); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); - - _mm_storeu_si128( (__m128i*)(dst + x), t0); - _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); - } - - for( ; x < width - 4; x += 4 ) - { - __m128 x0, y0; - __m128i t0; - x0 = _mm_loadu_ps(S0 + x); - y0 = _mm_loadu_ps(S1 + x); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t0 = _mm_add_epi16(_mm_packs_epi32(t0, 
t0), postshift); - _mm_storel_epi64( (__m128i*)(dst + x), t0); - } - - return x; - } -}; - -typedef VResizeLinearVec_32f16 VResizeLinearVec_32f16u; -typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s; - -struct VResizeLinearVec_32f -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1]; - float* dst = (float*)_dst; - int x = 0; - - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); - - if( (((size_t)S0|(size_t)S1)&15) == 0 ) - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1; - x0 = _mm_load_ps(S0 + x); - x1 = _mm_load_ps(S0 + x + 4); - y0 = _mm_load_ps(S1 + x); - y1 = _mm_load_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - - _mm_storeu_ps( dst + x, x0); - _mm_storeu_ps( dst + x + 4, x1); - } - else - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - - _mm_storeu_ps( dst + x, x0); - _mm_storeu_ps( dst + x + 4, x1); - } - - return x; - } -}; - - -struct VResizeCubicVec_32s8u -{ - int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - const int** src = (const int**)_src; - const short* beta = (const short*)_beta; - const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - int x = 0; - float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE); - __m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale), - b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale); - - if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 ) - for( ; x <= width - 8; x += 8 ) - { - __m128i x0, x1, y0, y1; - __m128 s0, s1, f0, f1; - x0 = _mm_load_si128((const __m128i*)(S0 + x)); - x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_load_si128((const __m128i*)(S1 + x)); - y1 = _mm_load_si128((const __m128i*)(S1 + x + 4)); - - s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); - s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_load_si128((const __m128i*)(S2 + x)); - x1 = _mm_load_si128((const __m128i*)(S2 + x + 4)); - y0 = _mm_load_si128((const __m128i*)(S3 + x)); - y1 = _mm_load_si128((const __m128i*)(S3 + x + 4)); - - f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_cvtps_epi32(s0); - x1 = _mm_cvtps_epi32(s1); - - x0 = _mm_packs_epi32(x0, x1); - _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); - } - else - for( ; x <= width - 8; x += 8 ) - { - __m128i x0, x1, y0, y1; - __m128 s0, s1, f0, f1; - x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); - x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_loadu_si128((const __m128i*)(S1 + x)); - y1 = _mm_loadu_si128((const __m128i*)(S1 
+ x + 4)); - - s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); - s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_loadu_si128((const __m128i*)(S2 + x)); - x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4)); - y0 = _mm_loadu_si128((const __m128i*)(S3 + x)); - y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4)); - - f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_cvtps_epi32(s0); - x1 = _mm_cvtps_epi32(s1); - - x0 = _mm_packs_epi32(x0, x1); - _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); - } - - return x; - } -}; - - -template struct VResizeCubicVec_32f16 -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - ushort* dst = (ushort*)_dst; - int x = 0; - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), - b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); - __m128i preshift = _mm_set1_epi32(shiftval); - __m128i postshift = _mm_set1_epi16((short)shiftval); - - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1, s0, s1; - __m128i t0, t1; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); - - s0 = _mm_mul_ps(x0, b0); - s1 = _mm_mul_ps(x1, b0); - y0 = _mm_mul_ps(y0, b1); - y1 = _mm_mul_ps(y1, b1); - s0 = _mm_add_ps(s0, y0); - s1 = _mm_add_ps(s1, y1); - - x0 = _mm_loadu_ps(S2 + x); - x1 = _mm_loadu_ps(S2 + x + 4); - y0 = _mm_loadu_ps(S3 + x); - y1 = _mm_loadu_ps(S3 + x + 4); - - x0 = _mm_mul_ps(x0, b2); - x1 = _mm_mul_ps(x1, b2); - y0 = _mm_mul_ps(y0, b3); - y1 = _mm_mul_ps(y1, b3); - s0 = _mm_add_ps(s0, x0); - s1 = _mm_add_ps(s1, x1); - s0 = _mm_add_ps(s0, y0); - s1 = _mm_add_ps(s1, y1); - - t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift); - t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift); - - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift); - _mm_storeu_si128( (__m128i*)(dst + x), t0); - } - - return x; - } -}; - -typedef VResizeCubicVec_32f16 VResizeCubicVec_32f16u; -typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s; - -struct VResizeCubicVec_32f -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - float* dst = (float*)_dst; - int x = 0; - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), - b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); - - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1, s0, s1; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); - - s0 = _mm_mul_ps(x0, b0); - s1 = _mm_mul_ps(x1, b0); - y0 = _mm_mul_ps(y0, b1); - y1 = _mm_mul_ps(y1, b1); - s0 = _mm_add_ps(s0, y0); - s1 = _mm_add_ps(s1, y1); - - x0 = _mm_loadu_ps(S2 + x); - x1 = _mm_loadu_ps(S2 + x + 4); - y0 = 
_mm_loadu_ps(S3 + x); - y1 = _mm_loadu_ps(S3 + x + 4); - - x0 = _mm_mul_ps(x0, b2); - x1 = _mm_mul_ps(x1, b2); - y0 = _mm_mul_ps(y0, b3); - y1 = _mm_mul_ps(y1, b3); - s0 = _mm_add_ps(s0, x0); - s1 = _mm_add_ps(s1, x1); - s0 = _mm_add_ps(s0, y0); - s1 = _mm_add_ps(s1, y1); - - _mm_storeu_ps( dst + x, s0); - _mm_storeu_ps( dst + x + 4, s1); - } - - return x; - } -}; - -#if CV_TRY_SSE4_1 - -struct VResizeLanczos4Vec_32f16u -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::VResizeLanczos4Vec_32f16u_SSE41(_src, _dst, _beta, width); - else return 0; - } -}; - -#else - -typedef VResizeNoVec VResizeLanczos4Vec_32f16u; - -#endif - -struct VResizeLanczos4Vec_32f16s -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], - *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; - short * dst = (short*)_dst; - int x = 0; - __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), - v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), - v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), - v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); - - for( ; x <= width - 8; x += 8 ) - { - __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); - - __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4)); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4))); - - __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0); - __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dsti0, v_dsti1)); - } - - return x; - } -}; - - -struct VResizeLanczos4Vec_32f -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], - *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; - float* dst = (float*)_dst; - int x = 0; - - __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), - v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), - v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), - v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); - - for( ; x <= width - 4; x += 4 ) - { - __m128 v_dst = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); - v_dst = 
_mm_add_ps(v_dst, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); - - _mm_storeu_ps(dst + x, v_dst); - } - - return x; - } -}; - - -#elif CV_NEON - -struct VResizeLinearVec_32s8u -{ - int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const - { - const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1]; - const short* beta = (const short*)_beta; - int x = 0; - int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2); - - for( ; x <= width - 16; x += 16) - { - int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4); - int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4); - - int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01)); - int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11)); - - int16x8_t v_dst0 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1), - vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1)); - v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2); - - v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4); - v_src10 = vshrq_n_s32(vld1q_s32(S1 + x + 8), 4); - v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4); - v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4); - - v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01)); - v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11)); - - int16x8_t v_dst1 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1), - vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1)); - v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2); - - vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), vqmovun_s16(v_dst1))); - } + for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) + v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load(S0 + x ) >> 4, vx_load(S0 + x + v_int32::nlanes) >> 4), b0) + + v_mul_hi(v_pack(vx_load(S1 + x ) >> 4, vx_load(S1 + x + v_int32::nlanes) >> 4), b1), + v_mul_hi(v_pack(vx_load(S0 + x + 2 * v_int32::nlanes) >> 4, vx_load(S0 + x + 3 * v_int32::nlanes) >> 4), b0) + + v_mul_hi(v_pack(vx_load(S1 + x + 2 * v_int32::nlanes) >> 4, vx_load(S1 + x + 3 * v_int32::nlanes) >> 4), b1))); + + for( ; x < width - v_int16::nlanes; x += v_int16::nlanes) + v_rshr_pack_u_store<2>(dst + x, v_mul_hi(v_pack(vx_load(S0 + x) >> 4, vx_load(S0 + x + v_int32::nlanes) >> 4), b0) + + v_mul_hi(v_pack(vx_load(S1 + x) >> 4, vx_load(S1 + x + v_int32::nlanes) >> 4), b1)); return x; } @@ -1773,18 +1224,20 @@ struct VResizeLinearVec_32f16u ushort* dst = (ushort*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]); - for( ; x <= width - 8; x += 8 ) + if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 ) + for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load_aligned(S0 + x ), b0, vx_load_aligned(S1 + x ) * b1)), + v_round(v_muladd(vx_load_aligned(S0 + x + v_float32::nlanes), b0, vx_load_aligned(S1 + x + v_float32::nlanes) * b1)))); + else + for (; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, 
vx_load(S1 + x ) * b1)), + v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, vx_load(S1 + x + v_float32::nlanes) * b1)))); + for( ; x < width - v_float32::nlanes; x += v_float32::nlanes) { - float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); - float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); - - float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); - float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); - - vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); + v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1)); + v_store_low(dst + x, v_pack_u(t0, t0)); } return x; @@ -1801,18 +1254,20 @@ struct VResizeLinearVec_32f16s short* dst = (short*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]); - for( ; x <= width - 8; x += 8 ) + if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 ) + for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + v_store(dst + x, v_pack(v_round(v_muladd(vx_load_aligned(S0 + x ), b0, vx_load_aligned(S1 + x ) * b1)), + v_round(v_muladd(vx_load_aligned(S0 + x + v_float32::nlanes), b0, vx_load_aligned(S1 + x + v_float32::nlanes) * b1)))); + else + for (; x <= width - v_int16::nlanes; x += v_int16::nlanes) + v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, vx_load(S1 + x ) * b1)), + v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, vx_load(S1 + x + v_float32::nlanes) * b1)))); + for( ; x < width - v_float32::nlanes; x += v_float32::nlanes) { - float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); - float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); - - float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); - float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); - - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); + v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1)); + v_store_low(dst + x, v_pack(t0, t0)); } return x; @@ -1829,22 +1284,56 @@ struct VResizeLinearVec_32f float* dst = (float*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]); - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); - float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); - - vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1)); - vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1)); - } + if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 ) + for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + v_store(dst + x, v_muladd(vx_load_aligned(S0 + x), b0, vx_load_aligned(S1 + x) * b1)); + else + for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + v_store(dst + x, v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1)); return x; } }; -typedef VResizeNoVec VResizeCubicVec_32s8u; + +struct VResizeCubicVec_32s8u +{ + int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const + { + const int** src = (const int**)_src; + const short* beta = (const short*)_beta; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + int x = 0; + float scale = 
1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE); + + v_float32 b0 = vx_setall_f32(beta[0] * scale), b1 = vx_setall_f32(beta[1] * scale), + b2 = vx_setall_f32(beta[2] * scale), b3 = vx_setall_f32(beta[3] * scale); + + if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&(CV_SIMD_WIDTH - 1)) == 0 ) + for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + v_pack_u_store(dst + x, v_pack(v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x )), b0, + v_muladd(v_cvt_f32(vx_load_aligned(S1 + x )), b1, + v_muladd(v_cvt_f32(vx_load_aligned(S2 + x )), b2, + v_cvt_f32(vx_load_aligned(S3 + x )) * b3)))), + v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x + v_float32::nlanes)), b0, + v_muladd(v_cvt_f32(vx_load_aligned(S1 + x + v_float32::nlanes)), b1, + v_muladd(v_cvt_f32(vx_load_aligned(S2 + x + v_float32::nlanes)), b2, + v_cvt_f32(vx_load_aligned(S3 + x + v_float32::nlanes)) * b3)))))); + else + for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + v_pack_u_store(dst + x, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + x )), b0, + v_muladd(v_cvt_f32(vx_load(S1 + x )), b1, + v_muladd(v_cvt_f32(vx_load(S2 + x )), b2, + v_cvt_f32(vx_load(S3 + x )) * b3)))), + v_round(v_muladd(v_cvt_f32(vx_load(S0 + x + v_float32::nlanes)), b0, + v_muladd(v_cvt_f32(vx_load(S1 + x + v_float32::nlanes)), b1, + v_muladd(v_cvt_f32(vx_load(S2 + x + v_float32::nlanes)), b2, + v_cvt_f32(vx_load(S3 + x + v_float32::nlanes)) * b3)))))); + return x; + } +}; struct VResizeCubicVec_32f16u { @@ -1855,23 +1344,18 @@ struct VResizeCubicVec_32f16u const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; ushort* dst = (ushort*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), + b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]); - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4)); - - vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); - } + for (; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, + v_muladd(vx_load(S1 + x ), b1, + v_muladd(vx_load(S2 + x ), b2, + vx_load(S3 + x ) * b3)))), + v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, + v_muladd(vx_load(S1 + x + v_float32::nlanes), b1, + v_muladd(vx_load(S2 + x + v_float32::nlanes), b2, + vx_load(S3 + x + v_float32::nlanes) * b3)))))); return x; } @@ -1886,23 +1370,18 @@ struct VResizeCubicVec_32f16s const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; short* dst = (short*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), + b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]); - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = 
vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4)); - - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); - } + for (; x <= width - v_int16::nlanes; x += v_int16::nlanes) + v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, + v_muladd(vx_load(S1 + x ), b1, + v_muladd(vx_load(S2 + x ), b2, + vx_load(S3 + x ) * b3)))), + v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, + v_muladd(vx_load(S1 + x + v_float32::nlanes), b1, + v_muladd(vx_load(S2 + x + v_float32::nlanes), b2, + vx_load(S3 + x + v_float32::nlanes) * b3)))))); return x; } @@ -1917,25 +1396,33 @@ struct VResizeCubicVec_32f const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; float* dst = (float*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), + b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]); - for( ; x <= width - 8; x += 8 ) - { - vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x))); - vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4))); - } + for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + v_store(dst + x, v_muladd(vx_load(S0 + x), b0, + v_muladd(vx_load(S1 + x), b1, + v_muladd(vx_load(S2 + x), b2, + vx_load(S3 + x) * b3)))); return x; } }; + +#if CV_TRY_SSE4_1 + +struct VResizeLanczos4Vec_32f16u +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::VResizeLanczos4Vec_32f16u_SSE41(_src, _dst, _beta, width); + else return 0; + } +}; + +#else + struct VResizeLanczos4Vec_32f16u { int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const @@ -1946,41 +1433,35 @@ struct VResizeLanczos4Vec_32f16u *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; ushort * dst = (ushort*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), - v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), - v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), + b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]), + b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]), + b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]); - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), - v_b5, vld1q_f32(S5 + x)), - v_b6, vld1q_f32(S6 + x)), - v_b7, vld1q_f32(S7 + x)); - float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1); - - v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4)); - v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)), - 
v_b5, vld1q_f32(S5 + x + 4)), - v_b6, vld1q_f32(S6 + x + 4)), - v_b7, vld1q_f32(S7 + x + 4)); - v_dst1 = vaddq_f32(v_dst0, v_dst1); - - vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); - } + for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, + v_muladd(vx_load(S1 + x ), b1, + v_muladd(vx_load(S2 + x ), b2, + v_muladd(vx_load(S3 + x ), b3, + v_muladd(vx_load(S4 + x ), b4, + v_muladd(vx_load(S5 + x ), b5, + v_muladd(vx_load(S6 + x ), b6, + vx_load(S7 + x ) * b7)))))))), + v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, + v_muladd(vx_load(S1 + x + v_float32::nlanes), b1, + v_muladd(vx_load(S2 + x + v_float32::nlanes), b2, + v_muladd(vx_load(S3 + x + v_float32::nlanes), b3, + v_muladd(vx_load(S4 + x + v_float32::nlanes), b4, + v_muladd(vx_load(S5 + x + v_float32::nlanes), b5, + v_muladd(vx_load(S6 + x + v_float32::nlanes), b6, + vx_load(S7 + x + v_float32::nlanes) * b7)))))))))); return x; } }; +#endif + struct VResizeLanczos4Vec_32f16s { int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const @@ -1991,36 +1472,28 @@ struct VResizeLanczos4Vec_32f16s *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; short * dst = (short*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), - v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), - v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), + b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]), + b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]), + b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]); - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), - v_b5, vld1q_f32(S5 + x)), - v_b6, vld1q_f32(S6 + x)), - v_b7, vld1q_f32(S7 + x)); - float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1); - - v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4)); - v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)), - v_b5, vld1q_f32(S5 + x + 4)), - v_b6, vld1q_f32(S6 + x + 4)), - v_b7, vld1q_f32(S7 + x + 4)); - v_dst1 = vaddq_f32(v_dst0, v_dst1); - - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); - } + for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, + v_muladd(vx_load(S1 + x ), b1, + v_muladd(vx_load(S2 + x ), b2, + v_muladd(vx_load(S3 + x ), b3, + v_muladd(vx_load(S4 + x ), b4, + v_muladd(vx_load(S5 + x ), b5, + v_muladd(vx_load(S6 + x ), b6, + vx_load(S7 + x ) * b7)))))))), + v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, + v_muladd(vx_load(S1 + x + v_float32::nlanes), b1, + v_muladd(vx_load(S2 + x + v_float32::nlanes), b2, + v_muladd(vx_load(S3 + x + v_float32::nlanes), b3, + v_muladd(vx_load(S4 + x + v_float32::nlanes), b4, + v_muladd(vx_load(S5 + x + v_float32::nlanes), b5, + v_muladd(vx_load(S6 + x + v_float32::nlanes), b6, + vx_load(S7 + x + v_float32::nlanes) * 
b7)))))))))); return x; } @@ -2036,23 +1509,21 @@ struct VResizeLanczos4Vec_32f *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; float* dst = (float*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), - v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), - v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); - for( ; x <= width - 4; x += 4 ) - { - float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), - v_b5, vld1q_f32(S5 + x)), - v_b6, vld1q_f32(S6 + x)), - v_b7, vld1q_f32(S7 + x)); - vst1q_f32(dst + x, vaddq_f32(v_dst0, v_dst1)); - } + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), + b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]), + b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]), + b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]); + + for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + v_store(dst + x, v_muladd(vx_load(S0 + x), b0, + v_muladd(vx_load(S1 + x), b1, + v_muladd(vx_load(S2 + x), b2, + v_muladd(vx_load(S3 + x), b3, + v_muladd(vx_load(S4 + x), b4, + v_muladd(vx_load(S5 + x), b5, + v_muladd(vx_load(S6 + x), b6, + vx_load(S7 + x) * b7)))))))); return x; } @@ -2695,95 +2166,94 @@ private: int step; }; -#elif CV_SSE2 +#elif CV_SIMD class ResizeAreaFastVec_SIMD_8u { public: ResizeAreaFastVec_SIMD_8u(int _cn, int _step) : - cn(_cn), step(_step) - { - use_simd = checkHardwareSupport(CV_CPU_SSE2); - } + cn(_cn), step(_step) {} int operator() (const uchar* S, uchar* D, int w) const { - if (!use_simd) - return 0; - int dx = 0; const uchar* S0 = S; const uchar* S1 = S0 + step; - __m128i zero = _mm_setzero_si128(); - __m128i delta2 = _mm_set1_epi16(2); if (cn == 1) { - __m128i masklow = _mm_set1_epi16(0x00ff); - for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + v_uint16 masklow = vx_setall_u16(0x00ff); + for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += v_uint8::nlanes, S1 += v_uint8::nlanes, D += v_uint16::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow)); - __m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow)); - s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2); - s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); - - _mm_storel_epi64((__m128i*)D, s0); + v_uint16 r0 = v_reinterpret_as_u16(vx_load(S0)); + v_uint16 r1 = v_reinterpret_as_u16(vx_load(S1)); + v_rshr_pack_store<2>(D, (r0 >> 8) + (r0 & masklow) + (r1 >> 8) + (r1 & masklow)); } } else if (cn == 3) - for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6) + { + if (CV_SIMD_WIDTH > 64) + return 0; + for ( ; dx <= w - 3*v_uint8::nlanes; dx += 3*v_uint8::nlanes, S0 += 6*v_uint8::nlanes, S1 += 6*v_uint8::nlanes, D += 3*v_uint8::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i r0_16l = _mm_unpacklo_epi8(r0, zero); - __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero); - __m128i r1_16l = _mm_unpacklo_epi8(r1, zero); - __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero); - - __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6)); - __m128i s1 = 
_mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6)); - s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); - s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); - - s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6)); - s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6)); - s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); - s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); - _mm_storel_epi64((__m128i*)(D+3), s0); + v_uint16 t0, t1, t2, t3, t4, t5; + v_uint16 s0, s1, s2, s3, s4, s5; + s0 = vx_load_expand(S0 ) + vx_load_expand(S1 ); + s1 = vx_load_expand(S0 + v_uint16::nlanes) + vx_load_expand(S1 + v_uint16::nlanes); + s2 = vx_load_expand(S0 + 2*v_uint16::nlanes) + vx_load_expand(S1 + 2*v_uint16::nlanes); + s3 = vx_load_expand(S0 + 3*v_uint16::nlanes) + vx_load_expand(S1 + 3*v_uint16::nlanes); + s4 = vx_load_expand(S0 + 4*v_uint16::nlanes) + vx_load_expand(S1 + 4*v_uint16::nlanes); + s5 = vx_load_expand(S0 + 5*v_uint16::nlanes) + vx_load_expand(S1 + 5*v_uint16::nlanes); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_uint16 bl, gl, rl; +#if CV_SIMD_WIDTH == 16 + bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; +#elif CV_SIMD_WIDTH == 32 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + bl = s0 + s3; gl = s1 + s4; rl = s2 + s5; +#elif CV_SIMD_WIDTH == 64 + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; +#endif + s0 = vx_load_expand(S0 + 6*v_uint16::nlanes) + vx_load_expand(S1 + 6*v_uint16::nlanes); + s1 = vx_load_expand(S0 + 7*v_uint16::nlanes) + vx_load_expand(S1 + 7*v_uint16::nlanes); + s2 = vx_load_expand(S0 + 8*v_uint16::nlanes) + vx_load_expand(S1 + 8*v_uint16::nlanes); + s3 = vx_load_expand(S0 + 9*v_uint16::nlanes) + vx_load_expand(S1 + 9*v_uint16::nlanes); + s4 = vx_load_expand(S0 +10*v_uint16::nlanes) + vx_load_expand(S1 +10*v_uint16::nlanes); + s5 = vx_load_expand(S0 +11*v_uint16::nlanes) + vx_load_expand(S1 +11*v_uint16::nlanes); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_uint16 bh, gh, rh; +#if CV_SIMD_WIDTH == 16 + bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; +#elif CV_SIMD_WIDTH == 32 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + bh = s0 + s3; gh = s1 + s4; rh = s2 + s5; +#elif CV_SIMD_WIDTH == 64 + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; +#endif + v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh)); } + } else { CV_Assert(cn == 4); - int v[] = { 0, 0, -1, -1 }; - __m128i mask = _mm_loadu_si128((const __m128i*)v); - - for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + for ( ; dx <= w - v_uint8::nlanes; dx += v_uint8::nlanes, S0 += 2*v_uint8::nlanes, S1 += 2*v_uint8::nlanes, D += v_uint8::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + v_uint32 r00, r01, r10, r11; + v_load_deinterleave((uint32_t*)S0, r00, r01); + v_load_deinterleave((uint32_t*)S1, r10, r11); - __m128i r0_16l = _mm_unpacklo_epi8(r0, zero); - __m128i r0_16h = _mm_unpackhi_epi8(r0, zero); - __m128i r1_16l = _mm_unpacklo_epi8(r1, zero); - __m128i 
r1_16h = _mm_unpackhi_epi8(r1, zero); - - __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8)); - __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8)); - s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); - __m128i res0 = _mm_srli_epi16(s0, 2); - - s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8)); - s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8)); - s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); - __m128i res1 = _mm_srli_epi16(s0, 2); - s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0), - _mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero); - _mm_storel_epi64((__m128i*)(D), s0); + v_uint16 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h; + v_expand(v_reinterpret_as_u8(r00), r00l, r00h); + v_expand(v_reinterpret_as_u8(r01), r01l, r01h); + v_expand(v_reinterpret_as_u8(r10), r10l, r10h); + v_expand(v_reinterpret_as_u8(r11), r11l, r11h); + v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h)); } } @@ -2792,7 +2262,6 @@ public: private: int cn; - bool use_simd; int step; }; @@ -2800,164 +2269,258 @@ class ResizeAreaFastVec_SIMD_16u { public: ResizeAreaFastVec_SIMD_16u(int _cn, int _step) : - cn(_cn), step(_step) - { - use_simd = checkHardwareSupport(CV_CPU_SSE2); - } + cn(_cn), step(_step) {} int operator() (const ushort* S, ushort* D, int w) const { - if (!use_simd) - return 0; - int dx = 0; const ushort* S0 = (const ushort*)S; const ushort* S1 = (const ushort*)((const uchar*)(S) + step); - __m128i masklow = _mm_set1_epi32(0x0000ffff); - __m128i zero = _mm_setzero_si128(); - __m128i delta2 = _mm_set1_epi32(2); - -#define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero) if (cn == 1) { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + v_uint32 masklow = vx_setall_u32(0x0000ffff); + for (; dx <= w - v_uint32::nlanes; dx += v_uint32::nlanes, S0 += v_uint16::nlanes, S1 += v_uint16::nlanes, D += v_uint32::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow)); - __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow)); - s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2); - s0 = _mm_srli_epi32(s0, 2); - s0 = _mm_packus_epi32(s0, zero); - - _mm_storel_epi64((__m128i*)D, s0); + v_uint32 r0 = v_reinterpret_as_u32(vx_load(S0)); + v_uint32 r1 = v_reinterpret_as_u32(vx_load(S1)); + v_rshr_pack_store<2>(D, (r0 >> 16) + (r0 & masklow) + (r1 >> 16) + (r1 & masklow)); } } else if (cn == 3) + { +#if CV_SIMD_WIDTH == 16 for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3) +#if CV_SSE4_1 { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i r0_16l = _mm_unpacklo_epi16(r0, zero); - __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero); - __m128i r1_16l = _mm_unpacklo_epi16(r1, zero); - __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero); - - __m128i s0 = _mm_add_epi32(r0_16l, r0_16h); - __m128i s1 = _mm_add_epi32(r1_16l, r1_16h); - s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1)); - s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); + v_uint32 r0, r1, r2, r3; + v_expand(vx_load(S0), r0, r1); + v_expand(vx_load(S1), r2, r3); + r0 += r2; r1 += r3; + v_rshr_pack_store<2>(D, r0 + v_rotate_left<1>(r1, r0)); } +#else + v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + 
v_load_expand(S1 + 3)); +#endif +#elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64 + for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes) + { + v_uint32 t0, t1, t2, t3, t4, t5; + v_uint32 s0, s1, s2, s3, s4, s5; + s0 = vx_load_expand(S0 ) + vx_load_expand(S1 ); + s1 = vx_load_expand(S0 + v_uint32::nlanes) + vx_load_expand(S1 + v_uint32::nlanes); + s2 = vx_load_expand(S0 + 2*v_uint32::nlanes) + vx_load_expand(S1 + 2*v_uint32::nlanes); + s3 = vx_load_expand(S0 + 3*v_uint32::nlanes) + vx_load_expand(S1 + 3*v_uint32::nlanes); + s4 = vx_load_expand(S0 + 4*v_uint32::nlanes) + vx_load_expand(S1 + 4*v_uint32::nlanes); + s5 = vx_load_expand(S0 + 5*v_uint32::nlanes) + vx_load_expand(S1 + 5*v_uint32::nlanes); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + v_uint32 bl, gl, rl; + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); +#if CV_SIMD_WIDTH == 32 + bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; +#else //CV_SIMD_WIDTH == 64 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + bl = s0 + s3; gl = s1 + s4; rl = s2 + s5; +#endif + s0 = vx_load_expand(S0 + 6*v_uint32::nlanes) + vx_load_expand(S1 + 6*v_uint32::nlanes); + s1 = vx_load_expand(S0 + 7*v_uint32::nlanes) + vx_load_expand(S1 + 7*v_uint32::nlanes); + s2 = vx_load_expand(S0 + 8*v_uint32::nlanes) + vx_load_expand(S1 + 8*v_uint32::nlanes); + s3 = vx_load_expand(S0 + 9*v_uint32::nlanes) + vx_load_expand(S1 + 9*v_uint32::nlanes); + s4 = vx_load_expand(S0 +10*v_uint32::nlanes) + vx_load_expand(S1 +10*v_uint32::nlanes); + s5 = vx_load_expand(S0 +11*v_uint32::nlanes) + vx_load_expand(S1 +11*v_uint32::nlanes); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + v_uint32 bh, gh, rh; + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); +#if CV_SIMD_WIDTH == 32 + bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; +#else //CV_SIMD_WIDTH == 64 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + bh = s0 + s3; gh = s1 + s4; rh = s2 + s5; +#endif + v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh)); + } +#elif CV_SIMD_WIDTH >= 64 + v_uint32 masklow = vx_setall_u32(0x0000ffff); + for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes) + { + v_uint16 b0, g0, r0, b1, g1, r1; + v_load_deinterleave(S0, b0, g0, r0); + v_load_deinterleave(S1, b1, g1, r1); + v_uint32 bl = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow); + v_uint32 gl = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow); + v_uint32 rl = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow); + v_load_deinterleave(S0 + 3*v_uint16::nlanes, b0, g0, r0); + v_load_deinterleave(S1 + 3*v_uint16::nlanes, b1, g1, r1); + v_uint32 bh = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow); + v_uint32 gh = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + 
(v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow); + v_uint32 rh = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow); + v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh)); + } +#endif + } else { CV_Assert(cn == 4); - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) +#if CV_SIMD_WIDTH >= 64 + for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += 2*v_uint16::nlanes, S1 += 2*v_uint16::nlanes, D += v_uint16::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + v_uint64 r00, r01, r10, r11; + v_load_deinterleave((uint64_t*)S0, r00, r01); + v_load_deinterleave((uint64_t*)S1, r10, r11); - __m128i r0_32l = _mm_unpacklo_epi16(r0, zero); - __m128i r0_32h = _mm_unpackhi_epi16(r0, zero); - __m128i r1_32l = _mm_unpacklo_epi16(r1, zero); - __m128i r1_32h = _mm_unpackhi_epi16(r1, zero); - - __m128i s0 = _mm_add_epi32(r0_32l, r0_32h); - __m128i s1 = _mm_add_epi32(r1_32l, r1_32h); - s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2)); - s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); + v_uint32 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h; + v_expand(v_reinterpret_as_u16(r00), r00l, r00h); + v_expand(v_reinterpret_as_u16(r01), r01l, r01h); + v_expand(v_reinterpret_as_u16(r10), r10l, r10h); + v_expand(v_reinterpret_as_u16(r11), r11l, r11h); + v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h)); } +#else + for ( ; dx <= w - v_uint32::nlanes; dx += v_uint32::nlanes, S0 += v_uint16::nlanes, S1 += v_uint16::nlanes, D += v_uint32::nlanes) + { + v_uint32 r0, r1, r2, r3; + v_expand(vx_load(S0), r0, r1); + v_expand(vx_load(S1), r2, r3); + r0 += r2; r1 += r3; + v_uint32 v_d; +#if CV_SIMD_WIDTH == 16 + v_d = r0 + r1; +#elif CV_SIMD_WIDTH == 32 + v_uint32 t0, t1; + v_recombine(r0, r1, t0, t1); + v_d = t0 + t1; +#endif + v_rshr_pack_store<2>(D, v_d); + } +#endif } -#undef _mm_packus_epi32 - return dx; } private: int cn; int step; - bool use_simd; }; class ResizeAreaFastVec_SIMD_16s { public: ResizeAreaFastVec_SIMD_16s(int _cn, int _step) : - cn(_cn), step(_step) - { - use_simd = checkHardwareSupport(CV_CPU_SSE2); - } + cn(_cn), step(_step) {} int operator() (const short* S, short* D, int w) const { - if (!use_simd) - return 0; - int dx = 0; const short* S0 = (const short*)S; const short* S1 = (const short*)((const uchar*)(S) + step); - __m128i masklow = _mm_set1_epi32(0x0000ffff); - __m128i zero = _mm_setzero_si128(); - __m128i delta2 = _mm_set1_epi32(2); if (cn == 1) { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + v_int32 masklow = vx_setall_s32(0x0000ffff); + for (; dx <= w - v_int32::nlanes; dx += v_int32::nlanes, S0 += v_int16::nlanes, S1 += v_int16::nlanes, D += v_int32::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16), - _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16)); - __m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16), - _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16)); - s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2); - s0 = _mm_srai_epi32(s0, 2); - s0 = _mm_packs_epi32(s0, zero); - - _mm_storel_epi64((__m128i*)D, s0); + v_int32 r0 = v_reinterpret_as_s32(vx_load(S0)); + v_int32 r1 = v_reinterpret_as_s32(vx_load(S1)); + 
v_rshr_pack_store<2>(D, (r0 >> 16) + (((r0 & masklow)<<16)>>16) + (r1 >> 16) + (((r1 & masklow)<<16)>>16)); } } else if (cn == 3) + { +#if CV_SIMD_WIDTH == 16 for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3) + v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3)); +#elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64 + for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); - __m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16); - __m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); - __m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16); - - __m128i s0 = _mm_add_epi32(r0_16l, r0_16h); - __m128i s1 = _mm_add_epi32(r1_16l, r1_16h); - s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1)); - s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); + v_int32 t0, t1, t2, t3, t4, t5; + v_int32 s0, s1, s2, s3, s4, s5; + s0 = vx_load_expand(S0 ) + vx_load_expand(S1 ); + s1 = vx_load_expand(S0 + v_int32::nlanes) + vx_load_expand(S1 + v_int32::nlanes); + s2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes); + s3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes); + s4 = vx_load_expand(S0 + 4*v_int32::nlanes) + vx_load_expand(S1 + 4*v_int32::nlanes); + s5 = vx_load_expand(S0 + 5*v_int32::nlanes) + vx_load_expand(S1 + 5*v_int32::nlanes); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + v_int32 bl, gl, rl; + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); +#if CV_SIMD_WIDTH == 32 + bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; +#else //CV_SIMD_WIDTH == 64 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + bl = s0 + s3; gl = s1 + s4; rl = s2 + s5; +#endif + s0 = vx_load_expand(S0 + 6*v_int32::nlanes) + vx_load_expand(S1 + 6*v_int32::nlanes); + s1 = vx_load_expand(S0 + 7*v_int32::nlanes) + vx_load_expand(S1 + 7*v_int32::nlanes); + s2 = vx_load_expand(S0 + 8*v_int32::nlanes) + vx_load_expand(S1 + 8*v_int32::nlanes); + s3 = vx_load_expand(S0 + 9*v_int32::nlanes) + vx_load_expand(S1 + 9*v_int32::nlanes); + s4 = vx_load_expand(S0 +10*v_int32::nlanes) + vx_load_expand(S1 +10*v_int32::nlanes); + s5 = vx_load_expand(S0 +11*v_int32::nlanes) + vx_load_expand(S1 +11*v_int32::nlanes); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + v_int32 bh, gh, rh; + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); +#if CV_SIMD_WIDTH == 32 + bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; +#else //CV_SIMD_WIDTH == 64 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + bh = s0 + s3; gh = s1 + s4; rh = s2 + s5; +#endif + v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh)); } +#elif CV_SIMD_WIDTH >= 64 + for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes) + { + v_int16 b0, g0, r0, b1, g1, r1; + v_load_deinterleave(S0, b0, g0, r0); + v_load_deinterleave(S1, b1, g1, r1); + 
v_int32 bl = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16); + v_int32 gl = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16); + v_int32 rl = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16); + v_load_deinterleave(S0 + 3*v_int16::nlanes, b0, g0, r0); + v_load_deinterleave(S1 + 3*v_int16::nlanes, b1, g1, r1); + v_int32 bh = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16); + v_int32 gh = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16); + v_int32 rh = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16); + v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh)); + } +#endif + } else { CV_Assert(cn == 4); - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + for (; dx <= w - v_int16::nlanes; dx += v_int16::nlanes, S0 += 2 * v_int16::nlanes, S1 += 2 * v_int16::nlanes, D += v_int16::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); +#if CV_SIMD_WIDTH >= 64 + v_int64 r00, r01, r10, r11; + v_load_deinterleave((int64_t*)S0, r00, r01); + v_load_deinterleave((int64_t*)S1, r10, r11); - __m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); - __m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16); - __m128i r1_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); - __m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16); - - __m128i s0 = _mm_add_epi32(r0_32l, r0_32h); - __m128i s1 = _mm_add_epi32(r1_32l, r1_32h); - s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2)); - s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); + v_int32 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h; + v_expand(v_reinterpret_as_s16(r00), r00l, r00h); + v_expand(v_reinterpret_as_s16(r01), r01l, r01h); + v_expand(v_reinterpret_as_s16(r10), r10l, r10h); + v_expand(v_reinterpret_as_s16(r11), r11l, r11h); + v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h)); +#else + v_int32 r0, r1, r2, r3; + r0 = vx_load_expand(S0 ) + vx_load_expand(S1 ); + r1 = vx_load_expand(S0 + v_int32::nlanes) + vx_load_expand(S1 + v_int32::nlanes); + r2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes); + r3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes); + v_int32 dl, dh; +#if CV_SIMD_WIDTH == 16 + dl = r0 + r1; dh = r2 + r3; +#elif CV_SIMD_WIDTH == 32 + v_int32 t0, t1, t2, t3; + v_recombine(r0, r1, t0, t1); v_recombine(r2, r3, t2, t3); + dl = t0 + t1; dh = t2 + t3; +#endif + v_store(D, v_rshr_pack<2>(dl, dh)); +#endif } } @@ -2967,7 +2530,6 @@ public: private: int cn; int step; - bool use_simd; }; struct ResizeAreaFastVec_SIMD_32f @@ -2976,7 +2538,6 @@ struct ResizeAreaFastVec_SIMD_32f cn(_cn), step(_step) { fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4); - fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2); } int operator() (const float * S, 
float * D, int w) const @@ -2987,33 +2548,32 @@ struct ResizeAreaFastVec_SIMD_32f const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step); int dx = 0; - __m128 v_025 = _mm_set1_ps(0.25f); - if (cn == 1) { - const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1); - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + v_float32 v_025 = vx_setall_f32(0.25f); + for ( ; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes) { - __m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4), - v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4); - - __m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo), - _mm_shuffle_ps(v_row00, v_row01, shuffle_hi)); - __m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo), - _mm_shuffle_ps(v_row10, v_row11, shuffle_hi)); - - _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); + v_float32 v_row00, v_row01, v_row10, v_row11; + v_load_deinterleave(S0, v_row00, v_row01); + v_load_deinterleave(S1, v_row10, v_row11); + v_store(D, ((v_row00 + v_row01) + (v_row10 + v_row11)) * v_025); } } else if (cn == 4) { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) +#if CV_SIMD_WIDTH == 16 + v_float32 v_025 = vx_setall_f32(0.25f); + for (; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes) + v_store(D, ((vx_load(S0) + vx_load(S0 + v_float32::nlanes)) + (vx_load(S1) + vx_load(S1 + v_float32::nlanes))) * v_025); +#elif CV_SIMD256 + v_float32x8 v_025 = v256_setall_f32(0.25f); + for (; dx <= w - v_float32x8::nlanes; dx += v_float32x8::nlanes, S0 += 2*v_float32x8::nlanes, S1 += 2*v_float32x8::nlanes, D += v_float32x8::nlanes) { - __m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4)); - __m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4)); - - _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); + v_float32x8 dst0, dst1; + v_recombine(v256_load(S0) + v256_load(S1), v256_load(S0 + v_float32x8::nlanes) + v256_load(S1 + v_float32x8::nlanes), dst0, dst1); + v_store(D, (dst0 + dst1) * v_025); } +#endif } return dx;
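Note on the conversion pattern (not part of the patch itself): the separate CV_SSE2 and CV_NEON kernels are folded into single CV_SIMD implementations that are width-agnostic. Coefficients are broadcast with vx_setall_*, rows are loaded with vx_load (or vx_load_aligned when the source pointers pass the `(((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0` alignment test), multiply-accumulate chains use v_muladd, results are narrowed with v_pack / v_pack_u / v_rshr_pack*, and loop strides are expressed in v_xxx::nlanes so the same source compiles to 128-, 256- or 512-bit code. A minimal, self-contained sketch of the float vertical-linear kernel written in this style follows; the function name and the scalar tail are illustrative additions, not code from the patch.

#include "opencv2/core/hal/intrin.hpp"

// Vertical linear interpolation of two float rows:
//   dst[x] = S0[x]*beta[0] + S1[x]*beta[1]
// Mirrors VResizeLinearVec_32f above, with an explicit scalar tail added for completeness.
static void vresize_linear_32f_sketch(const float* S0, const float* S1,
                                      const float beta[2], float* dst, int width)
{
    int x = 0;
#if CV_SIMD
    v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]);
    for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes )
        v_store(dst + x, v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1));
#endif
    for( ; x < width; x++ )                      // scalar tail for the remaining pixels
        dst[x] = S0[x]*beta[0] + S1[x]*beta[1];
}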
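The least obvious part of the new code is the fixed-point 8-bit path in VResizeLinearVec_32s8u. The horizontal pass leaves int32 row sums scaled by INTER_RESIZE_COEF_SCALE (2^11), and beta holds int16 weights also scaled by 2^11, so the vertical pass must shift the products right by 22 bits in total: the rows are pre-shifted by 4 so they fit into int16 after v_pack, v_mul_hi contributes a 16-bit shift, and v_rshr_pack_u<2> adds the rounding constant and the final 2-bit shift (4 + 16 + 2 = 22 = 2*INTER_RESIZE_COEF_BITS), exactly as the old _mm_mulhi_epi16 / _mm_adds_epi16(delta) / _mm_srai_epi16 sequence did. A scalar sketch of what one output pixel computes (an assumed helper for illustration, not code from the patch):

#include "opencv2/core.hpp"   // cv::saturate_cast

// s0, s1: int32 row sums from the horizontal pass (scaled by 2^11);
// b0, b1: int16 vertical weights (scaled by 2^11).
static inline unsigned char vresize_linear_8u_pixel(int s0, int s1, short b0, short b1)
{
    short x0 = (short)(s0 >> 4), x1 = (short)(s1 >> 4);     // matches "vx_load(S) >> 4" + v_pack
    int   t  = ((x0 * b0) >> 16) + ((x1 * b1) >> 16);       // matches v_mul_hi + addition
    return cv::saturate_cast<unsigned char>((t + 2) >> 2);  // matches v_rshr_pack_u<2>
}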
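ResizeAreaFastVec_SIMD_* handles the 2x decimation case by averaging each 2x2 block: cn == 1 sums horizontal pairs via masking or deinterleaving, cn == 3 uses the v_zip networks above (specialized per CV_SIMD_WIDTH) to keep the channels separated, and cn == 4 expands and adds neighbouring pixels before a rounding pack. The float single-channel case reduces to the sketch below, equivalent to the cn == 1 branch of ResizeAreaFastVec_SIMD_32f; the function name and the scalar tail are illustrative additions.

#include "opencv2/core/hal/intrin.hpp"

// Average every 2x2 block of two source rows (S0, S1) into one output row D of width w.
static void area_fast_2x2_32f_sketch(const float* S0, const float* S1, float* D, int w)
{
    int dx = 0;
#if CV_SIMD
    v_float32 v_025 = vx_setall_f32(0.25f);
    for( ; dx <= w - v_float32::nlanes; dx += v_float32::nlanes,
           S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes )
    {
        v_float32 r00, r01, r10, r11;
        v_load_deinterleave(S0, r00, r01);   // even / odd columns of the top row
        v_load_deinterleave(S1, r10, r11);   // even / odd columns of the bottom row
        v_store(D, ((r00 + r01) + (r10 + r11)) * v_025);
    }
#endif
    for( ; dx < w; dx++, S0 += 2, S1 += 2, D++ )  // scalar tail
        *D = (S0[0] + S0[1] + S1[0] + S1[1]) * 0.25f;
}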