mirror of
https://github.com/opencv/opencv.git
synced 2025-01-18 22:44:02 +08:00
Merge pull request #3384 from ilya-lavrenov:neon_new
This commit is contained in:
commit
5efad375e0
@ -605,6 +605,48 @@ inline uint32x4_t cv_vrndq_u32_f32(float32x4_t v)
|
||||
return vcvtq_u32_f32(vaddq_f32(v, v_05));
|
||||
}
|
||||
|
||||
// Approximates the per-lane reciprocal of four packed floats.
// The initial value comes from the vrecpeq_f32 estimate; it is then refined
// twice by multiplying it with the vrecpsq_f32 correction term.
inline float32x4_t cv_vrecpq_f32(float32x4_t val)
{
    float32x4_t approx = vrecpeq_f32(val);

    // Two identical refinement passes over the current approximation.
    for (int pass = 0; pass < 2; ++pass)
        approx = vmulq_f32(vrecpsq_f32(val, approx), approx);

    return approx;
}
|
||||
|
||||
// Two-lane (64-bit register) counterpart of the reciprocal approximation:
// a vrecpe_f32 estimate followed by two vrecps_f32-based refinement steps.
inline float32x2_t cv_vrecp_f32(float32x2_t val)
{
    const float32x2_t estimate = vrecpe_f32(val);

    // First refinement: scale the estimate by its correction factor.
    const float32x2_t refined_once = vmul_f32(vrecps_f32(val, estimate), estimate);

    // Second refinement, computed from the already improved value.
    const float32x2_t refined_twice = vmul_f32(vrecps_f32(val, refined_once), refined_once);

    return refined_twice;
}
|
||||
|
||||
// Approximates the per-lane reciprocal square root of four packed floats.
// Starts from the vrsqrteq_f32 estimate and performs two refinement passes,
// each using vrsqrtsq_f32 on the squared approximation and the input.
inline float32x4_t cv_vrsqrtq_f32(float32x4_t val)
{
    float32x4_t approx = vrsqrteq_f32(val);

    for (int pass = 0; pass < 2; ++pass)
    {
        const float32x4_t approx_sq = vmulq_f32(approx, approx);
        approx = vmulq_f32(vrsqrtsq_f32(approx_sq, val), approx);
    }

    return approx;
}
|
||||
|
||||
// Two-lane counterpart of the reciprocal square root approximation:
// a vrsqrte_f32 estimate refined twice with vrsqrts_f32.
inline float32x2_t cv_vrsqrt_f32(float32x2_t val)
{
    const float32x2_t estimate = vrsqrte_f32(val);

    // First refinement step.
    const float32x2_t estimate_sq = vmul_f32(estimate, estimate);
    const float32x2_t refined_once = vmul_f32(vrsqrts_f32(estimate_sq, val), estimate);

    // Second refinement step, based on the improved value.
    const float32x2_t refined_sq = vmul_f32(refined_once, refined_once);
    const float32x2_t refined_twice = vmul_f32(vrsqrts_f32(refined_sq, val), refined_once);

    return refined_twice;
}
|
||||
|
||||
// Approximates the per-lane square root of four packed floats by taking
// the reciprocal of the reciprocal square root.
inline float32x4_t cv_vsqrtq_f32(float32x4_t val)
{
    const float32x4_t inv_root = cv_vrsqrtq_f32(val);
    return cv_vrecpq_f32(inv_root);
}
|
||||
|
||||
// Two-lane square root approximation: reciprocal of the reciprocal
// square root of the input.
inline float32x2_t cv_vsqrt_f32(float32x2_t val)
{
    const float32x2_t inv_root = cv_vrsqrt_f32(val);
    return cv_vrecp_f32(inv_root);
}
|
||||
|
||||
#endif
|
||||
|
||||
} // cv
|
||||
|
@ -168,6 +168,31 @@ static void FastAtan2_32f(const float *Y, const float *X, float *angle, int len,
|
||||
_mm_storeu_ps(angle + i, a);
|
||||
}
|
||||
}
|
||||
#elif CV_NEON
|
||||
float32x4_t eps = vdupq_n_f32((float)DBL_EPSILON);
|
||||
float32x4_t _90 = vdupq_n_f32(90.f), _180 = vdupq_n_f32(180.f), _360 = vdupq_n_f32(360.f);
|
||||
float32x4_t z = vdupq_n_f32(0.0f), scale4 = vdupq_n_f32(scale);
|
||||
float32x4_t p1 = vdupq_n_f32(atan2_p1), p3 = vdupq_n_f32(atan2_p3);
|
||||
float32x4_t p5 = vdupq_n_f32(atan2_p5), p7 = vdupq_n_f32(atan2_p7);
|
||||
|
||||
for( ; i <= len - 4; i += 4 )
|
||||
{
|
||||
float32x4_t x = vld1q_f32(X + i), y = vld1q_f32(Y + i);
|
||||
float32x4_t ax = vabsq_f32(x), ay = vabsq_f32(y);
|
||||
float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay);
|
||||
float32x4_t c = vmulq_f32(tmin, cv_vrecpq_f32(vaddq_f32(tmax, eps)));
|
||||
float32x4_t c2 = vmulq_f32(c, c);
|
||||
float32x4_t a = vmulq_f32(c2, p7);
|
||||
a = vmulq_f32(vaddq_f32(a, p5), c2);
|
||||
a = vmulq_f32(vaddq_f32(a, p3), c2);
|
||||
a = vmulq_f32(vaddq_f32(a, p1), c);
|
||||
|
||||
a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a));
|
||||
a = vbslq_f32(vcltq_f32(x, z), vsubq_f32(_180, a), a);
|
||||
a = vbslq_f32(vcltq_f32(y, z), vsubq_f32(_360, a), a);
|
||||
|
||||
vst1q_f32(angle + i, vmulq_f32(a, scale4));
|
||||
}
|
||||
#endif
|
||||
|
||||
for( ; i < len; i++ )
|
||||
@ -268,17 +293,15 @@ static void Magnitude_32f(const float* x, const float* y, float* mag, int len)
|
||||
}
|
||||
}
|
||||
#elif CV_NEON
|
||||
float CV_DECL_ALIGNED(16) m[4];
|
||||
|
||||
for( ; i <= len - 4; i += 4 )
|
||||
{
|
||||
float32x4_t v_x = vld1q_f32(x + i), v_y = vld1q_f32(y + i);
|
||||
vst1q_f32(m, vaddq_f32(vmulq_f32(v_x, v_x), vmulq_f32(v_y, v_y)));
|
||||
|
||||
mag[i] = std::sqrt(m[0]);
|
||||
mag[i+1] = std::sqrt(m[1]);
|
||||
mag[i+2] = std::sqrt(m[2]);
|
||||
mag[i+3] = std::sqrt(m[3]);
|
||||
vst1q_f32(mag + i, cv_vsqrtq_f32(vmlaq_f32(vmulq_f32(v_x, v_x), v_y, v_y)));
|
||||
}
|
||||
for( ; i <= len - 2; i += 2 )
|
||||
{
|
||||
float32x2_t v_x = vld1_f32(x + i), v_y = vld1_f32(y + i);
|
||||
vst1_f32(mag + i, cv_vsqrt_f32(vmla_f32(vmul_f32(v_x, v_x), v_y, v_y)));
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -370,6 +393,12 @@ static void InvSqrt_32f(const float* src, float* dst, int len)
|
||||
_mm_storeu_ps(dst + i, t0); _mm_storeu_ps(dst + i + 4, t1);
|
||||
}
|
||||
}
|
||||
#elif CV_NEON
|
||||
for ( ; i <= len - 8; i += 8)
|
||||
{
|
||||
vst1q_f32(dst + i, cv_vrsqrtq_f32(vld1q_f32(src + i)));
|
||||
vst1q_f32(dst + i + 4, cv_vrsqrtq_f32(vld1q_f32(src + i + 4)));
|
||||
}
|
||||
#endif
|
||||
|
||||
for( ; i < len; i++ )
|
||||
@ -428,6 +457,12 @@ static void Sqrt_32f(const float* src, float* dst, int len)
|
||||
_mm_storeu_ps(dst + i, t0); _mm_storeu_ps(dst + i + 4, t1);
|
||||
}
|
||||
}
|
||||
#elif CV_NEON
|
||||
for ( ; i <= len - 8; i += 8)
|
||||
{
|
||||
vst1q_f32(dst + i, cv_vsqrtq_f32(vld1q_f32(src + i)));
|
||||
vst1q_f32(dst + i + 4, cv_vsqrtq_f32(vld1q_f32(src + i + 4)));
|
||||
}
|
||||
#endif
|
||||
|
||||
for( ; i < len; i++ )
|
||||
@ -869,11 +904,24 @@ void polarToCart( InputArray src1, InputArray src2,
|
||||
|
||||
SinCos_32f( angle, y, x, len, angleInDegrees );
|
||||
if( mag )
|
||||
for( k = 0; k < len; k++ )
|
||||
{
|
||||
k = 0;
|
||||
|
||||
#if CV_NEON
|
||||
for( ; k <= len - 4; k += 4 )
|
||||
{
|
||||
float32x4_t v_m = vld1q_f32(mag + k);
|
||||
vst1q_f32(x + k, vmulq_f32(vld1q_f32(x + k), v_m));
|
||||
vst1q_f32(y + k, vmulq_f32(vld1q_f32(y + k), v_m));
|
||||
}
|
||||
#endif
|
||||
|
||||
for( ; k < len; k++ )
|
||||
{
|
||||
float m = mag[k];
|
||||
x[k] *= m; y[k] *= m;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -2121,12 +2169,230 @@ void log( InputArray _src, OutputArray _dst )
|
||||
* P O W E R *
|
||||
\****************************************************************************************/
|
||||
|
||||
// Generic (non-vectorized) fallback for integer-power SIMD kernels.
// It touches neither buffer and reports that zero elements were processed.
template <typename T, typename WT>
struct iPow_SIMD
{
    int operator() ( const T * /*src*/, T * /*dst*/, int /*len*/, int /*power*/ )
    {
        const int processed = 0;
        return processed;
    }
};
|
||||
|
||||
#if CV_NEON
|
||||
|
||||
// NEON kernel raising unsigned 8-bit values to an integer power.
// Handles eight elements per iteration: the bytes are widened to two
// uint32x4_t halves, raised by square-and-multiply, then narrowed back with
// the saturating vqmovn_* intrinsics. Returns the number of elements written;
// the caller is expected to finish the remaining tail.
template <>
struct iPow_SIMD<uchar, int>
{
    int operator() ( const uchar * src, uchar * dst, int len, int power)
    {
        const uint32x4_t v_one = vdupq_n_u32(1u);
        int pos = 0;

        while (pos <= len - 8)
        {
            // Widen 8 x u8 -> 8 x u16 -> 2 x (4 x u32).
            const uint16x8_t v_wide = vmovl_u8(vld1_u8(src + pos));
            uint32x4_t v_base_lo = vmovl_u16(vget_low_u16(v_wide));
            uint32x4_t v_base_hi = vmovl_u16(vget_high_u16(v_wide));
            uint32x4_t v_acc_lo = v_one, v_acc_hi = v_one;

            // Square-and-multiply over the exponent bits above the lowest set step.
            for (int e = power; e > 1; e >>= 1)
            {
                if (e & 1)
                {
                    v_acc_lo = vmulq_u32(v_acc_lo, v_base_lo);
                    v_acc_hi = vmulq_u32(v_acc_hi, v_base_hi);
                }
                v_base_lo = vmulq_u32(v_base_lo, v_base_lo);
                v_base_hi = vmulq_u32(v_base_hi, v_base_hi);
            }

            // Final multiplication by the remaining base factor.
            v_acc_lo = vmulq_u32(v_acc_lo, v_base_lo);
            v_acc_hi = vmulq_u32(v_acc_hi, v_base_hi);

            const uint16x8_t v_narrow = vcombine_u16(vqmovn_u32(v_acc_lo), vqmovn_u32(v_acc_hi));
            vst1_u8(dst + pos, vqmovn_u16(v_narrow));

            pos += 8;
        }

        return pos;
    }
};
|
||||
|
||||
// NEON kernel raising signed 8-bit values to an integer power.
// Eight elements per iteration are widened to two int32x4_t halves, raised by
// square-and-multiply and narrowed back with saturating vqmovn_* intrinsics.
// Returns the number of elements written.
template <>
struct iPow_SIMD<schar, int>
{
    int operator() ( const schar * src, schar * dst, int len, int power)
    {
        const int32x4_t v_one = vdupq_n_s32(1);
        int pos = 0;

        while (pos <= len - 8)
        {
            // Widen 8 x s8 -> 8 x s16 -> 2 x (4 x s32).
            const int16x8_t v_wide = vmovl_s8(vld1_s8(src + pos));
            int32x4_t v_base_lo = vmovl_s16(vget_low_s16(v_wide));
            int32x4_t v_base_hi = vmovl_s16(vget_high_s16(v_wide));
            int32x4_t v_acc_lo = v_one, v_acc_hi = v_one;

            for (int e = power; e > 1; e >>= 1)
            {
                if (e & 1)
                {
                    v_acc_lo = vmulq_s32(v_acc_lo, v_base_lo);
                    v_acc_hi = vmulq_s32(v_acc_hi, v_base_hi);
                }
                v_base_lo = vmulq_s32(v_base_lo, v_base_lo);
                v_base_hi = vmulq_s32(v_base_hi, v_base_hi);
            }

            // Final multiplication by the remaining base factor.
            v_acc_lo = vmulq_s32(v_acc_lo, v_base_lo);
            v_acc_hi = vmulq_s32(v_acc_hi, v_base_hi);

            const int16x8_t v_narrow = vcombine_s16(vqmovn_s32(v_acc_lo), vqmovn_s32(v_acc_hi));
            vst1_s8(dst + pos, vqmovn_s16(v_narrow));

            pos += 8;
        }

        return pos;
    }
};
|
||||
|
||||
// NEON kernel raising unsigned 16-bit values to an integer power.
// Eight elements per iteration are widened to two uint32x4_t halves, raised
// by square-and-multiply and narrowed back with saturating vqmovn_u32.
// Returns the number of elements written.
template <>
struct iPow_SIMD<ushort, int>
{
    int operator() ( const ushort * src, ushort * dst, int len, int power)
    {
        const uint32x4_t v_one = vdupq_n_u32(1u);
        int pos = 0;

        while (pos <= len - 8)
        {
            const uint16x8_t v_in = vld1q_u16(src + pos);
            uint32x4_t v_base_lo = vmovl_u16(vget_low_u16(v_in));
            uint32x4_t v_base_hi = vmovl_u16(vget_high_u16(v_in));
            uint32x4_t v_acc_lo = v_one, v_acc_hi = v_one;

            for (int e = power; e > 1; e >>= 1)
            {
                if (e & 1)
                {
                    v_acc_lo = vmulq_u32(v_acc_lo, v_base_lo);
                    v_acc_hi = vmulq_u32(v_acc_hi, v_base_hi);
                }
                v_base_lo = vmulq_u32(v_base_lo, v_base_lo);
                v_base_hi = vmulq_u32(v_base_hi, v_base_hi);
            }

            // Final multiplication by the remaining base factor.
            v_acc_lo = vmulq_u32(v_acc_lo, v_base_lo);
            v_acc_hi = vmulq_u32(v_acc_hi, v_base_hi);

            vst1q_u16(dst + pos, vcombine_u16(vqmovn_u32(v_acc_lo), vqmovn_u32(v_acc_hi)));

            pos += 8;
        }

        return pos;
    }
};
|
||||
|
||||
// NEON kernel raising signed 16-bit values to an integer power.
// Eight elements per iteration are widened to two int32x4_t halves, raised
// by square-and-multiply and narrowed back with saturating vqmovn_s32.
// Returns the number of elements written.
template <>
struct iPow_SIMD<short, int>
{
    int operator() ( const short * src, short * dst, int len, int power)
    {
        const int32x4_t v_one = vdupq_n_s32(1);
        int pos = 0;

        while (pos <= len - 8)
        {
            const int16x8_t v_in = vld1q_s16(src + pos);
            int32x4_t v_base_lo = vmovl_s16(vget_low_s16(v_in));
            int32x4_t v_base_hi = vmovl_s16(vget_high_s16(v_in));
            int32x4_t v_acc_lo = v_one, v_acc_hi = v_one;

            for (int e = power; e > 1; e >>= 1)
            {
                if (e & 1)
                {
                    v_acc_lo = vmulq_s32(v_acc_lo, v_base_lo);
                    v_acc_hi = vmulq_s32(v_acc_hi, v_base_hi);
                }
                v_base_lo = vmulq_s32(v_base_lo, v_base_lo);
                v_base_hi = vmulq_s32(v_base_hi, v_base_hi);
            }

            // Final multiplication by the remaining base factor.
            v_acc_lo = vmulq_s32(v_acc_lo, v_base_lo);
            v_acc_hi = vmulq_s32(v_acc_hi, v_base_hi);

            vst1q_s16(dst + pos, vcombine_s16(vqmovn_s32(v_acc_lo), vqmovn_s32(v_acc_hi)));

            pos += 8;
        }

        return pos;
    }
};
|
||||
|
||||
|
||||
// NEON kernel raising 32-bit signed integers to an integer power.
// Works on four elements per iteration using square-and-multiply directly
// in int32x4_t registers. Returns the number of elements written.
template <>
struct iPow_SIMD<int, int>
{
    int operator() ( const int * src, int * dst, int len, int power)
    {
        const int32x4_t v_one = vdupq_n_s32(1);
        int pos = 0;

        while (pos <= len - 4)
        {
            int32x4_t v_base = vld1q_s32(src + pos);
            int32x4_t v_acc = v_one;

            for (int e = power; e > 1; e >>= 1)
            {
                if (e & 1)
                    v_acc = vmulq_s32(v_acc, v_base);
                v_base = vmulq_s32(v_base, v_base);
            }

            // Final multiplication by the remaining base factor.
            v_acc = vmulq_s32(v_acc, v_base);
            vst1q_s32(dst + pos, v_acc);

            pos += 4;
        }

        return pos;
    }
};
|
||||
|
||||
// NEON kernel raising 32-bit floats to an integer power.
// Works on four elements per iteration using square-and-multiply in
// float32x4_t registers. Returns the number of elements written.
template <>
struct iPow_SIMD<float, float>
{
    int operator() ( const float * src, float * dst, int len, int power)
    {
        const float32x4_t v_one = vdupq_n_f32(1.0f);
        int pos = 0;

        while (pos <= len - 4)
        {
            float32x4_t v_base = vld1q_f32(src + pos);
            float32x4_t v_acc = v_one;

            for (int e = power; e > 1; e >>= 1)
            {
                if (e & 1)
                    v_acc = vmulq_f32(v_acc, v_base);
                v_base = vmulq_f32(v_base, v_base);
            }

            // Final multiplication by the remaining base factor.
            v_acc = vmulq_f32(v_acc, v_base);
            vst1q_f32(dst + pos, v_acc);

            pos += 4;
        }

        return pos;
    }
};
|
||||
|
||||
#endif
|
||||
|
||||
template<typename T, typename WT>
|
||||
static void
|
||||
iPow_( const T* src, T* dst, int len, int power )
|
||||
{
|
||||
int i;
|
||||
for( i = 0; i < len; i++ )
|
||||
iPow_SIMD<T, WT> vop;
|
||||
int i = vop(src, dst, len, power);
|
||||
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
WT a = 1, b = src[i];
|
||||
int p = power;
|
||||
|
@ -69,7 +69,7 @@ static void calcMinEigenVal( const Mat& _cov, Mat& _dst )
|
||||
if( simd )
|
||||
{
|
||||
__m128 half = _mm_set1_ps(0.5f);
|
||||
for( ; j <= size.width - 5; j += 4 )
|
||||
for( ; j <= size.width - 4; j += 4 )
|
||||
{
|
||||
__m128 t0 = _mm_loadu_ps(cov + j*3); // a0 b0 c0 x
|
||||
__m128 t1 = _mm_loadu_ps(cov + j*3 + 3); // a1 b1 c1 x
|
||||
@ -90,6 +90,19 @@ static void calcMinEigenVal( const Mat& _cov, Mat& _dst )
|
||||
_mm_storeu_ps(dst + j, a);
|
||||
}
|
||||
}
|
||||
#elif CV_NEON
|
||||
float32x4_t v_half = vdupq_n_f32(0.5f);
|
||||
for( ; j <= size.width - 4; j += 4 )
|
||||
{
|
||||
float32x4x3_t v_src = vld3q_f32(cov + j * 3);
|
||||
float32x4_t v_a = vmulq_f32(v_src.val[0], v_half);
|
||||
float32x4_t v_b = v_src.val[1];
|
||||
float32x4_t v_c = vmulq_f32(v_src.val[2], v_half);
|
||||
|
||||
float32x4_t v_t = vsubq_f32(v_a, v_c);
|
||||
v_t = vmlaq_f32(vmulq_f32(v_t, v_t), v_b, v_b);
|
||||
vst1q_f32(dst + j, vsubq_f32(vaddq_f32(v_a, v_c), cv_vsqrtq_f32(v_t)));
|
||||
}
|
||||
#endif
|
||||
for( ; j < size.width; j++ )
|
||||
{
|
||||
@ -290,8 +303,24 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size,
|
||||
float* cov_data = cov.ptr<float>(i);
|
||||
const float* dxdata = Dx.ptr<float>(i);
|
||||
const float* dydata = Dy.ptr<float>(i);
|
||||
j = 0;
|
||||
|
||||
for( j = 0; j < size.width; j++ )
|
||||
#if CV_NEON
|
||||
for( ; j <= size.width - 4; j += 4 )
|
||||
{
|
||||
float32x4_t v_dx = vld1q_f32(dxdata + j);
|
||||
float32x4_t v_dy = vld1q_f32(dydata + j);
|
||||
|
||||
float32x4x3_t v_dst;
|
||||
v_dst.val[0] = vmulq_f32(v_dx, v_dx);
|
||||
v_dst.val[1] = vmulq_f32(v_dx, v_dy);
|
||||
v_dst.val[2] = vmulq_f32(v_dy, v_dy);
|
||||
|
||||
vst3q_f32(cov_data + j * 3, v_dst);
|
||||
}
|
||||
#endif
|
||||
|
||||
for( ; j < size.width; j++ )
|
||||
{
|
||||
float dx = dxdata[j];
|
||||
float dy = dydata[j];
|
||||
|
@ -2316,7 +2316,16 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
|
||||
}
|
||||
else if( method == CV_COMP_INTERSECT )
|
||||
{
|
||||
for( j = 0; j < len; j++ )
|
||||
j = 0;
|
||||
#if CV_NEON
|
||||
float32x4_t v_result = vdupq_n_f32(0.0f);
|
||||
for( ; j <= len - 4; j += 4 )
|
||||
v_result = vaddq_f32(v_result, vminq_f32(vld1q_f32(h1 + j), vld1q_f32(h2 + j)));
|
||||
float CV_DECL_ALIGNED(16) ar[4];
|
||||
vst1q_f32(ar, v_result);
|
||||
result += ar[0] + ar[1] + ar[2] + ar[3];
|
||||
#endif
|
||||
for( ; j < len; j++ )
|
||||
result += std::min(h1[j], h2[j]);
|
||||
}
|
||||
else if( method == CV_COMP_BHATTACHARYYA )
|
||||
|
@ -133,7 +133,7 @@ static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2];
|
||||
static float BilinearTab_f[INTER_TAB_SIZE2][2][2];
|
||||
static short BilinearTab_i[INTER_TAB_SIZE2][2][2];
|
||||
|
||||
#if CV_SSE2
|
||||
#if CV_SSE2 || CV_NEON
|
||||
static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8];
|
||||
static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16);
|
||||
#endif
|
||||
@ -269,7 +269,7 @@ static const void* initInterTab2D( int method, bool fixpt )
|
||||
}
|
||||
tab -= INTER_TAB_SIZE2*ksize*ksize;
|
||||
itab -= INTER_TAB_SIZE2*ksize*ksize;
|
||||
#if CV_SSE2
|
||||
#if CV_SSE2 || CV_NEON
|
||||
if( method == INTER_LINEAR )
|
||||
{
|
||||
for( i = 0; i < INTER_TAB_SIZE2; i++ )
|
||||
@ -894,9 +894,51 @@ struct VResizeCubicVec_32f
|
||||
}
|
||||
};
|
||||
|
||||
typedef VResizeNoVec VResizeLanczos4Vec_32f16u;
|
||||
typedef VResizeNoVec VResizeLanczos4Vec_32f16s;
|
||||
typedef VResizeNoVec VResizeLanczos4Vec_32f;
|
||||
|
||||
#elif CV_NEON
|
||||
|
||||
typedef VResizeNoVec VResizeLinearVec_32s8u;
|
||||
// NEON vertical pass of linear resize: blends two rows of 32-bit
// intermediate values with two 16-bit weights and writes 8-bit results.
// Processes 16 output pixels per iteration and returns how many were written.
struct VResizeLinearVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
        const int** rows = (const int**)_src;
        const int* top = rows[0];
        const int* bottom = rows[1];
        const short* weights = (const short*)_beta;

        const int16x8_t v_w0 = vdupq_n_s16(weights[0]);
        const int16x8_t v_w1 = vdupq_n_s16(weights[1]);
        const int16x8_t v_round = vdupq_n_s16(2);

        int x = 0;
        while (x <= width - 16)
        {
            const int16x8_t v_first = blend8(top + x, bottom + x, v_w0, v_w1, v_round);
            const int16x8_t v_second = blend8(top + x + 8, bottom + x + 8, v_w0, v_w1, v_round);

            // Narrow both halves to unsigned 8-bit and store 16 pixels at once.
            vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_first), vqmovun_s16(v_second)));
            x += 16;
        }

        return x;
    }

private:
    // Blends eight consecutive values of two rows:
    // each input is shifted right by 4 and narrowed to 16 bits, multiplied by
    // its weight (vqdmulhq_s16 followed by a shift right by 1), the two
    // products are summed, and the sum is rounded with +v_round and >> 2.
    static int16x8_t blend8(const int* s0, const int* s1,
                            int16x8_t v_w0, int16x8_t v_w1, int16x8_t v_round)
    {
        const int16x8_t v_a = vcombine_s16(vmovn_s32(vshrq_n_s32(vld1q_s32(s0), 4)),
                                           vmovn_s32(vshrq_n_s32(vld1q_s32(s0 + 4), 4)));
        const int16x8_t v_b = vcombine_s16(vmovn_s32(vshrq_n_s32(vld1q_s32(s1), 4)),
                                           vmovn_s32(vshrq_n_s32(vld1q_s32(s1 + 4), 4)));

        const int16x8_t v_sum = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_a, v_w0), 1),
                                          vshrq_n_s16(vqdmulhq_s16(v_b, v_w1), 1));

        return vshrq_n_s16(vaddq_s16(v_sum, v_round), 2);
    }
};
|
||||
|
||||
struct VResizeLinearVec_32f16u
|
||||
{
|
||||
@ -1071,6 +1113,128 @@ struct VResizeCubicVec_32f
|
||||
}
|
||||
};
|
||||
|
||||
// NEON vertical pass of Lanczos4 resize for float intermediates and
// unsigned 16-bit output: each output value is the weighted sum of eight
// source rows. Processes 8 pixels per iteration; returns the count written.
struct VResizeLanczos4Vec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** rows = (const float**)_src;
        const float* coeffs = (const float*)_beta;
        ushort * out = (ushort*)_dst;

        // Cache the eight row pointers and broadcast the eight weights.
        const float* tap[8];
        float32x4_t v_w[8];
        for (int k = 0; k < 8; ++k)
        {
            tap[k] = rows[k];
            v_w[k] = vdupq_n_f32(coeffs[k]);
        }

        int x = 0;
        while (x <= width - 8)
        {
            const float32x4_t v_first = weightedSum(tap, v_w, x);
            const float32x4_t v_second = weightedSum(tap, v_w, x + 4);

            // Round to unsigned 32-bit, then narrow with saturation to 16-bit.
            vst1q_u16(out + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_first)),
                                            vqmovn_u32(cv_vrndq_u32_f32(v_second))));
            x += 8;
        }

        return x;
    }

private:
    // Weighted sum of four consecutive floats from each of the eight rows at
    // offset 'off'. Rows 0..3 and rows 4..7 are accumulated separately and
    // then added, preserving the summation order of the original expression.
    static float32x4_t weightedSum(const float* const* tap, const float32x4_t* v_w, int off)
    {
        float32x4_t v_head = vmulq_f32(v_w[0], vld1q_f32(tap[0] + off));
        v_head = vmlaq_f32(v_head, v_w[1], vld1q_f32(tap[1] + off));
        v_head = vmlaq_f32(v_head, v_w[2], vld1q_f32(tap[2] + off));
        v_head = vmlaq_f32(v_head, v_w[3], vld1q_f32(tap[3] + off));

        float32x4_t v_tail = vmulq_f32(v_w[4], vld1q_f32(tap[4] + off));
        v_tail = vmlaq_f32(v_tail, v_w[5], vld1q_f32(tap[5] + off));
        v_tail = vmlaq_f32(v_tail, v_w[6], vld1q_f32(tap[6] + off));
        v_tail = vmlaq_f32(v_tail, v_w[7], vld1q_f32(tap[7] + off));

        return vaddq_f32(v_head, v_tail);
    }
};
|
||||
|
||||
// NEON vertical pass of Lanczos4 resize for float intermediates and
// signed 16-bit output: each output value is the weighted sum of eight
// source rows. Processes 8 pixels per iteration; returns the count written.
struct VResizeLanczos4Vec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** rows = (const float**)_src;
        const float* coeffs = (const float*)_beta;
        short * out = (short*)_dst;

        // Cache the eight row pointers and broadcast the eight weights.
        const float* tap[8];
        float32x4_t v_w[8];
        for (int k = 0; k < 8; ++k)
        {
            tap[k] = rows[k];
            v_w[k] = vdupq_n_f32(coeffs[k]);
        }

        int x = 0;
        while (x <= width - 8)
        {
            const float32x4_t v_first = weightedSum(tap, v_w, x);
            const float32x4_t v_second = weightedSum(tap, v_w, x + 4);

            // Round to signed 32-bit, then narrow with saturation to 16-bit.
            vst1q_s16(out + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_first)),
                                            vqmovn_s32(cv_vrndq_s32_f32(v_second))));
            x += 8;
        }

        return x;
    }

private:
    // Weighted sum of four consecutive floats from each of the eight rows at
    // offset 'off'. Rows 0..3 and rows 4..7 are accumulated separately and
    // then added, preserving the summation order of the original expression.
    static float32x4_t weightedSum(const float* const* tap, const float32x4_t* v_w, int off)
    {
        float32x4_t v_head = vmulq_f32(v_w[0], vld1q_f32(tap[0] + off));
        v_head = vmlaq_f32(v_head, v_w[1], vld1q_f32(tap[1] + off));
        v_head = vmlaq_f32(v_head, v_w[2], vld1q_f32(tap[2] + off));
        v_head = vmlaq_f32(v_head, v_w[3], vld1q_f32(tap[3] + off));

        float32x4_t v_tail = vmulq_f32(v_w[4], vld1q_f32(tap[4] + off));
        v_tail = vmlaq_f32(v_tail, v_w[5], vld1q_f32(tap[5] + off));
        v_tail = vmlaq_f32(v_tail, v_w[6], vld1q_f32(tap[6] + off));
        v_tail = vmlaq_f32(v_tail, v_w[7], vld1q_f32(tap[7] + off));

        return vaddq_f32(v_head, v_tail);
    }
};
|
||||
|
||||
// NEON vertical pass of Lanczos4 resize for float input and float output:
// each output value is the weighted sum of eight source rows.
// Processes 4 pixels per iteration; returns the count written.
struct VResizeLanczos4Vec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** rows = (const float**)_src;
        const float* coeffs = (const float*)_beta;
        float* out = (float*)_dst;

        // Cache the eight row pointers and broadcast the eight weights.
        const float* tap[8];
        float32x4_t v_w[8];
        for (int k = 0; k < 8; ++k)
        {
            tap[k] = rows[k];
            v_w[k] = vdupq_n_f32(coeffs[k]);
        }

        int x = 0;
        while (x <= width - 4)
        {
            // Rows 0..3 and rows 4..7 are accumulated independently and
            // combined at the end, as in the original expression.
            float32x4_t v_head = vmulq_f32(v_w[0], vld1q_f32(tap[0] + x));
            float32x4_t v_tail = vmulq_f32(v_w[4], vld1q_f32(tap[4] + x));
            for (int k = 1; k < 4; ++k)
            {
                v_head = vmlaq_f32(v_head, v_w[k], vld1q_f32(tap[k] + x));
                v_tail = vmlaq_f32(v_tail, v_w[k + 4], vld1q_f32(tap[k + 4] + x));
            }

            vst1q_f32(out + x, vaddq_f32(v_head, v_tail));
            x += 4;
        }

        return x;
    }
};
|
||||
|
||||
#else
|
||||
|
||||
typedef VResizeNoVec VResizeLinearVec_32s8u;
|
||||
@ -1083,6 +1247,10 @@ typedef VResizeNoVec VResizeCubicVec_32f16u;
|
||||
typedef VResizeNoVec VResizeCubicVec_32f16s;
|
||||
typedef VResizeNoVec VResizeCubicVec_32f;
|
||||
|
||||
typedef VResizeNoVec VResizeLanczos4Vec_32f16u;
|
||||
typedef VResizeNoVec VResizeLanczos4Vec_32f16s;
|
||||
typedef VResizeNoVec VResizeLanczos4Vec_32f;
|
||||
|
||||
#endif
|
||||
|
||||
typedef HResizeNoVec HResizeLinearVec_8u32s;
|
||||
@ -1611,6 +1779,107 @@ private:
|
||||
int cn, step;
|
||||
};
|
||||
|
||||
// NEON kernel for 2x2 area down-scaling of signed 16-bit data.
// For 1 and 4 channels it averages each 2x2 neighbourhood taken from two
// source rows ('step' bytes apart) as (sum + 2) >> 2. Any other channel count
// is left untouched and 0 is returned so the caller processes everything.
class ResizeAreaFastVec_SIMD_16s
{
public:
    ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
        cn(_cn), step(_step)
    {
    }

    // S: first source row, D: destination row, w: number of output values.
    // Returns the number of output values produced.
    int operator() (const short * S, short * D, int w) const
    {
        const short * top = S;
        const short * bottom = (const short *)((const uchar *)(top) + step);
        const int32x4_t v_round = vdupq_n_s32(2);
        int done = 0;

        if (cn == 4)
        {
            // One iteration consumes two 4-channel pixels per row and
            // produces one 4-channel output pixel.
            while (done <= w - 4)
            {
                const int16x8_t v_top = vld1q_s16(top);
                const int16x8_t v_bottom = vld1q_s16(bottom);

                const int32x4_t v_sum = vaddq_s32(vaddl_s16(vget_low_s16(v_top), vget_high_s16(v_top)),
                                                  vaddl_s16(vget_low_s16(v_bottom), vget_high_s16(v_bottom)));
                vst1_s16(D, vmovn_s32(vshrq_n_s32(vaddq_s32(v_sum, v_round), 2)));

                done += 4; top += 8; bottom += 8; D += 4;
            }
        }
        else if (cn == 1)
        {
            // De-interleaving loads split even and odd columns, so eight
            // horizontal pairs per row are summed at once.
            while (done <= w - 8)
            {
                const int16x8x2_t v_top = vld2q_s16(top);
                const int16x8x2_t v_bottom = vld2q_s16(bottom);

                int32x4_t v_lo = vaddl_s16(vget_low_s16(v_top.val[0]), vget_low_s16(v_top.val[1]));
                v_lo = vaddq_s32(v_lo, vaddl_s16(vget_low_s16(v_bottom.val[0]), vget_low_s16(v_bottom.val[1])));
                v_lo = vshrq_n_s32(vaddq_s32(v_lo, v_round), 2);

                int32x4_t v_hi = vaddl_s16(vget_high_s16(v_top.val[0]), vget_high_s16(v_top.val[1]));
                v_hi = vaddq_s32(v_hi, vaddl_s16(vget_high_s16(v_bottom.val[0]), vget_high_s16(v_bottom.val[1])));
                v_hi = vshrq_n_s32(vaddq_s32(v_hi, v_round), 2);

                vst1q_s16(D, vcombine_s16(vmovn_s32(v_lo), vmovn_s32(v_hi)));

                done += 8; top += 16; bottom += 16; D += 8;
            }
        }

        return done;
    }

private:
    int cn, step;
};
|
||||
|
||||
// NEON kernel for 2x2 area down-scaling of 32-bit float data.
// The fast path is enabled only for scale 2x2 with 1, 3 or 4 channels;
// vector code exists for 1 and 4 channels, so with 3 channels the operator
// returns 0 and the caller handles all values.
struct ResizeAreaFastVec_SIMD_32f
{
    ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
        scale_x(_scale_x), scale_y(_scale_y), cn(_cn),
        fast_mode(_scale_x == 2 && _scale_y == 2 && (_cn == 1 || _cn == 3 || _cn == 4)),
        step(_step)
    {
    }

    // S: first source row, D: destination row, w: number of output values.
    // Returns the number of output values produced.
    int operator() (const float * S, float * D, int w) const
    {
        int done = 0;
        if (fast_mode)
        {
            const float * top = S;
            const float * bottom = (const float *)((const uchar *)(top) + step);
            const float32x4_t v_quarter = vdupq_n_f32(0.25f);

            if (cn == 4)
            {
                // Two adjacent 4-channel pixels per row form one output pixel.
                while (done <= w - 4)
                {
                    const float32x4_t v_top = vaddq_f32(vld1q_f32(top), vld1q_f32(top + 4));
                    const float32x4_t v_bottom = vaddq_f32(vld1q_f32(bottom), vld1q_f32(bottom + 4));

                    vst1q_f32(D, vmulq_f32(vaddq_f32(v_top, v_bottom), v_quarter));

                    done += 4; top += 8; bottom += 8; D += 4;
                }
            }
            else if (cn == 1)
            {
                // De-interleaving loads separate even and odd columns.
                while (done <= w - 4)
                {
                    const float32x4x2_t v_r0 = vld2q_f32(top);
                    const float32x4x2_t v_r1 = vld2q_f32(bottom);

                    const float32x4_t v_top = vaddq_f32(v_r0.val[0], v_r0.val[1]);
                    const float32x4_t v_bottom = vaddq_f32(v_r1.val[0], v_r1.val[1]);

                    vst1q_f32(D, vmulq_f32(vaddq_f32(v_top, v_bottom), v_quarter));

                    done += 4; top += 8; bottom += 8; D += 4;
                }
            }
        }

        return done;
    }

private:
    int scale_x, scale_y;
    int cn;
    bool fast_mode;
    int step;
};
|
||||
|
||||
#elif CV_SSE2
|
||||
|
||||
class ResizeAreaFastVec_SIMD_8u
|
||||
@ -1800,9 +2069,16 @@ private:
|
||||
bool use_simd;
|
||||
};
|
||||
|
||||
typedef ResizeAreaFastNoVec<short, short> ResizeAreaFastVec_SIMD_16s;
|
||||
typedef ResizeAreaFastNoVec<float, float> ResizeAreaFastVec_SIMD_32f;
|
||||
|
||||
#else
|
||||
|
||||
typedef ResizeAreaFastNoVec<uchar, uchar> ResizeAreaFastVec_SIMD_8u;
|
||||
typedef ResizeAreaFastNoVec<ushort, ushort> ResizeAreaFastVec_SIMD_16u;
|
||||
typedef ResizeAreaFastNoVec<short, short> ResizeAreaFastVec_SIMD_16s;
|
||||
typedef ResizeAreaFastNoVec<float, float> ResizeAreaFastVec_SIMD_32f;
|
||||
|
||||
#endif
|
||||
|
||||
template<typename T, typename SIMDVecOp>
|
||||
@ -2626,14 +2902,14 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
|
||||
0,
|
||||
resizeGeneric_<HResizeLanczos4<ushort, float, float>,
|
||||
VResizeLanczos4<ushort, float, float, Cast<float, ushort>,
|
||||
VResizeNoVec> >,
|
||||
VResizeLanczos4Vec_32f16u> >,
|
||||
resizeGeneric_<HResizeLanczos4<short, float, float>,
|
||||
VResizeLanczos4<short, float, float, Cast<float, short>,
|
||||
VResizeNoVec> >,
|
||||
VResizeLanczos4Vec_32f16s> >,
|
||||
0,
|
||||
resizeGeneric_<HResizeLanczos4<float, float, float>,
|
||||
VResizeLanczos4<float, float, float, Cast<float, float>,
|
||||
VResizeNoVec> >,
|
||||
VResizeLanczos4Vec_32f> >,
|
||||
resizeGeneric_<HResizeLanczos4<double, double, float>,
|
||||
VResizeLanczos4<double, double, float, Cast<double, double>,
|
||||
VResizeNoVec> >,
|
||||
@ -2645,9 +2921,9 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
|
||||
resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> >,
|
||||
0,
|
||||
resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort, ResizeAreaFastVec_SIMD_16u> >,
|
||||
resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastNoVec<short, float> > >,
|
||||
resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastVec_SIMD_16s> >,
|
||||
0,
|
||||
resizeAreaFast_<float, float, ResizeAreaFastNoVec<float, float> >,
|
||||
resizeAreaFast_<float, float, ResizeAreaFastVec_SIMD_32f>,
|
||||
resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
|
||||
0
|
||||
};
|
||||
@ -4281,17 +4557,59 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
float* dst2f = dstmap2.ptr<float>(y);
|
||||
short* dst1 = (short*)dst1f;
|
||||
ushort* dst2 = (ushort*)dst2f;
|
||||
x = 0;
|
||||
|
||||
if( m1type == CV_32FC1 && dstm1type == CV_16SC2 )
|
||||
{
|
||||
if( nninterpolate )
|
||||
for( x = 0; x < size.width; x++ )
|
||||
{
|
||||
#if CV_NEON
|
||||
for( ; x <= size.width - 8; x += 8 )
|
||||
{
|
||||
int16x8x2_t v_dst;
|
||||
v_dst.val[0] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))),
|
||||
vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4))));
|
||||
v_dst.val[1] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x))),
|
||||
vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x + 4))));
|
||||
|
||||
vst2q_s16(dst1 + (x << 1), v_dst);
|
||||
}
|
||||
#endif
|
||||
for( ; x < size.width; x++ )
|
||||
{
|
||||
dst1[x*2] = saturate_cast<short>(src1f[x]);
|
||||
dst1[x*2+1] = saturate_cast<short>(src2f[x]);
|
||||
}
|
||||
}
|
||||
else
|
||||
for( x = 0; x < size.width; x++ )
|
||||
{
|
||||
#if CV_NEON
|
||||
float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
|
||||
int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
|
||||
|
||||
for( ; x <= size.width - 8; x += 8 )
|
||||
{
|
||||
int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x), v_scale));
|
||||
int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x + 4), v_scale));
|
||||
int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x), v_scale));
|
||||
int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x + 4), v_scale));
|
||||
|
||||
int16x8x2_t v_dst;
|
||||
v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)),
|
||||
vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS)));
|
||||
v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)),
|
||||
vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS)));
|
||||
|
||||
vst2q_s16(dst1 + (x << 1), v_dst);
|
||||
|
||||
uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS),
|
||||
vandq_s32(v_ix0, v_mask)));
|
||||
uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS),
|
||||
vandq_s32(v_ix1, v_mask)));
|
||||
vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
|
||||
}
|
||||
#endif
|
||||
for( ; x < size.width; x++ )
|
||||
{
|
||||
int ix = saturate_cast<int>(src1f[x]*INTER_TAB_SIZE);
|
||||
int iy = saturate_cast<int>(src2f[x]*INTER_TAB_SIZE);
|
||||
@ -4299,17 +4617,53 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
|
||||
dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
|
||||
}
|
||||
}
|
||||
}
|
||||
else if( m1type == CV_32FC2 && dstm1type == CV_16SC2 )
|
||||
{
|
||||
if( nninterpolate )
|
||||
for( x = 0; x < size.width; x++ )
|
||||
{
|
||||
#if CV_NEON
|
||||
for( ; x <= (size.width << 1) - 8; x += 8 )
|
||||
vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))),
|
||||
vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4)))));
|
||||
#endif
|
||||
for( ; x < size.width; x++ )
|
||||
{
|
||||
dst1[x*2] = saturate_cast<short>(src1f[x*2]);
|
||||
dst1[x*2+1] = saturate_cast<short>(src1f[x*2+1]);
|
||||
}
|
||||
}
|
||||
else
|
||||
for( x = 0; x < size.width; x++ )
|
||||
{
|
||||
#if CV_NEON
|
||||
float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
|
||||
int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
|
||||
|
||||
for( ; x <= size.width - 8; x += 8 )
|
||||
{
|
||||
float32x4x2_t v_src0 = vld2q_f32(src1f + (x << 1)), v_src1 = vld2q_f32(src1f + (x << 1) + 8);
|
||||
int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[0], v_scale));
|
||||
int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[0], v_scale));
|
||||
int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[1], v_scale));
|
||||
int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[1], v_scale));
|
||||
|
||||
int16x8x2_t v_dst;
|
||||
v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)),
|
||||
vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS)));
|
||||
v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)),
|
||||
vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS)));
|
||||
|
||||
vst2q_s16(dst1 + (x << 1), v_dst);
|
||||
|
||||
uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS),
|
||||
vandq_s32(v_ix0, v_mask)));
|
||||
uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS),
|
||||
vandq_s32(v_ix1, v_mask)));
|
||||
vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
|
||||
}
|
||||
#endif
|
||||
for( ; x < size.width; x++ )
|
||||
{
|
||||
int ix = saturate_cast<int>(src1f[x*2]*INTER_TAB_SIZE);
|
||||
int iy = saturate_cast<int>(src1f[x*2+1]*INTER_TAB_SIZE);
|
||||
@ -4317,10 +4671,44 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
|
||||
dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
|
||||
}
|
||||
}
|
||||
}
|
||||
else if( m1type == CV_16SC2 && dstm1type == CV_32FC1 )
|
||||
{
|
||||
for( x = 0; x < size.width; x++ )
|
||||
#if CV_NEON
|
||||
uint16x8_t v_mask2 = vdupq_n_u16(INTER_TAB_SIZE2-1);
|
||||
uint32x4_t v_zero = vdupq_n_u32(0u), v_mask = vdupq_n_u32(INTER_TAB_SIZE-1);
|
||||
float32x4_t v_scale = vdupq_n_f32(scale);
|
||||
|
||||
for( ; x <= size.width - 8; x += 8)
|
||||
{
|
||||
uint32x4_t v_fxy1, v_fxy2;
|
||||
if (src2)
|
||||
{
|
||||
uint16x8_t v_src2 = vandq_u16(vld1q_u16(src2 + x), v_mask2);
|
||||
v_fxy1 = vmovl_u16(vget_low_u16(v_src2));
|
||||
v_fxy2 = vmovl_u16(vget_high_u16(v_src2));
|
||||
}
|
||||
else
|
||||
v_fxy1 = v_fxy2 = v_zero;
|
||||
|
||||
int16x8x2_t v_src = vld2q_s16(src1 + (x << 1));
|
||||
float32x4_t v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))),
|
||||
v_scale, vcvtq_f32_u32(vandq_u32(v_fxy1, v_mask)));
|
||||
float32x4_t v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))),
|
||||
v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy1, INTER_BITS)));
|
||||
vst1q_f32(dst1f + x, v_dst1);
|
||||
vst1q_f32(dst2f + x, v_dst2);
|
||||
|
||||
v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))),
|
||||
v_scale, vcvtq_f32_u32(vandq_u32(v_fxy2, v_mask)));
|
||||
v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))),
|
||||
v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy2, INTER_BITS)));
|
||||
vst1q_f32(dst1f + x + 4, v_dst1);
|
||||
vst1q_f32(dst2f + x + 4, v_dst2);
|
||||
}
|
||||
#endif
|
||||
for( ; x < size.width; x++ )
|
||||
{
|
||||
int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1) : 0;
|
||||
dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
|
||||
@ -4329,7 +4717,39 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
}
|
||||
else if( m1type == CV_16SC2 && dstm1type == CV_32FC2 )
|
||||
{
|
||||
for( x = 0; x < size.width; x++ )
|
||||
#if CV_NEON
|
||||
int16x8_t v_mask2 = vdupq_n_s16(INTER_TAB_SIZE2-1);
|
||||
int32x4_t v_zero = vdupq_n_s32(0), v_mask = vdupq_n_s32(INTER_TAB_SIZE-1);
|
||||
float32x4_t v_scale = vdupq_n_f32(scale);
|
||||
|
||||
for( ; x <= size.width - 8; x += 8)
|
||||
{
|
||||
int32x4_t v_fxy1, v_fxy2;
|
||||
if (src2)
|
||||
{
|
||||
int16x8_t v_src2 = vandq_s16(vld1q_s16((short *)src2 + x), v_mask2);
|
||||
v_fxy1 = vmovl_s16(vget_low_s16(v_src2));
|
||||
v_fxy2 = vmovl_s16(vget_high_s16(v_src2));
|
||||
}
|
||||
else
|
||||
v_fxy1 = v_fxy2 = v_zero;
|
||||
|
||||
int16x8x2_t v_src = vld2q_s16(src1 + (x << 1));
|
||||
float32x4x2_t v_dst;
|
||||
v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))),
|
||||
v_scale, vcvtq_f32_s32(vandq_s32(v_fxy1, v_mask)));
|
||||
v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))),
|
||||
v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy1, INTER_BITS)));
|
||||
vst2q_f32(dst1f + (x << 1), v_dst);
|
||||
|
||||
v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))),
|
||||
v_scale, vcvtq_f32_s32(vandq_s32(v_fxy2, v_mask)));
|
||||
v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))),
|
||||
v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy2, INTER_BITS)));
|
||||
vst2q_f32(dst1f + (x << 1) + 8, v_dst);
|
||||
}
|
||||
#endif
|
||||
for( ; x < size.width; x++ )
|
||||
{
|
||||
int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1): 0;
|
||||
dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
|
||||
@ -4389,13 +4809,29 @@ public:
|
||||
int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;
|
||||
|
||||
if( interpolation == INTER_NEAREST )
|
||||
for( x1 = 0; x1 < bw; x1++ )
|
||||
{
|
||||
x1 = 0;
|
||||
#if CV_NEON
|
||||
int32x4_t v_X0 = vdupq_n_s32(X0), v_Y0 = vdupq_n_s32(Y0);
|
||||
for( ; x1 <= bw - 8; x1 += 8 )
|
||||
{
|
||||
int16x8x2_t v_dst;
|
||||
v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1)), AB_BITS)),
|
||||
vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS)));
|
||||
v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1)), AB_BITS)),
|
||||
vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS)));
|
||||
|
||||
vst2q_s16(xy + (x1 << 1), v_dst);
|
||||
}
|
||||
#endif
|
||||
for( ; x1 < bw; x1++ )
|
||||
{
|
||||
int X = (X0 + adelta[x+x1]) >> AB_BITS;
|
||||
int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
|
||||
xy[x1*2] = saturate_cast<short>(X);
|
||||
xy[x1*2+1] = saturate_cast<short>(Y);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
short* alpha = A + y1*bw;
|
||||
@ -4433,6 +4869,27 @@ public:
|
||||
_mm_storeu_si128((__m128i*)(alpha + x1), fx_);
|
||||
}
|
||||
}
|
||||
#elif CV_NEON
|
||||
int32x4_t v__X0 = vdupq_n_s32(X0), v__Y0 = vdupq_n_s32(Y0), v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
|
||||
for( ; x1 <= bw - 8; x1 += 8 )
|
||||
{
|
||||
int32x4_t v_X0 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1)), AB_BITS - INTER_BITS);
|
||||
int32x4_t v_Y0 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1)), AB_BITS - INTER_BITS);
|
||||
int32x4_t v_X1 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS - INTER_BITS);
|
||||
int32x4_t v_Y1 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS - INTER_BITS);
|
||||
|
||||
int16x8x2_t v_xy;
|
||||
v_xy.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_X0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_X1, INTER_BITS)));
|
||||
v_xy.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_Y0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_Y1, INTER_BITS)));
|
||||
|
||||
vst2q_s16(xy + (x1 << 1), v_xy);
|
||||
|
||||
int16x4_t v_alpha0 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y0, v_mask), INTER_BITS),
|
||||
vandq_s32(v_X0, v_mask)));
|
||||
int16x4_t v_alpha1 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y1, v_mask), INTER_BITS),
|
||||
vandq_s32(v_X1, v_mask)));
|
||||
vst1q_s16(alpha + x1, vcombine_s16(v_alpha0, v_alpha1));
|
||||
}
|
||||
#endif
|
||||
for( ; x1 < bw; x1++ )
|
||||
{
|
||||
|
@ -60,11 +60,16 @@ template<typename T, int shift> struct FltCast
|
||||
rtype operator ()(type1 arg) const { return arg*(T)(1./(1 << shift)); }
|
||||
};
|
||||
|
||||
template<typename T1, typename T2> struct NoVec
|
||||
// Placeholder "vector" functor for pyrDown element types that have no SIMD
// kernel. It reads and writes nothing and always returns 0.
// NOTE(review): presumably the caller interprets the result as the number of
// columns already handled and runs its scalar loop from there - confirm in pyrDown_.
template<typename T1, typename T2>
struct PyrDownNoVec
{
    int operator()(T1**, T2*, int, int) const
    {
        const int processed = 0;
        return processed;
    }
};
|
||||
|
||||
// Placeholder "vector" functor for pyrUp element types that have no SIMD
// kernel. It takes the same arguments as the real PyrUpVec_* functors (array of
// source rows, array of destination rows, two ints), touches none of them and
// returns 0, so a caller that resumes its scalar loop at the returned index
// processes every column itself.
template<typename T1, typename T2>
struct PyrUpNoVec
{
    int operator()(T1**, T2**, int, int) const
    {
        const int processed = 0;
        return processed;
    }
};
|
||||
|
||||
#if CV_SSE2
|
||||
|
||||
struct PyrDownVec_32s8u
|
||||
@ -178,10 +183,13 @@ struct PyrDownVec_32f
|
||||
}
|
||||
};
|
||||
|
||||
typedef NoVec<int, ushort> PyrDownVec_32s16u;
|
||||
typedef NoVec<int, short> PyrDownVec_32s16s;
|
||||
typedef PyrDownNoVec<int, ushort> PyrDownVec_32s16u;
|
||||
typedef PyrDownNoVec<int, short> PyrDownVec_32s16s;
|
||||
|
||||
typedef NoVec<float, float> PyrUpVec_32f;
|
||||
typedef PyrUpNoVec<int, uchar> PyrUpVec_32s8u;
|
||||
typedef PyrUpNoVec<int, short> PyrUpVec_32s16s;
|
||||
typedef PyrUpNoVec<int, ushort> PyrUpVec_32s16u;
|
||||
typedef PyrUpNoVec<float, float> PyrUpVec_32f;
|
||||
|
||||
#elif CV_NEON
|
||||
|
||||
@ -203,9 +211,9 @@ struct PyrDownVec_32s8u
|
||||
uint16x8_t v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x)), vqmovn_u32(vld1q_u32(row3 + x + 4)));
|
||||
uint16x8_t v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x)), vqmovn_u32(vld1q_u32(row4 + x + 4)));
|
||||
|
||||
v_r0 = vqaddq_u16(vqaddq_u16(v_r0, v_r4), vqaddq_u16(v_r2, v_r2));
|
||||
v_r1 = vqaddq_u16(vqaddq_u16(v_r1, v_r2), v_r3);
|
||||
uint16x8_t v_dst0 = vqaddq_u16(v_r0, vshlq_n_u16(v_r1, 2));
|
||||
v_r0 = vaddq_u16(vaddq_u16(v_r0, v_r4), vaddq_u16(v_r2, v_r2));
|
||||
v_r1 = vaddq_u16(vaddq_u16(v_r1, v_r2), v_r3);
|
||||
uint16x8_t v_dst0 = vaddq_u16(v_r0, vshlq_n_u16(v_r1, 2));
|
||||
|
||||
v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x + 8)), vqmovn_u32(vld1q_u32(row0 + x + 12)));
|
||||
v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x + 8)), vqmovn_u32(vld1q_u32(row1 + x + 12)));
|
||||
@ -213,9 +221,9 @@ struct PyrDownVec_32s8u
|
||||
v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x + 8)), vqmovn_u32(vld1q_u32(row3 + x + 12)));
|
||||
v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x + 8)), vqmovn_u32(vld1q_u32(row4 + x + 12)));
|
||||
|
||||
v_r0 = vqaddq_u16(vqaddq_u16(v_r0, v_r4), vqaddq_u16(v_r2, v_r2));
|
||||
v_r1 = vqaddq_u16(vqaddq_u16(v_r1, v_r2), v_r3);
|
||||
uint16x8_t v_dst1 = vqaddq_u16(v_r0, vshlq_n_u16(v_r1, 2));
|
||||
v_r0 = vaddq_u16(vaddq_u16(v_r0, v_r4), vaddq_u16(v_r2, v_r2));
|
||||
v_r1 = vaddq_u16(vaddq_u16(v_r1, v_r2), v_r3);
|
||||
uint16x8_t v_dst1 = vaddq_u16(v_r0, vshlq_n_u16(v_r1, 2));
|
||||
|
||||
vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst0, v_delta), 8)),
|
||||
vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst1, v_delta), 8))));
|
||||
@ -240,18 +248,17 @@ struct PyrDownVec_32s16u
|
||||
int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4);
|
||||
int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4);
|
||||
int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4);
|
||||
int32x4_t shifted;
|
||||
|
||||
v_r00 = vaddq_s32(vqaddq_s32(v_r00, v_r40), vqaddq_s32(v_r20, v_r20));
|
||||
v_r10 = vaddq_s32(vqaddq_s32(v_r10, v_r20), v_r30);
|
||||
v_r00 = vaddq_s32(vaddq_s32(v_r00, v_r40), vaddq_s32(v_r20, v_r20));
|
||||
v_r10 = vaddq_s32(vaddq_s32(v_r10, v_r20), v_r30);
|
||||
|
||||
shifted = vshlq_n_s32(v_r10, 2);
|
||||
int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vqaddq_s32(v_r00, shifted), v_delta), 8);
|
||||
v_r10 = vshlq_n_s32(v_r10, 2);
|
||||
int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r00, v_r10), v_delta), 8);
|
||||
|
||||
v_r01 = vaddq_s32(vqaddq_s32(v_r01, v_r41), vqaddq_s32(v_r21, v_r21));
|
||||
v_r11 = vaddq_s32(vqaddq_s32(v_r11, v_r21), v_r31);
|
||||
shifted = vshlq_n_s32(v_r11, 2);
|
||||
int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vqaddq_s32(v_r01, shifted), v_delta), 8);
|
||||
v_r01 = vaddq_s32(vaddq_s32(v_r01, v_r41), vaddq_s32(v_r21, v_r21));
|
||||
v_r11 = vaddq_s32(vaddq_s32(v_r11, v_r21), v_r31);
|
||||
v_r11 = vshlq_n_s32(v_r11, 2);
|
||||
int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r01, v_r11), v_delta), 8);
|
||||
|
||||
vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_dst0), vqmovun_s32(v_dst1)));
|
||||
}
|
||||
@ -275,17 +282,16 @@ struct PyrDownVec_32s16s
|
||||
int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4);
|
||||
int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4);
|
||||
int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4);
|
||||
int32x4_t shifted;
|
||||
|
||||
v_r00 = vaddq_s32(vqaddq_s32(v_r00, v_r40), vqaddq_s32(v_r20, v_r20));
|
||||
v_r10 = vaddq_s32(vqaddq_s32(v_r10, v_r20), v_r30);
|
||||
shifted = vshlq_n_s32(v_r10, 2);
|
||||
int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vqaddq_s32(v_r00, shifted), v_delta), 8);
|
||||
v_r00 = vaddq_s32(vaddq_s32(v_r00, v_r40), vaddq_s32(v_r20, v_r20));
|
||||
v_r10 = vaddq_s32(vaddq_s32(v_r10, v_r20), v_r30);
|
||||
v_r10 = vshlq_n_s32(v_r10, 2);
|
||||
int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r00, v_r10), v_delta), 8);
|
||||
|
||||
v_r01 = vaddq_s32(vqaddq_s32(v_r01, v_r41), vqaddq_s32(v_r21, v_r21));
|
||||
v_r11 = vaddq_s32(vqaddq_s32(v_r11, v_r21), v_r31);
|
||||
shifted = vshlq_n_s32(v_r11, 2);
|
||||
int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vqaddq_s32(v_r01, shifted), v_delta), 8);
|
||||
v_r01 = vaddq_s32(vaddq_s32(v_r01, v_r41), vaddq_s32(v_r21, v_r21));
|
||||
v_r11 = vaddq_s32(vaddq_s32(v_r11, v_r21), v_r31);
|
||||
v_r11 = vshlq_n_s32(v_r11, 2);
|
||||
int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r01, v_r11), v_delta), 8);
|
||||
|
||||
vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1)));
|
||||
}
|
||||
@ -329,14 +335,156 @@ struct PyrDownVec_32f
|
||||
}
|
||||
};
|
||||
|
||||
struct PyrUpVec_32f
|
||||
struct PyrUpVec_32s8u
|
||||
{
|
||||
int operator()(float** src, float* dst, int, int width) const
|
||||
int operator()(int** src, uchar** dst, int, int width) const
|
||||
{
|
||||
int x = 0;
|
||||
uchar *dst0 = dst[0], *dst1 = dst[1];
|
||||
const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
|
||||
uint16x8_t v_delta = vdupq_n_u16(32);
|
||||
|
||||
for( ; x <= width - 16; x += 16 )
|
||||
{
|
||||
uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4)));
|
||||
uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4)));
|
||||
uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4)));
|
||||
|
||||
uint16x8_t v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1);
|
||||
uint16x8_t v_dst00 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1));
|
||||
uint16x8_t v_dst10 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2);
|
||||
|
||||
v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x + 8)), vqmovn_u32(vld1q_u32(row0 + x + 12)));
|
||||
v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x + 8)), vqmovn_u32(vld1q_u32(row1 + x + 12)));
|
||||
v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x + 8)), vqmovn_u32(vld1q_u32(row2 + x + 12)));
|
||||
|
||||
v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1);
|
||||
uint16x8_t v_dst01 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1));
|
||||
uint16x8_t v_dst11 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2);
|
||||
|
||||
vst1q_u8(dst0 + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst00, v_delta), 6)),
|
||||
vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst01, v_delta), 6))));
|
||||
vst1q_u8(dst1 + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst10, v_delta), 6)),
|
||||
vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst11, v_delta), 6))));
|
||||
}
|
||||
|
||||
for( ; x <= width - 8; x += 8 )
|
||||
{
|
||||
uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4)));
|
||||
uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4)));
|
||||
uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4)));
|
||||
|
||||
uint16x8_t v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1);
|
||||
uint16x8_t v_dst0 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1));
|
||||
uint16x8_t v_dst1 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2);
|
||||
|
||||
vst1_u8(dst0 + x, vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst0, v_delta), 6)));
|
||||
vst1_u8(dst1 + x, vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst1, v_delta), 6)));
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
struct PyrUpVec_32s16u
|
||||
{
|
||||
int operator()(int** src, ushort** dst, int, int width) const
|
||||
{
|
||||
int x = 0;
|
||||
ushort *dst0 = dst[0], *dst1 = dst[1];
|
||||
const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
|
||||
uint32x4_t v_delta = vdupq_n_u32(32);
|
||||
|
||||
for( ; x <= width - 8; x += 8 )
|
||||
{
|
||||
uint32x4_t v_r0 = vld1q_u32(row0 + x), v_r1 = vld1q_u32(row1 + x), v_r2 = vld1q_u32(row2 + x);
|
||||
uint32x4_t v_2r1 = vshlq_n_u32(v_r1, 1), v_4r1 = vshlq_n_u32(v_r1, 2);
|
||||
uint32x4_t v_dst00 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1));
|
||||
uint32x4_t v_dst10 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2);
|
||||
|
||||
v_r0 = vld1q_u32(row0 + x + 4);
|
||||
v_r1 = vld1q_u32(row1 + x + 4);
|
||||
v_r2 = vld1q_u32(row2 + x + 4);
|
||||
v_2r1 = vshlq_n_u32(v_r1, 1);
|
||||
v_4r1 = vshlq_n_u32(v_r1, 2);
|
||||
uint32x4_t v_dst01 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1));
|
||||
uint32x4_t v_dst11 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2);
|
||||
|
||||
vst1q_u16(dst0 + x, vcombine_u16(vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst00, v_delta), 6)),
|
||||
vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst01, v_delta), 6))));
|
||||
vst1q_u16(dst1 + x, vcombine_u16(vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst10, v_delta), 6)),
|
||||
vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst11, v_delta), 6))));
|
||||
}
|
||||
|
||||
for( ; x <= width - 4; x += 4 )
|
||||
{
|
||||
uint32x4_t v_r0 = vld1q_u32(row0 + x), v_r1 = vld1q_u32(row1 + x), v_r2 = vld1q_u32(row2 + x);
|
||||
uint32x4_t v_2r1 = vshlq_n_u32(v_r1, 1), v_4r1 = vshlq_n_u32(v_r1, 2);
|
||||
|
||||
uint32x4_t v_dst0 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1));
|
||||
uint32x4_t v_dst1 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2);
|
||||
|
||||
vst1_u16(dst0 + x, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0, v_delta), 6)));
|
||||
vst1_u16(dst1 + x, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1, v_delta), 6)));
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
struct PyrUpVec_32s16s
|
||||
{
|
||||
int operator()(int** src, short** dst, int, int width) const
|
||||
{
|
||||
int x = 0;
|
||||
short *dst0 = dst[0], *dst1 = dst[1];
|
||||
const int *row0 = src[0], *row1 = src[1], *row2 = src[2];
|
||||
int32x4_t v_delta = vdupq_n_s32(32);
|
||||
|
||||
for( ; x <= width - 8; x += 8 )
|
||||
{
|
||||
int32x4_t v_r0 = vld1q_s32(row0 + x), v_r1 = vld1q_s32(row1 + x), v_r2 = vld1q_s32(row2 + x);
|
||||
int32x4_t v_2r1 = vshlq_n_s32(v_r1, 1), v_4r1 = vshlq_n_s32(v_r1, 2);
|
||||
int32x4_t v_dst00 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1));
|
||||
int32x4_t v_dst10 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2);
|
||||
|
||||
v_r0 = vld1q_s32(row0 + x + 4);
|
||||
v_r1 = vld1q_s32(row1 + x + 4);
|
||||
v_r2 = vld1q_s32(row2 + x + 4);
|
||||
v_2r1 = vshlq_n_s32(v_r1, 1);
|
||||
v_4r1 = vshlq_n_s32(v_r1, 2);
|
||||
int32x4_t v_dst01 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1));
|
||||
int32x4_t v_dst11 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2);
|
||||
|
||||
vst1q_s16(dst0 + x, vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst00, v_delta), 6)),
|
||||
vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst01, v_delta), 6))));
|
||||
vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst10, v_delta), 6)),
|
||||
vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst11, v_delta), 6))));
|
||||
}
|
||||
|
||||
for( ; x <= width - 4; x += 4 )
|
||||
{
|
||||
int32x4_t v_r0 = vld1q_s32(row0 + x), v_r1 = vld1q_s32(row1 + x), v_r2 = vld1q_s32(row2 + x);
|
||||
int32x4_t v_2r1 = vshlq_n_s32(v_r1, 1), v_4r1 = vshlq_n_s32(v_r1, 2);
|
||||
|
||||
int32x4_t v_dst0 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1));
|
||||
int32x4_t v_dst1 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2);
|
||||
|
||||
vst1_s16(dst0 + x, vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst0, v_delta), 6)));
|
||||
vst1_s16(dst1 + x, vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst1, v_delta), 6)));
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
struct PyrUpVec_32f
|
||||
{
|
||||
int operator()(float** src, float** dst, int, int width) const
|
||||
{
|
||||
int x = 0;
|
||||
float ** dsts = (float **)dst;
|
||||
const float *row0 = src[0], *row1 = src[1], *row2 = src[2];
|
||||
float *dst0 = dsts[0], *dst1 = dsts[1];
|
||||
float *dst0 = dst[0], *dst1 = dst[1];
|
||||
float32x4_t v_6 = vdupq_n_f32(6.0f), v_scale = vdupq_n_f32(1.f/64.0f), v_scale4 = vmulq_n_f32(v_scale, 4.0f);
|
||||
|
||||
for( ; x <= width - 8; x += 8 )
|
||||
@ -362,12 +510,15 @@ struct PyrUpVec_32f
|
||||
|
||||
#else
|
||||
|
||||
typedef NoVec<int, uchar> PyrDownVec_32s8u;
|
||||
typedef NoVec<int, ushort> PyrDownVec_32s16u;
|
||||
typedef NoVec<int, short> PyrDownVec_32s16s;
|
||||
typedef NoVec<float, float> PyrDownVec_32f;
|
||||
typedef PyrDownNoVec<int, uchar> PyrDownVec_32s8u;
|
||||
typedef PyrDownNoVec<int, ushort> PyrDownVec_32s16u;
|
||||
typedef PyrDownNoVec<int, short> PyrDownVec_32s16s;
|
||||
typedef PyrDownNoVec<float, float> PyrDownVec_32f;
|
||||
|
||||
typedef NoVec<float, float> PyrUpVec_32f;
|
||||
typedef PyrUpNoVec<int, uchar> PyrUpVec_32s8u;
|
||||
typedef PyrUpNoVec<int, short> PyrUpVec_32s16s;
|
||||
typedef PyrUpNoVec<int, ushort> PyrUpVec_32s16u;
|
||||
typedef PyrUpNoVec<float, float> PyrUpVec_32f;
|
||||
|
||||
#endif
|
||||
|
||||
@ -574,7 +725,7 @@ pyrUp_( const Mat& _src, Mat& _dst, int)
|
||||
row0 = rows[0]; row1 = rows[1]; row2 = rows[2];
|
||||
dsts[0] = dst0; dsts[1] = dst1;
|
||||
|
||||
x = vecOp(rows, (T*)dsts, (int)_dst.step, dsize.width);
|
||||
x = vecOp(rows, dsts, (int)_dst.step, dsize.width);
|
||||
for( ; x < dsize.width; x++ )
|
||||
{
|
||||
T t1 = castOp((row1[x] + row2[x])*4);
|
||||
@ -761,7 +912,7 @@ void cv::pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borde
|
||||
else if( depth == CV_32F )
|
||||
func = pyrDown_<FltCast<float, 8>, PyrDownVec_32f>;
|
||||
else if( depth == CV_64F )
|
||||
func = pyrDown_<FltCast<double, 8>, NoVec<double, double> >;
|
||||
func = pyrDown_<FltCast<double, 8>, PyrDownNoVec<double, double> >;
|
||||
else
|
||||
CV_Error( CV_StsUnsupportedFormat, "" );
|
||||
|
||||
@ -830,15 +981,15 @@ void cv::pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int borderT
|
||||
|
||||
PyrFunc func = 0;
|
||||
if( depth == CV_8U )
|
||||
func = pyrUp_<FixPtCast<uchar, 6>, NoVec<int, uchar> >;
|
||||
func = pyrUp_<FixPtCast<uchar, 6>, PyrUpVec_32s8u >;
|
||||
else if( depth == CV_16S )
|
||||
func = pyrUp_<FixPtCast<short, 6>, NoVec<int, short> >;
|
||||
func = pyrUp_<FixPtCast<short, 6>, PyrUpVec_32s16s >;
|
||||
else if( depth == CV_16U )
|
||||
func = pyrUp_<FixPtCast<ushort, 6>, NoVec<int, ushort> >;
|
||||
func = pyrUp_<FixPtCast<ushort, 6>, PyrUpVec_32s16u >;
|
||||
else if( depth == CV_32F )
|
||||
func = pyrUp_<FltCast<float, 6>, PyrUpVec_32f >;
|
||||
else if( depth == CV_64F )
|
||||
func = pyrUp_<FltCast<double, 6>, NoVec<double, double> >;
|
||||
func = pyrUp_<FltCast<double, 6>, PyrUpNoVec<double, double> >;
|
||||
else
|
||||
CV_Error( CV_StsUnsupportedFormat, "" );
|
||||
|
||||
|
@ -1100,7 +1100,7 @@ int CV_CompareHistTest::validate_test_results( int /*test_case_idx*/ )
|
||||
code = cvtest::TS::FAIL_INVALID_OUTPUT;
|
||||
break;
|
||||
}
|
||||
else if( fabs(v0 - v) > FLT_EPSILON*10*MAX(fabs(v0),0.1) )
|
||||
else if( fabs(v0 - v) > FLT_EPSILON*14*MAX(fabs(v0),0.1) )
|
||||
{
|
||||
ts->printf( cvtest::TS::LOG, "The comparison result using the method #%d (%s)\n\tis inaccurate (=%g, should be =%g)\n",
|
||||
i, method_name, v, v0 );
|
||||
|
@ -1548,9 +1548,28 @@ TEST(Imgproc_GetQuadSubPix, accuracy) { CV_GetQuadSubPixTest test; test.safe_run
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename T, typename WT>
|
||||
struct IntCast
|
||||
{
|
||||
T operator() (WT val) const
|
||||
{
|
||||
return cv::saturate_cast<T>(val >> 2);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename WT>
|
||||
struct FltCast
|
||||
{
|
||||
T operator() (WT val) const
|
||||
{
|
||||
return cv::saturate_cast<T>(val * 0.25);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename WT, int one, typename CastOp>
|
||||
void resizeArea(const cv::Mat & src, cv::Mat & dst)
|
||||
{
|
||||
int cn = src.channels();
|
||||
CastOp castOp;
|
||||
|
||||
for (int y = 0; y < dst.rows; ++y)
|
||||
{
|
||||
@ -1565,9 +1584,9 @@ void resizeArea(const cv::Mat & src, cv::Mat & dst)
|
||||
for (int c = 0; c < cn; ++c)
|
||||
{
|
||||
WT sum = WT(sptr0[x1 + c]) + WT(sptr0[x1 + c + cn]);
|
||||
sum += WT(sptr1[x1 + c]) + WT(sptr1[x1 + c + cn]) + (WT)(2);
|
||||
sum += WT(sptr1[x1 + c]) + WT(sptr1[x1 + c + cn]) + (WT)(one);
|
||||
|
||||
dptr[x + c] = cv::saturate_cast<T>(sum >> 2);
|
||||
dptr[x + c] = castOp(sum);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1575,32 +1594,38 @@ void resizeArea(const cv::Mat & src, cv::Mat & dst)
|
||||
|
||||
// Verifies cv::resize(..., cv::INTER_AREA) for an exact 2x downscale against
// the resizeArea reference implementation, for every type listed in `types`.
// Integer depths must match the reference exactly; 32-bit float is allowed a
// small absolute tolerance (eps).
TEST(Resize, Area_half)
{
    const int size = 1000;
    int types[] = { CV_8UC1, CV_8UC4, CV_16UC1, CV_16UC4, CV_16SC1, CV_16SC4, CV_32FC1, CV_32FC4 };

    // Fixed seed keeps the generated inputs reproducible between runs.
    cv::RNG rng(17);

    for (int i = 0, _size = sizeof(types) / sizeof(types[0]); i < _size; ++i)
    {
        int type = types[i], depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
        const float eps = depth <= CV_32S ? 0 : 7e-5f;

        // Report the failing depth / channel count alongside any assertion message.
        SCOPED_TRACE(depth);
        SCOPED_TRACE(cn);

        cv::Mat src(size, size, type), dst_actual(size >> 1, size >> 1, type),
            dst_reference(size >> 1, size >> 1, type);

        rng.fill(src, cv::RNG::UNIFORM, -1000, 1000, true);

        // Integer paths add a rounding term of 2 before the ">> 2" in IntCast;
        // the float path adds 0 and scales by 0.25 in FltCast.
        if (depth == CV_8U)
            resizeArea<uchar, ushort, 2, IntCast<uchar, ushort> >(src, dst_reference);
        else if (depth == CV_16U)
            resizeArea<ushort, uint, 2, IntCast<ushort, uint> >(src, dst_reference);
        else if (depth == CV_16S)
            resizeArea<short, int, 2, IntCast<short, int> >(src, dst_reference);
        else if (depth == CV_32F)
            resizeArea<float, float, 0, FltCast<float, float> >(src, dst_reference);
        else
            CV_Assert(0);

        cv::resize(src, dst_actual, dst_actual.size(), 0, 0, cv::INTER_AREA);

        // Largest per-element absolute difference must not exceed eps.
        ASSERT_GE(eps, cvtest::norm(dst_reference, dst_actual, cv::NORM_INF));
    }
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user