mirror of
https://github.com/opencv/opencv.git
synced 2025-08-05 22:19:14 +08:00
SymmColumnVec_32f16s
NEON speedup: 8.64x Auto-vect speedup: 1x Test kernel: [0.1, 0.2408, 0.3184, 0.2408, 0.1]
This commit is contained in:
parent
37e018454d
commit
a2a131799f
@ -2779,11 +2779,117 @@ struct SymmColumnSmallVec_32s16s
|
||||
};
|
||||
|
||||
|
||||
struct SymmColumnVec_32f16s
|
||||
{
|
||||
SymmColumnVec_32f16s() { symmetryType=0; }
|
||||
SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
|
||||
{
|
||||
symmetryType = _symmetryType;
|
||||
kernel = _kernel;
|
||||
delta = (float)_delta;
|
||||
CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
|
||||
//Uncomment the following line when runtime support for neon is implemented.
|
||||
// neon_supported = checkHardwareSupport(CV_CPU_NEON);
|
||||
}
|
||||
|
||||
int operator()(const uchar** _src, uchar* _dst, int width) const
|
||||
{
|
||||
//Uncomment the two following lines when runtime support for neon is implemented.
|
||||
// if( !neon_supported )
|
||||
// return 0;
|
||||
|
||||
int _ksize = kernel.rows + kernel.cols - 1;
|
||||
int ksize2 = _ksize / 2;
|
||||
const float* ky = kernel.ptr<float>() + ksize2;
|
||||
int i = 0, k;
|
||||
bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
|
||||
const float** src = (const float**)_src;
|
||||
const float *S, *S2;
|
||||
short* dst = (short*)_dst;
|
||||
|
||||
float32x4_t d4 = vdupq_n_f32(delta);
|
||||
|
||||
if( symmetrical )
|
||||
{
|
||||
if( _ksize == 1 )
|
||||
return 0;
|
||||
|
||||
|
||||
float32x2_t k32;
|
||||
k32 = vdup_n_f32(0);
|
||||
k32 = vld1_lane_f32(ky, k32, 0);
|
||||
k32 = vld1_lane_f32(ky + 1, k32, 1);
|
||||
|
||||
for( ; i <= width - 8; i += 8 )
|
||||
{
|
||||
float32x4_t x0l, x0h, x1l, x1h, x2l, x2h;
|
||||
float32x4_t accl, acch;
|
||||
|
||||
S = src[0] + i;
|
||||
|
||||
x0l = vld1q_f32(S);
|
||||
x0h = vld1q_f32(S + 4);
|
||||
|
||||
S = src[1] + i;
|
||||
S2 = src[-1] + i;
|
||||
|
||||
x1l = vld1q_f32(S);
|
||||
x1h = vld1q_f32(S + 4);
|
||||
x2l = vld1q_f32(S2);
|
||||
x2h = vld1q_f32(S2 + 4);
|
||||
|
||||
accl = acch = d4;
|
||||
accl = vmlaq_lane_f32(accl, x0l, k32, 0);
|
||||
acch = vmlaq_lane_f32(acch, x0h, k32, 0);
|
||||
accl = vmlaq_lane_f32(accl, vaddq_f32(x1l, x2l), k32, 1);
|
||||
acch = vmlaq_lane_f32(acch, vaddq_f32(x1h, x2h), k32, 1);
|
||||
|
||||
for( k = 2; k <= ksize2; k++ )
|
||||
{
|
||||
S = src[k] + i;
|
||||
S2 = src[-k] + i;
|
||||
|
||||
float32x4_t x3l, x3h, x4l, x4h;
|
||||
x3l = vld1q_f32(S);
|
||||
x3h = vld1q_f32(S + 4);
|
||||
x4l = vld1q_f32(S2);
|
||||
x4h = vld1q_f32(S2 + 4);
|
||||
|
||||
accl = vmlaq_n_f32(accl, vaddq_f32(x3l, x4l), ky[k]);
|
||||
acch = vmlaq_n_f32(acch, vaddq_f32(x3h, x4h), ky[k]);
|
||||
}
|
||||
|
||||
int32x4_t s32l, s32h;
|
||||
s32l = vcvtq_s32_f32(accl);
|
||||
s32h = vcvtq_s32_f32(acch);
|
||||
|
||||
int16x4_t s16l, s16h;
|
||||
s16l = vqmovn_s32(s32l);
|
||||
s16h = vqmovn_s32(s32h);
|
||||
|
||||
vst1_s16((int16_t *)(dst + i), s16l);
|
||||
vst1_s16((int16_t *)(dst + i + 4), s16h);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
int symmetryType;
|
||||
float delta;
|
||||
Mat kernel;
|
||||
bool neon_supported;
|
||||
};
|
||||
|
||||
|
||||
typedef RowNoVec RowVec_8u32s;
|
||||
typedef RowNoVec RowVec_16s32f;
|
||||
typedef RowNoVec RowVec_32f;
|
||||
typedef SymmRowSmallNoVec SymmRowSmallVec_32f;
|
||||
typedef ColumnNoVec SymmColumnVec_32f16s;
|
||||
typedef ColumnNoVec SymmColumnVec_32f;
|
||||
typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
|
||||
typedef FilterNoVec FilterVec_8u;
|
||||
|
Loading…
Reference in New Issue
Block a user