SymmColumnVec_32f16s: add NEON implementation for the asymmetrical kernel case

NEON speedup: 9.46x
Auto-vect speedup: 1x

Test kernel: [-0.9432, -1.1528, 0, 1.1528, 0.9432]
This commit is contained in:
orestis 2014-12-19 22:44:39 +02:00
parent a2a131799f
commit ed0ce48179

View File

@@ -2873,7 +2873,53 @@ struct SymmColumnVec_32f16s
}
else
{
return 0;
float32x2_t k32;
k32 = vdup_n_f32(0);
k32 = vld1_lane_f32(ky + 1, k32, 1);
for( ; i <= width - 8; i += 8 )
{
float32x4_t x1l, x1h, x2l, x2h;
float32x4_t accl, acch;
S = src[1] + i;
S2 = src[-1] + i;
x1l = vld1q_f32(S);
x1h = vld1q_f32(S + 4);
x2l = vld1q_f32(S2);
x2h = vld1q_f32(S2 + 4);
accl = acch = d4;
accl = vmlaq_lane_f32(accl, vsubq_f32(x1l, x2l), k32, 1);
acch = vmlaq_lane_f32(acch, vsubq_f32(x1h, x2h), k32, 1);
for( k = 2; k <= ksize2; k++ )
{
S = src[k] + i;
S2 = src[-k] + i;
float32x4_t x3l, x3h, x4l, x4h;
x3l = vld1q_f32(S);
x3h = vld1q_f32(S + 4);
x4l = vld1q_f32(S2);
x4h = vld1q_f32(S2 + 4);
accl = vmlaq_n_f32(accl, vsubq_f32(x3l, x4l), ky[k]);
acch = vmlaq_n_f32(acch, vsubq_f32(x3h, x4h), ky[k]);
}
int32x4_t s32l, s32h;
s32l = vcvtq_s32_f32(accl);
s32h = vcvtq_s32_f32(acch);
int16x4_t s16l, s16h;
s16l = vqmovn_s32(s32l);
s16h = vqmovn_s32(s32h);
vst1_s16((int16_t *)(dst + i), s16l);
vst1_s16((int16_t *)(dst + i + 4), s16h);
}
}
return i;