Fixed OOB reading in pyrDown

Vitaly Tuzov 2019-09-24 18:41:39 +03:00
parent c2096771cb
commit 1c17b3281a
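
Judging from the diff below, the out-of-bounds read was in the single-channel float and double kernels: src4 started one element too far to the right, so the deinterleaving load touched one value past the last tap actually used at the end of a row (see the note after the float hunk). The remaining hunks only rename src0/src2 to src01/src23 to make clear which taps each pointer feeds. For orientation, here is a hypothetical scalar reference of what every PyrDownVecH specialization computes; the name pyrDownRowRef and the explicit cn parameter are mine, not OpenCV's:

// Scalar sketch of the horizontal pyrDown pass: the 1-4-6-4-1 binomial
// kernel applied per channel to every second input pixel. "width" counts
// output elements, as in the vector templates below.
template<typename T1, typename T2>
void pyrDownRowRef(const T1* src, T2* row, int width, int cn)
{
    for (int j = 0; j < width; j++)
    {
        int c = j % cn;                       // channel of this output element
        const T1* s = src + 2 * (j - c) + c;  // tap 0 for this element
        row[j] = T2(s[0] + 4 * s[cn] + 6 * s[2*cn] + 4 * s[3*cn] + s[4*cn]);
    }
}

In the vector code, src01 feeds taps 0-1, src23 feeds taps 2-3 (hence the src + 2*cn offset), and src4 feeds tap 4; its pointer sits one channel stride before that tap, and the >> 16 and deinterleave tricks skip the extra element.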

@@ -87,13 +87,13 @@ template<typename T1, typename T2> int PyrUpVecV(T1**, T2**, int) { return 0; }
 template<> int PyrDownVecH<uchar, int, 1>(const uchar* src, int* row, int width)
 {
     int x = 0;
-    const uchar *src0 = src, *src2 = src + 2, *src4 = src + 3;
+    const uchar *src01 = src, *src23 = src + 2, *src4 = src + 3;
     v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
-    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
-        v_store(row, v_dotprod(v_reinterpret_as_s16(vx_load_expand(src0)), v_1_4) +
-                     v_dotprod(v_reinterpret_as_s16(vx_load_expand(src2)), v_6_4) +
+    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
+        v_store(row, v_dotprod(v_reinterpret_as_s16(vx_load_expand(src01)), v_1_4) +
+                     v_dotprod(v_reinterpret_as_s16(vx_load_expand(src23)), v_6_4) +
                      (v_reinterpret_as_s32(vx_load_expand(src4)) >> 16));
     vx_cleanup();
@@ -102,13 +102,13 @@ template<> int PyrDownVecH<uchar, int, 1>(const uchar* src, int* row, int width)
 template<> int PyrDownVecH<uchar, int, 2>(const uchar* src, int* row, int width)
 {
     int x = 0;
-    const uchar *src0 = src, *src2 = src + 4, *src4 = src + 6;
+    const uchar *src01 = src, *src23 = src + 4, *src4 = src + 6;
     v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
-    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
-        v_store(row, v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src0))), v_1_4) +
-                     v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src2))), v_6_4) +
+    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
+        v_store(row, v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src01))), v_1_4) +
+                     v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src23))), v_6_4) +
                      (v_reinterpret_as_s32(v_interleave_pairs(vx_load_expand(src4))) >> 16));
     vx_cleanup();
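
Note on the two-channel variants: the 16-bit lanes arrive channel-interleaved, while v_dotprod sums adjacent lane pairs, so v_interleave_pairs first regroups each block of four lanes so that the two samples of the same channel become neighbours (the four-channel variants further down do the same over blocks of eight with v_interleave_quads). A lane-level sketch of one block, under my reading of the intrinsics, with c0/c1 the two channels and p the input pixel:

// after vx_load_expand(src01):    { c0(p), c1(p), c0(p+1), c1(p+1), ... }
// after v_interleave_pairs(...):  { c0(p), c0(p+1), c1(p), c1(p+1), ... }
// v_dotprod(..., v_1_4):          { c0(p) + 4*c0(p+1), c1(p) + 4*c1(p+1), ... }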
@@ -150,13 +150,13 @@ template<> int PyrDownVecH<uchar, int, 3>(const uchar* src, int* row, int width)
 template<> int PyrDownVecH<uchar, int, 4>(const uchar* src, int* row, int width)
 {
     int x = 0;
-    const uchar *src0 = src, *src2 = src + 8, *src4 = src + 12;
+    const uchar *src01 = src, *src23 = src + 8, *src4 = src + 12;
     v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
-    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
-        v_store(row, v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src0))), v_1_4) +
-                     v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src2))), v_6_4) +
+    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
+        v_store(row, v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src01))), v_1_4) +
+                     v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src23))), v_6_4) +
                      (v_reinterpret_as_s32(v_interleave_quads(vx_load_expand(src4))) >> 16));
     vx_cleanup();
@@ -166,13 +166,13 @@ template<> int PyrDownVecH<uchar, int, 4>(const uchar* src, int* row, int width)
 template<> int PyrDownVecH<short, int, 1>(const short* src, int* row, int width)
 {
     int x = 0;
-    const short *src0 = src, *src2 = src + 2, *src4 = src + 3;
+    const short *src01 = src, *src23 = src + 2, *src4 = src + 3;
     v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
-    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
-        v_store(row, v_dotprod(vx_load(src0), v_1_4) +
-                     v_dotprod(vx_load(src2), v_6_4) +
+    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
+        v_store(row, v_dotprod(vx_load(src01), v_1_4) +
+                     v_dotprod(vx_load(src23), v_6_4) +
                      (v_reinterpret_as_s32(vx_load(src4)) >> 16));
     vx_cleanup();
@@ -181,13 +181,13 @@ template<> int PyrDownVecH<short, int, 1>(const short* src, int* row, int width)
 template<> int PyrDownVecH<short, int, 2>(const short* src, int* row, int width)
 {
     int x = 0;
-    const short *src0 = src, *src2 = src + 4, *src4 = src + 6;
+    const short *src01 = src, *src23 = src + 4, *src4 = src + 6;
     v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
-    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
-        v_store(row, v_dotprod(v_interleave_pairs(vx_load(src0)), v_1_4) +
-                     v_dotprod(v_interleave_pairs(vx_load(src2)), v_6_4) +
+    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
+        v_store(row, v_dotprod(v_interleave_pairs(vx_load(src01)), v_1_4) +
+                     v_dotprod(v_interleave_pairs(vx_load(src23)), v_6_4) +
                      (v_reinterpret_as_s32(v_interleave_pairs(vx_load(src4))) >> 16));
     vx_cleanup();
@@ -247,15 +247,15 @@ template<> int PyrDownVecH<short, int, 4>(const short* src, int* row, int width)
 template<> int PyrDownVecH<ushort, int, 1>(const ushort* src, int* row, int width)
 {
     int x = 0;
-    const ushort *src0 = src, *src2 = src + 2, *src4 = src + 3;
+    const ushort *src01 = src, *src23 = src + 2, *src4 = src + 3;
     v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
     v_uint16 v_half = vx_setall_u16(0x8000);
     v_int32 v_half15 = vx_setall_s32(0x00078000);
-    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
-        v_store(row, v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src0), v_half)), v_1_4) +
-                     v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src2), v_half)), v_6_4) +
+    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
+        v_store(row, v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src01), v_half)), v_1_4) +
+                     v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src23), v_half)), v_6_4) +
                      v_reinterpret_as_s32(v_reinterpret_as_u32(vx_load(src4)) >> 16) + v_half15);
     vx_cleanup();
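
Note on the ushort variants: v_dotprod works on signed 16-bit lanes, so each sample is shifted into signed range with v_sub_wrap(..., v_half) before the dot products and the bias is added back afterwards. The biased taps carry the coefficients 1 + 4 + 6 + 4 = 15, which is where v_half15 = 0x00078000 = 15 * 0x8000 comes from; the fifth tap is extracted unbiased by the >> 16 reinterpretation. Worked out per output lane (u0..u4 are the five unsigned taps; my arithmetic, not text from the commit):

//   (u0 - 0x8000) * 1 + (u1 - 0x8000) * 4   // v_dotprod with v_1_4
// + (u2 - 0x8000) * 6 + (u3 - 0x8000) * 4   // v_dotprod with v_6_4
// + u4                                      // (reinterpret as u32) >> 16
// + 0x00078000                              // v_half15 == 15 * 0x8000
// = u0 + 4*u1 + 6*u2 + 4*u3 + u4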
@@ -264,15 +264,15 @@ template<> int PyrDownVecH<ushort, int, 1>(const ushort* src, int* row, int width)
 template<> int PyrDownVecH<ushort, int, 2>(const ushort* src, int* row, int width)
 {
     int x = 0;
-    const ushort *src0 = src, *src2 = src + 4, *src4 = src + 6;
+    const ushort *src01 = src, *src23 = src + 4, *src4 = src + 6;
     v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
     v_uint16 v_half = vx_setall_u16(0x8000);
     v_int32 v_half15 = vx_setall_s32(0x00078000);
-    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
-        v_store(row, v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src0), v_half))), v_1_4) +
-                     v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src2), v_half))), v_6_4) +
+    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
+        v_store(row, v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src01), v_half))), v_1_4) +
+                     v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src23), v_half))), v_6_4) +
                      v_reinterpret_as_s32(v_reinterpret_as_u32(v_interleave_pairs(vx_load(src4))) >> 16) + v_half15);
     vx_cleanup();
@@ -344,15 +344,15 @@ template<> int PyrDownVecH<ushort, int, 4>(const ushort* src, int* row, int width)
 template<> int PyrDownVecH<float, float, 1>(const float* src, float* row, int width)
 {
     int x = 0;
-    const float *src0 = src, *src2 = src + 2, *src4 = src + 4;
+    const float *src01 = src, *src23 = src + 2, *src4 = src + 3;
     v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f);
-    for (; x <= width - v_float32::nlanes; x += v_float32::nlanes, src0 += 2*v_float32::nlanes, src2 += 2*v_float32::nlanes, src4 += 2*v_float32::nlanes, row+=v_float32::nlanes)
+    for (; x <= width - v_float32::nlanes; x += v_float32::nlanes, src01 += 2*v_float32::nlanes, src23 += 2*v_float32::nlanes, src4 += 2*v_float32::nlanes, row+=v_float32::nlanes)
     {
         v_float32 r0, r1, r2, r3, r4, rtmp;
-        v_load_deinterleave(src0, r0, r1);
-        v_load_deinterleave(src2, r2, r3);
-        v_load_deinterleave(src4, r4, rtmp);
+        v_load_deinterleave(src01, r0, r1);
+        v_load_deinterleave(src23, r2, r3);
+        v_load_deinterleave(src4, rtmp, r4);
         v_store(row, v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4)));
     }
     vx_cleanup();
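
This hunk and the matching double hunk at the end carry the actual fix. Previously src4 started at src + 4 and the deinterleave put the even lanes into r4, so each iteration read one element further to the right than the last tap it used; on the final iteration of a row that extra element can lie past the end of the buffer. Starting at src + 3 and taking the odd lanes yields the same r4 values while ending the read one element earlier. With n = v_float32::nlanes and offsets relative to the current src01 position, as I read it:

// old: v_load_deinterleave(src + 4, r4, rtmp);   // reads elements 4 .. 2*n + 3
//      r4 = { s[4], s[6], ..., s[2*n + 2] }
// new: v_load_deinterleave(src + 3, rtmp, r4);   // reads elements 3 .. 2*n + 2
//      r4 = { s[4], s[6], ..., s[2*n + 2] }      // same taps, shorter reach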
@@ -362,14 +362,14 @@ template<> int PyrDownVecH<float, float, 1>(const float* src, float* row, int width)
 template<> int PyrDownVecH<float, float, 2>(const float* src, float* row, int width)
 {
     int x = 0;
-    const float *src0 = src, *src2 = src + 4, *src4 = src + 6;
+    const float *src01 = src, *src23 = src + 4, *src4 = src + 6;
     v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f);
-    for (; x <= width - 2*v_float32::nlanes; x += 2*v_float32::nlanes, src0 += 4*v_float32::nlanes, src2 += 4*v_float32::nlanes, src4 += 4*v_float32::nlanes, row += 2*v_float32::nlanes)
+    for (; x <= width - 2*v_float32::nlanes; x += 2*v_float32::nlanes, src01 += 4*v_float32::nlanes, src23 += 4*v_float32::nlanes, src4 += 4*v_float32::nlanes, row += 2*v_float32::nlanes)
     {
         v_float32 r0a, r0b, r1a, r1b, r2a, r2b, r3a, r3b, r4a, r4b, rtmpa, rtmpb;
-        v_load_deinterleave(src0, r0a, r0b, r1a, r1b);
-        v_load_deinterleave(src2, r2a, r2b, r3a, r3b);
+        v_load_deinterleave(src01, r0a, r0b, r1a, r1b);
+        v_load_deinterleave(src23, r2a, r2b, r3a, r3b);
         v_load_deinterleave(src4, rtmpa, rtmpb, r4a, r4b);
         v_store_interleave(row, v_muladd(r2a, _6, v_muladd(r1a + r3a, _4, r0a + r4a)), v_muladd(r2b, _6, v_muladd(r1b + r3b, _4, r0b + r4b)));
     }
@@ -430,15 +430,15 @@ template<> int PyrDownVecH<float, float, 4>(const float* src, float* row, int width)
 template<> int PyrDownVecH<double, double, 1>(const double* src, double* row, int width)
 {
     int x = 0;
-    const double *src0 = src, *src2 = src + 2, *src4 = src + 4;
+    const double *src01 = src, *src23 = src + 2, *src4 = src + 3;
     v_float64 _4 = vx_setall_f64(4.f), _6 = vx_setall_f64(6.f);
-    for (; x <= width - v_float64::nlanes; x += v_float64::nlanes, src0 += 2*v_float64::nlanes, src2 += 2*v_float64::nlanes, src4 += 2*v_float64::nlanes, row += v_float64::nlanes)
+    for (; x <= width - v_float64::nlanes; x += v_float64::nlanes, src01 += 2*v_float64::nlanes, src23 += 2*v_float64::nlanes, src4 += 2*v_float64::nlanes, row += v_float64::nlanes)
     {
         v_float64 r0, r1, r2, r3, r4, rtmp;
-        v_load_deinterleave(src0, r0, r1);
-        v_load_deinterleave(src2, r2, r3);
-        v_load_deinterleave(src4, r4, rtmp);
+        v_load_deinterleave(src01, r0, r1);
+        v_load_deinterleave(src23, r2, r3);
+        v_load_deinterleave(src4, rtmp, r4);
         v_store(row, v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4)));
     }
     vx_cleanup();