Merge pull request #3041 from adrians:optimize_split

This commit is contained in:
Vadim Pisarevsky 2014-08-06 09:22:38 +00:00
commit d611684d9e

View File

@ -50,6 +50,71 @@ namespace cv
* split & merge *
\****************************************************************************************/
#if CV_NEON
template<typename T> struct VSplit2;
template<typename T> struct VSplit3;
template<typename T> struct VSplit4;
#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src, data_type* dst0, data_type* dst1){ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
} \
}
#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
data_type* dst2){ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
store_func(dst2, r.val[2]); \
} \
}
#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
data_type* dst2, data_type* dst3){ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
store_func(dst2, r.val[2]); \
store_func(dst3, r.val[3]); \
} \
}
SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 );
SPLIT2_KERNEL_TEMPLATE(VSplit2, schar , int8x16x2_t, vld2q_s8 , vst1q_s8 );
SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16);
SPLIT2_KERNEL_TEMPLATE(VSplit2, short , int16x8x2_t, vld2q_s16, vst1q_s16);
SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32);
SPLIT2_KERNEL_TEMPLATE(VSplit2, float , float32x4x2_t, vld2q_f32, vst1q_f32);
SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 );
SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 );
SPLIT3_KERNEL_TEMPLATE(VSplit3, schar , int8x16x3_t, vld3q_s8 , vst1q_s8 );
SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16);
SPLIT3_KERNEL_TEMPLATE(VSplit3, short , int16x8x3_t, vld3q_s16, vst1q_s16);
SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32);
SPLIT3_KERNEL_TEMPLATE(VSplit3, float , float32x4x3_t, vld3q_f32, vst1q_f32);
SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 );
SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 );
SPLIT4_KERNEL_TEMPLATE(VSplit4, schar , int8x16x4_t, vld4q_s8 , vst1q_s8 );
SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16);
SPLIT4_KERNEL_TEMPLATE(VSplit4, short , int16x8x4_t, vld4q_s16, vst1q_s16);
SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32);
SPLIT4_KERNEL_TEMPLATE(VSplit4, float , float32x4x4_t, vld4q_f32, vst1q_f32);
SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 );
#endif
template<typename T> static void
split_( const T* src, T** dst, int len, int cn )
{
@ -58,13 +123,34 @@ split_( const T* src, T** dst, int len, int cn )
if( k == 1 )
{
T* dst0 = dst[0];
for( i = j = 0; i < len; i++, j += cn )
dst0[i] = src[j];
if(cn == 1)
{
memcpy(dst0, src, len * sizeof(T));
}
else
{
for( i = 0, j = 0 ; i < len; i++, j += cn )
dst0[i] = src[j];
}
}
else if( k == 2 )
{
T *dst0 = dst[0], *dst1 = dst[1];
for( i = j = 0; i < len; i++, j += cn )
i = j = 0;
#if CV_NEON
if(cn == 2)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 2 * inc_i;
VSplit2<T> vsplit;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i);
}
#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j];
dst1[i] = src[j+1];
@ -73,7 +159,20 @@ split_( const T* src, T** dst, int len, int cn )
else if( k == 3 )
{
T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2];
for( i = j = 0; i < len; i++, j += cn )
i = j = 0;
#if CV_NEON
if(cn == 3)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 3 * inc_i;
VSplit3<T> vsplit;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
}
#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j];
dst1[i] = src[j+1];
@ -83,7 +182,20 @@ split_( const T* src, T** dst, int len, int cn )
else
{
T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3];
for( i = j = 0; i < len; i++, j += cn )
i = j = 0;
#if CV_NEON
if(cn == 4)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 4 * inc_i;
VSplit4<T> vsplit;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
}
#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j]; dst1[i] = src[j+1];
dst2[i] = src[j+2]; dst3[i] = src[j+3];
@ -101,6 +213,77 @@ split_( const T* src, T** dst, int len, int cn )
}
}
#if CV_NEON
template<typename T> struct VMerge2;
template<typename T> struct VMerge3;
template<typename T> struct VMerge4;
#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src0, const data_type* src1, \
data_type* dst){ \
reg_type r; \
r.val[0] = load_func(src0); \
r.val[1] = load_func(src1); \
store_func(dst, r); \
} \
}
#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src0, const data_type* src1, \
const data_type* src2, data_type* dst){ \
reg_type r; \
r.val[0] = load_func(src0); \
r.val[1] = load_func(src1); \
r.val[2] = load_func(src2); \
store_func(dst, r); \
} \
}
#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src0, const data_type* src1, \
const data_type* src2, const data_type* src3, \
data_type* dst){ \
reg_type r; \
r.val[0] = load_func(src0); \
r.val[1] = load_func(src1); \
r.val[2] = load_func(src2); \
r.val[3] = load_func(src3); \
store_func(dst, r); \
} \
}
MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 );
MERGE2_KERNEL_TEMPLATE(VMerge2, schar , int8x16x2_t, vld1q_s8 , vst2q_s8 );
MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16);
MERGE2_KERNEL_TEMPLATE(VMerge2, short , int16x8x2_t, vld1q_s16, vst2q_s16);
MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32);
MERGE2_KERNEL_TEMPLATE(VMerge2, float , float32x4x2_t, vld1q_f32, vst2q_f32);
MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 );
MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 );
MERGE3_KERNEL_TEMPLATE(VMerge3, schar , int8x16x3_t, vld1q_s8 , vst3q_s8 );
MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16);
MERGE3_KERNEL_TEMPLATE(VMerge3, short , int16x8x3_t, vld1q_s16, vst3q_s16);
MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32);
MERGE3_KERNEL_TEMPLATE(VMerge3, float , float32x4x3_t, vld1q_f32, vst3q_f32);
MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 );
MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 );
MERGE4_KERNEL_TEMPLATE(VMerge4, schar , int8x16x4_t, vld1q_s8 , vst4q_s8 );
MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16);
MERGE4_KERNEL_TEMPLATE(VMerge4, short , int16x8x4_t, vld1q_s16, vst4q_s16);
MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32);
MERGE4_KERNEL_TEMPLATE(VMerge4, float , float32x4x4_t, vld1q_f32, vst4q_f32);
MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 );
#endif
template<typename T> static void
merge_( const T** src, T* dst, int len, int cn )
{
@ -115,7 +298,19 @@ merge_( const T** src, T* dst, int len, int cn )
else if( k == 2 )
{
const T *src0 = src[0], *src1 = src[1];
for( i = j = 0; i < len; i++, j += cn )
i = j = 0;
#if CV_NEON
if(cn == 2)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 2 * inc_i;
VMerge2<T> vmerge;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, dst + j);
}
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i];
dst[j+1] = src1[i];
@ -124,7 +319,19 @@ merge_( const T** src, T* dst, int len, int cn )
else if( k == 3 )
{
const T *src0 = src[0], *src1 = src[1], *src2 = src[2];
for( i = j = 0; i < len; i++, j += cn )
i = j = 0;
#if CV_NEON
if(cn == 3)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 3 * inc_i;
VMerge3<T> vmerge;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, dst + j);
}
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i];
dst[j+1] = src1[i];
@ -134,7 +341,19 @@ merge_( const T** src, T* dst, int len, int cn )
else
{
const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
for( i = j = 0; i < len; i++, j += cn )
i = j = 0;
#if CV_NEON
if(cn == 4)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 4 * inc_i;
VMerge4<T> vmerge;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
}
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i]; dst[j+1] = src1[i];
dst[j+2] = src2[i]; dst[j+3] = src3[i];