Improves findContours using SSE _mm_movemask_epi8 to find next contour point. Cleaned up code a little bit

This commit is contained in:
matze 2016-09-02 20:09:13 +02:00
parent d102ea96c0
commit 8af8c4d0aa

View File

@ -50,6 +50,21 @@
static const CvPoint icvCodeDeltas[8] =
{ CvPoint(1, 0), CvPoint(1, -1), CvPoint(0, -1), CvPoint(-1, -1), CvPoint(-1, 0), CvPoint(-1, 1), CvPoint(0, 1), CvPoint(1, 1) };
inline unsigned int trailingZeros(unsigned int value) {
#if defined(_MSC_VER)
return _tzcnt_u32(value);
#elif defined(_GCC)
return __builtin_ctz(value);
#elif defined(__INTEL_COMPILER)
//return _bit_scan_reverse(value);
#else
static const int MultiplyDeBruijnBitPosition[32] = {
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 };
return MultiplyDeBruijnBitPosition[((uint32_t)((value & -value) * 0x077CB531U)) >> 27];
#endif
}
CV_IMPL void
cvStartReadChainPoints( CvChain * chain, CvChainPtReader * reader )
{
@ -527,10 +542,8 @@ icvFetchContour( schar *ptr,
{
s = (s - 1) & 7;
i1 = i0 + deltas[s];
if( *i1 != 0 )
break;
}
while( s != s_end );
while( *i1 == 0 && s != s_end );
if( s == s_end ) /* single pixel domain */
{
@ -631,10 +644,8 @@ icvTraceContour( schar *ptr, int step, schar *stop_ptr, int is_hole )
{
s = (s - 1) & 7;
i1 = i0 + deltas[s];
if( *i1 != 0 )
break;
}
while( s != s_end );
while( *i1 == 0 && s != s_end );
i3 = i0;
@ -644,7 +655,6 @@ icvTraceContour( schar *ptr, int step, schar *stop_ptr, int is_hole )
/* follow border */
for( ;; )
{
s_end = s;
for( ;; )
{
@ -702,10 +712,8 @@ icvFetchContourEx( schar* ptr,
{
s = (s - 1) & 7;
i1 = i0 + deltas[s];
if( *i1 != 0 )
break;
}
while( s != s_end );
while( *i1 == 0 && s != s_end );
if( s == s_end ) /* single pixel domain */
{
@ -817,10 +825,8 @@ icvTraceContour_32s( int *ptr, int step, int *stop_ptr, int is_hole )
{
s = (s - 1) & 7;
i1 = i0 + deltas[s];
if( (*i1 & value_mask) == ccomp_val )
break;
}
while( s != s_end );
while( (*i1 & value_mask) != ccomp_val && s != s_end );
i3 = i0;
@ -892,10 +898,8 @@ icvFetchContourEx_32s( int* ptr,
{
s = (s - 1) & 7;
i1 = i0 + deltas[s];
if( (*i1 & value_mask) == ccomp_val )
break;
}
while( s != s_end );
while( (*i1 & value_mask) != ccomp_val && s != s_end );
if( s == s_end ) /* single pixel domain */
{
@ -990,6 +994,11 @@ cvFindNextContour( CvContourScanner scanner )
{
if( !scanner )
CV_Error( CV_StsNullPtr, "" );
#if CV_SSE2
bool haveSSE2 = cv::checkHardwareSupport(CPU_SSE2);
#endif
icvEndProcessContour( scanner );
/* initialize local state */
@ -1034,13 +1043,60 @@ cvFindNextContour( CvContourScanner scanner )
}
else
{
#if CV_SSE2
if ((p = img[x]) != prev) {
goto _next_contour;
} else if (haveSSE2) {
__m128i v_prev = _mm_set1_epi8(prev);
int v_size = width - 32;
for (; x <= v_size; x += 32) {
__m128i v_p1 = _mm_loadu_si128((__m128i*)(img + x));
__m128i v_p2 = _mm_loadu_si128((__m128i*)(img + x + 16));
__m128i v_cmp1 = _mm_cmpeq_epi8(v_p1, v_prev);
__m128i v_cmp2 = _mm_cmpeq_epi8(v_p2, v_prev);
unsigned int mask1 = _mm_movemask_epi8(v_cmp1);
unsigned int mask2 = _mm_movemask_epi8(v_cmp2);
mask1 ^= 0x0000ffff;
mask2 ^= 0x0000ffff;
if (mask1) {
p = img[(x += trailingZeros(mask1))];
goto _next_contour;
}
if (mask2) {
p = img[(x += trailingZeros(mask2 << 16))];
goto _next_contour;
}
}
if(x <= width - 16) {
__m128i v_p = _mm_loadu_si128((__m128i*)(img + x));
unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v_p, v_prev)) ^ 0x0000ffff;
if (mask) {
p = img[(x += trailingZeros(mask))];
goto _next_contour;
}
x += 16;
}
}
#endif
for( ; x < width && (p = img[x]) == prev; x++ )
;
}
if( x >= width )
break;
#if CV_SSE2
_next_contour:
#endif
{
_CvContourInfo *par_info = 0;
_CvContourInfo *l_cinfo = 0;
@ -1275,7 +1331,7 @@ cvEndFindContours( CvContourScanner * _scanner )
#define ICV_SINGLE 0
#define ICV_CONNECTING_ABOVE 1
#define ICV_CONNECTING_BELOW -1
#define ICV_IS_COMPONENT_POINT(val) ((val) != 0)
//#define ICV_IS_COMPONENT_POINT(val) ((val) != 0)
#define CV_GET_WRITTEN_ELEM( writer ) ((writer).ptr - (writer).seq->elem_size)
@ -1287,6 +1343,108 @@ typedef struct CvLinkedRunPoint
}
CvLinkedRunPoint;
#if CV_SSE2
inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j, bool haveSSE2) {
#else
inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j) {
#endif
#if CV_SSE2
if (haveSSE2) {
__m128i sseZero = _mm_setzero_si128();
int sizeSse = img_size.width - 32;
for (; j <= sizeSse; j += 32) {
__m128i sseP1 = _mm_loadu_si128((__m128i*)(src_data + j));
__m128i sseP2 = _mm_loadu_si128((__m128i*)(src_data + j + 16));
__m128i sseCmp1 = _mm_cmpeq_epi8(sseP1, sseZero);
__m128i sseCmp2 = _mm_cmpeq_epi8(sseP2, sseZero);
unsigned int mask1 = _mm_movemask_epi8(sseCmp1);
unsigned int mask2 = _mm_movemask_epi8(sseCmp2);
mask1 ^= 0x0000ffff;
mask2 ^= 0x0000ffff;
if (mask1) {
j += trailingZeros(mask1);
return j;
}
if (mask2) {
j += trailingZeros(mask2 << 16);
return j;
}
}
if (j <= img_size.width - 16) {
__m128i sseP = _mm_loadu_si128((__m128i*)(src_data + j));
unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(sseP, sseZero)) ^ 0x0000ffff;
if (mask) {
j += trailingZeros(mask);
return j;
}
j += 16;
}
}
#endif
for (; j < img_size.width && !src_data[j]; ++j)
;
return j;
}
#if CV_SSE2
inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j, bool haveSSE2) {
#else
inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j) {
#endif
#if CV_SSE2
if (!src_data[j]) {
return j;
} else if (haveSSE2) {
__m128i sseZero = _mm_setzero_si128();
int sizeSse = img_size.width - 32;
for (; j <= sizeSse; j += 32) {
__m128i sseP1 = _mm_loadu_si128((__m128i*)(src_data + j));
__m128i sseP2 = _mm_loadu_si128((__m128i*)(src_data + j + 16));
__m128i sseCmp1 = _mm_cmpeq_epi8(sseP1, sseZero);
__m128i sseCmp2 = _mm_cmpeq_epi8(sseP2, sseZero);
unsigned int mask1 = _mm_movemask_epi8(sseCmp1);
unsigned int mask2 = _mm_movemask_epi8(sseCmp2);
if (mask1) {
j += trailingZeros(mask1);
return j;
}
if (mask2) {
j += trailingZeros(mask2 << 16);
return j;
}
}
if (j <= img_size.width - 16) {
__m128i sseP = _mm_loadu_si128((__m128i*)(src_data + j));
unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(sseP, sseZero));
if (mask) {
j += trailingZeros(mask);
return j;
}
j += 16;
}
}
#endif
for (; j < img_size.width && src_data[j]; ++j)
;
return j;
}
static int
icvFindContoursInInterval( const CvArr* src,
@ -1339,7 +1497,9 @@ icvFindContoursInInterval( const CvArr* src,
if( contourHeaderSize < (int)sizeof(CvContour))
CV_Error( CV_StsBadSize, "Contour header size must be >= sizeof(CvContour)" );
#if CV_SSE2
bool haveSSE2 = cv::checkHardwareSupport(CPU_SSE2);
#endif
storage00.reset(cvCreateChildMemStorage(storage));
storage01.reset(cvCreateChildMemStorage(storage));
@ -1372,8 +1532,11 @@ icvFindContoursInInterval( const CvArr* src,
tmp_prev = upper_line;
for( j = 0; j < img_size.width; )
{
for( ; j < img_size.width && !ICV_IS_COMPONENT_POINT(src_data[j]); j++ )
;
#if CV_SSE2
j = findStartContourPoint(src_data, img_size, j, haveSSE2);
#else
j = findStartContourPoint(src_data, img_size, j);
#endif
if( j == img_size.width )
break;
@ -1381,10 +1544,11 @@ icvFindContoursInInterval( const CvArr* src,
CV_WRITE_SEQ_ELEM( tmp, writer );
tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer );
tmp_prev = tmp_prev->next;
for( ; j < img_size.width && ICV_IS_COMPONENT_POINT(src_data[j]); j++ )
;
#if CV_SSE2
j = findEndContourPoint(src_data, img_size, j, haveSSE2);
#else
j = findEndContourPoint(src_data, img_size, j);
#endif
tmp.pt.x = j-1;
CV_WRITE_SEQ_ELEM( tmp, writer );
tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer );
@ -1407,18 +1571,22 @@ icvFindContoursInInterval( const CvArr* src,
all_total = runs->total;
for( j = 0; j < img_size.width; )
{
for( ; j < img_size.width && !ICV_IS_COMPONENT_POINT(src_data[j]); j++ )
;
#if CV_SSE2
j = findStartContourPoint(src_data, img_size, j, haveSSE2);
#else
j = findStartContourPoint(src_data, img_size, j);
#endif
if( j == img_size.width ) break;
tmp.pt.x = j;
CV_WRITE_SEQ_ELEM( tmp, writer );
tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer );
tmp_prev = tmp_prev->next;
for( ; j < img_size.width && ICV_IS_COMPONENT_POINT(src_data[j]); j++ )
;
#if CV_SSE2
j = findEndContourPoint(src_data, img_size, j, haveSSE2);
#else
j = findEndContourPoint(src_data, img_size, j);
#endif
tmp.pt.x = j-1;
CV_WRITE_SEQ_ELEM( tmp, writer );
tmp_prev = tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer );