mirror of
https://github.com/opencv/opencv.git
synced 2024-11-29 05:29:54 +08:00
StereoSGBM.cpp - use SSE2 for pass 2 using MODE_HH
With a test image set of 2800x1400 bytes on an Intel Core i7 5960X, this improves the runtime of MODE_HH by about 10% (this particular replaced code segment is approximately 3 times faster than the non-SSE2 variant). I was able to reduce the runtime by 130 ms with this simple fix. The second part of the SSE2-optimized code could probably be optimized further by using SSE2 shift operations, but I imagine this would improve performance by 10-20 ms at best.
This commit is contained in:
parent
345678770b
commit
551b5d3e1a
@ -758,6 +758,41 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
#if CV_SSE2
|
||||
if( useSIMD )
|
||||
{
|
||||
__m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1);
|
||||
__m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8);
|
||||
|
||||
for( d = 0; d < D; d+= 8 )
|
||||
{
|
||||
__m128i L0 = _mm_load_si128((const __m128i*)( Sp + d ));
|
||||
__m128i mask = _mm_cmplt_epi16( L0, _minS );
|
||||
_minS = _mm_min_epi16( L0, _minS );
|
||||
_bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128( _bestDisp, _d8), mask));
|
||||
_d8 = _mm_adds_epi16(_d8, _8 );
|
||||
}
|
||||
short CV_DECL_ALIGNED(16) bestDispBuf[8];
|
||||
_mm_store_si128((__m128i*)bestDispBuf, _bestDisp);
|
||||
short CV_DECL_ALIGNED(16) minSBuf[8];
|
||||
_mm_store_si128((__m128i*)minSBuf, _minS );
|
||||
|
||||
for( int i = 0; i < 8; i++ )
|
||||
{
|
||||
int Sval = minSBuf[ i ];
|
||||
if( Sval <= minS )
|
||||
{
|
||||
if( ( Sval < minS ) || ( bestDispBuf[i] < bestDisp ) )
|
||||
{
|
||||
bestDisp = bestDispBuf[i];
|
||||
}
|
||||
minS = Sval;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
for( d = 0; d < D; d++ )
|
||||
{
|
||||
@ -769,6 +804,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for( d = 0; d < D; d++ )
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user