StereoSGBM.cpp - use SSE2 for pass 2 when using MODE_HH

With a test image set of 2800x1400 bytes on an Intel Core i7 5960X, this improves the runtime of MODE_HH by about 10% (the replaced code segment itself is approximately 3 times faster than the non-SSE2 variant). I was able to reduce the runtime by 130 ms with this simple fix. The second part of the SSE2-optimized code could probably be optimized further by using SSE2 shift operations, but I imagine this would improve performance by 10-20 ms at best.
2024-11-25 11:40:44 +08:00 · 2016-01-08 00:32:52 +01:00 · 2016-01-08 00:32:52 +01:00 · 551b5d3e1a
commit 551b5d3e1a
parent 345678770b
1 changed files with 43 additions and 7 deletions
--- a/modules/calib3d/src/stereosgbm.cpp
+++ b/modules/calib3d/src/stereosgbm.cpp
@@ -759,14 +759,50 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                    }
                    else
                    {
-                        for( d = 0; d < D; d++ )
+                    #if CV_SSE2
+                        if( useSIMD )
                        {
-                            int Sval = Sp[d];
-                            if( Sval < minS )
-                            {
-                                minS = Sval;
-                                bestDisp = d;
-                            }
+                             __m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1);
+                             __m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8);
+
+                             for( d = 0; d < D; d+= 8 )
+                             {
+                                 __m128i L0 = _mm_load_si128((const __m128i*)( Sp + d ));
+                                 __m128i mask = _mm_cmplt_epi16( L0, _minS );
+                                 _minS = _mm_min_epi16( L0, _minS );
+                                 _bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128( _bestDisp, _d8), mask));
+                                 _d8 = _mm_adds_epi16(_d8, _8 );
+                             }
+                             short CV_DECL_ALIGNED(16) bestDispBuf[8];
+                             _mm_store_si128((__m128i*)bestDispBuf, _bestDisp);
+                             short CV_DECL_ALIGNED(16) minSBuf[8];
+                             _mm_store_si128((__m128i*)minSBuf, _minS );
+
+                             for( int i = 0; i < 8; i++ )
+                             {
+                                 int Sval = minSBuf[ i ];
+                                 if( Sval <= minS )
+                                 {
+                                     if( ( Sval < minS ) || ( bestDispBuf[i] < bestDisp ) )
+                                     {
+                                         bestDisp = bestDispBuf[i];
+                                     }
+                                     minS = Sval;
+                                 }
+                             }
+                        }
+                        else
+                    #endif
+                        {
+                           for( d = 0; d < D; d++ )
+                           {
+                               int Sval = Sp[d];
+                               if( Sval < minS )
+                               {
+                                   minS = Sval;
+                                   bestDisp = d;
+                               }
+                           }
                        }
                    }