From fac3d9994c8cb8308ac2919a459abae349ed2f7e Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Tue, 31 Jul 2012 19:07:55 +0400 Subject: [PATCH] integrated another portion of SSE optimizations from Grigory Frolov --- cmake/OpenCVCompilerOptions.cmake | 2 +- .../core/include/opencv2/core/internal.hpp | 12 +- modules/core/src/lapack.cpp | 164 ++++++++--------- modules/imgproc/src/shapedescr.cpp | 172 +++++++++++------- modules/objdetect/src/haar.cpp | 29 ++- 5 files changed, 215 insertions(+), 164 deletions(-) diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index c0d109a260..2cfcbf50cd 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -139,7 +139,7 @@ if(CMAKE_COMPILER_IS_GNUCXX) if(ENABLE_SSSE3) add_extra_compiler_option(-mssse3) endif() - if(HAVE_GCC43_OR_NEWER) + if(HAVE_GCC43_OR_NEWER OR APPLE) if(ENABLE_SSE41) add_extra_compiler_option(-msse4.1) endif() diff --git a/modules/core/include/opencv2/core/internal.hpp b/modules/core/include/opencv2/core/internal.hpp index 2fe56cd7d9..235abcc3c1 100644 --- a/modules/core/include/opencv2/core/internal.hpp +++ b/modules/core/include/opencv2/core/internal.hpp @@ -120,17 +120,23 @@ CV_INLINE IppiSize ippiSize(int width, int height) # else # define CV_SSSE3 0 # endif -# if defined __SSE4_1__ || (defined _MSC_VER && _MSC_VER >= 1600) +# if defined __SSE4_1__ || (defined _MSC_VER && _MSC_VER >= 1500) # include # define CV_SSE4_1 1 +# else +# define CV_SSE4_1 0 # endif -# if defined __SSE4_2__ || (defined _MSC_VER && _MSC_VER >= 1600) +# if defined __SSE4_2__ || (defined _MSC_VER && _MSC_VER >= 1500) # include # define CV_SSE4_2 1 +# else +# define CV_SSE4_2 0 # endif -# if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600) +# if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219) # include # define CV_AVX 1 +# else +# define CV_AVX 0 # endif # else # define CV_SSE 0 diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp index 74c6edd3bc..1c76df6413 100644 --- a/modules/core/src/lapack.cpp +++ b/modules/core/src/lapack.cpp @@ -954,7 +954,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method ) size_t esz = CV_ELEM_SIZE(type); int m = src.rows, n = src.cols; - if( method == DECOMP_SVD ) + if( method == DECOMP_SVD ) { int nm = std::min(m, n); @@ -1010,82 +1010,84 @@ double cv::invert( InputArray _src, OutputArray _dst, int method ) if( type == CV_32FC1 ) { double d = det2(Sf); - #if CV_SSE4_2 - if(USE_SSE4_2) - { - __m128 zero = _mm_setzero_ps(); - __m128 t0 = _mm_loadl_pi(zero, (const __m64*)srcdata); //t0 = sf(0,0) sf(0,1) - __m128 t1 = _mm_loadh_pi(zero,(const __m64*)((const float*)(srcdata+srcstep))); //t1 = sf(1,0) sf(1,1) - __m128 s0 = _mm_blend_ps(t0,t1,12); - d = 1./d; - result = true; - __m128 det =_mm_set1_ps((float)d); - s0 = _mm_mul_ps(s0, det); - const uchar CV_DECL_ALIGNED(16) inv[16] = {0,0,0,0,0,0,0,0x80,0,0,0,0x80,0,0,0,0}; - __m128 pattern = _mm_load_ps((const float*)inv); - s0 = _mm_xor_ps(s0, pattern);//==-1*s0 - s0 = _mm_shuffle_ps(s0, s0, _MM_SHUFFLE(0,2,1,3)); - _mm_storel_pi((__m64*)dstdata, s0); - _mm_storeh_pi((__m64*)((float*)(dstdata+dststep)), s0); - } - #else - if( d != 0. ) + if( d != 0. ) { - double t0, t1; - result = true; - d = 1./d; - t0 = Sf(0,0)*d; - t1 = Sf(1,1)*d; - Df(1,1) = (float)t0; - Df(0,0) = (float)t1; - t0 = -Sf(0,1)*d; - t1 = -Sf(1,0)*d; - Df(0,1) = (float)t0; - Df(1,0) = (float)t1; - } - #endif + result = true; + d = 1./d; + + #if CV_SSE2 + if(USE_SSE2) + { + __m128 zero = _mm_setzero_ps(); + __m128 t0 = _mm_loadl_pi(zero, (const __m64*)srcdata); //t0 = sf(0,0) sf(0,1) + __m128 t1 = _mm_loadh_pi(zero, (const __m64*)(srcdata+srcstep)); //t1 = sf(1,0) sf(1,1) + __m128 s0 = _mm_or_ps(t0, t1); + __m128 det =_mm_set1_ps((float)d); + s0 = _mm_mul_ps(s0, det); + const uchar CV_DECL_ALIGNED(16) inv[16] = {0,0,0,0,0,0,0,0x80,0,0,0,0x80,0,0,0,0}; + __m128 pattern = _mm_load_ps((const float*)inv); + s0 = _mm_xor_ps(s0, pattern);//==-1*s0 + s0 = _mm_shuffle_ps(s0, s0, _MM_SHUFFLE(0,2,1,3)); + _mm_storel_pi((__m64*)dstdata, s0); + _mm_storeh_pi((__m64*)((float*)(dstdata+dststep)), s0); + } + else + #endif + { + double t0, t1; + t0 = Sf(0,0)*d; + t1 = Sf(1,1)*d; + Df(1,1) = (float)t0; + Df(0,0) = (float)t1; + t0 = -Sf(0,1)*d; + t1 = -Sf(1,0)*d; + Df(0,1) = (float)t0; + Df(1,0) = (float)t1; + } + + } } else { double d = det2(Sd); - #if CV_SSE2 - if(USE_SSE2) - { - __m128d s0 = _mm_loadu_pd((const double*)srcdata); //s0 = sf(0,0) sf(0,1) - __m128d s1 = _mm_loadu_pd ((const double*)(srcdata+srcstep));//s1 = sf(1,0) sf(1,1) - __m128d sm = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(1,0)); //sm = sf(0,0) sf(1,1) - main diagonal - __m128d ss = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(0,1)); //sm = sf(0,1) sf(1,0) - secondary diagonal - result = true; - d = 1./d; - __m128d det = _mm_load1_pd((const double*)&d); - sm = _mm_mul_pd(sm, det); - //__m128d pattern = _mm_set1_pd(-1.); - static const uchar CV_DECL_ALIGNED(16) inv[8] = {0,0,0,0,0,0,0,0x80}; - __m128d pattern = _mm_load1_pd((double*)inv); - ss = _mm_mul_pd(ss, det); - ss = _mm_xor_pd(ss, pattern);//==-1*ss - //ss = _mm_mul_pd(ss,pattern); - s0 = _mm_shuffle_pd(sm, ss, _MM_SHUFFLE2(0,1)); - s1 = _mm_shuffle_pd(ss, sm, _MM_SHUFFLE2(0,1)); - _mm_store_pd((double*)dstdata, s0); - _mm_store_pd((double*)(dstdata+dststep), s1); - } - #else - if( d != 0. ) + if( d != 0. ) { - double t0, t1; - result = true; - d = 1./d; - t0 = Sd(0,0)*d; - t1 = Sd(1,1)*d; - Dd(1,1) = t0; - Dd(0,0) = t1; - t0 = -Sd(0,1)*d; - t1 = -Sd(1,0)*d; - Dd(0,1) = t0; - Dd(1,0) = t1; - } - #endif + result = true; + d = 1./d; + #if CV_SSE2 + if(USE_SSE2) + { + __m128d s0 = _mm_loadu_pd((const double*)srcdata); //s0 = sf(0,0) sf(0,1) + __m128d s1 = _mm_loadu_pd ((const double*)(srcdata+srcstep));//s1 = sf(1,0) sf(1,1) + __m128d sm = _mm_unpacklo_pd(s0, _mm_load_sd((const double*)(srcdata+srcstep)+1)); //sm = sf(0,0) sf(1,1) - main diagonal + __m128d ss = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(0,1)); //ss = sf(0,1) sf(1,0) - secondary diagonal + __m128d det = _mm_load1_pd((const double*)&d); + sm = _mm_mul_pd(sm, det); + + uchar CV_DECL_ALIGNED(16) inv[8] = {0,0,0,0,0,0,0,0x80}; + __m128d pattern = _mm_load1_pd((double*)inv); + ss = _mm_mul_pd(ss, det); + ss = _mm_xor_pd(ss, pattern);//==-1*ss + + s0 = _mm_shuffle_pd(sm, ss, _MM_SHUFFLE2(0,1)); + s1 = _mm_shuffle_pd(ss, sm, _MM_SHUFFLE2(0,1)); + _mm_storeu_pd((double*)dstdata, s0); + _mm_storeu_pd((double*)(dstdata+dststep), s1); + } + else + #endif + { + double t0, t1; + t0 = Sd(0,0)*d; + t1 = Sd(1,1)*d; + Dd(1,1) = t0; + Dd(0,0) = t1; + t0 = -Sd(0,1)*d; + t1 = -Sd(1,0)*d; + Dd(0,1) = t0; + Dd(1,0) = t1; + } + } } } else if( n == 3 ) @@ -1095,18 +1097,17 @@ double cv::invert( InputArray _src, OutputArray _dst, int method ) double d = det3(Sf); if( d != 0. ) { - float t[9]; result = true; d = 1./d; - + float t[9]; t[0] = (float)(((double)Sf(1,1) * Sf(2,2) - (double)Sf(1,2) * Sf(2,1)) * d); t[1] = (float)(((double)Sf(0,2) * Sf(2,1) - (double)Sf(0,1) * Sf(2,2)) * d); t[2] = (float)(((double)Sf(0,1) * Sf(1,2) - (double)Sf(0,2) * Sf(1,1)) * d); - + t[3] = (float)(((double)Sf(1,2) * Sf(2,0) - (double)Sf(1,0) * Sf(2,2)) * d); t[4] = (float)(((double)Sf(0,0) * Sf(2,2) - (double)Sf(0,2) * Sf(2,0)) * d); t[5] = (float)(((double)Sf(0,2) * Sf(1,0) - (double)Sf(0,0) * Sf(1,2)) * d); - + t[6] = (float)(((double)Sf(1,0) * Sf(2,1) - (double)Sf(1,1) * Sf(2,0)) * d); t[7] = (float)(((double)Sf(0,1) * Sf(2,0) - (double)Sf(0,0) * Sf(2,1)) * d); t[8] = (float)(((double)Sf(0,0) * Sf(1,1) - (double)Sf(0,1) * Sf(1,0)) * d); @@ -1121,18 +1122,18 @@ double cv::invert( InputArray _src, OutputArray _dst, int method ) double d = det3(Sd); if( d != 0. ) { + result = true; + d = 1./d; double t[9]; - result = true; - d = 1./d; t[0] = (Sd(1,1) * Sd(2,2) - Sd(1,2) * Sd(2,1)) * d; t[1] = (Sd(0,2) * Sd(2,1) - Sd(0,1) * Sd(2,2)) * d; t[2] = (Sd(0,1) * Sd(1,2) - Sd(0,2) * Sd(1,1)) * d; - + t[3] = (Sd(1,2) * Sd(2,0) - Sd(1,0) * Sd(2,2)) * d; t[4] = (Sd(0,0) * Sd(2,2) - Sd(0,2) * Sd(2,0)) * d; t[5] = (Sd(0,2) * Sd(1,0) - Sd(0,0) * Sd(1,2)) * d; - + t[6] = (Sd(1,0) * Sd(2,1) - Sd(1,1) * Sd(2,0)) * d; t[7] = (Sd(0,1) * Sd(2,0) - Sd(0,0) * Sd(2,1)) * d; t[8] = (Sd(0,0) * Sd(1,1) - Sd(0,1) * Sd(1,0)) * d; @@ -1171,7 +1172,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method ) return result; } - int elem_size = CV_ELEM_SIZE(type); + int elem_size = CV_ELEM_SIZE(type); AutoBuffer buf(n*n*elem_size); Mat src1(n, n, type, (uchar*)buf); src.copyTo(src1); @@ -1193,6 +1194,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method ) } + /****************************************************************************************\ * Solving a linear system * \****************************************************************************************/ @@ -1603,7 +1605,7 @@ void SVD::backSubst( InputArray _w, InputArray _u, InputArray _vt, Mat w = _w.getMat(), u = _u.getMat(), vt = _vt.getMat(), rhs = _rhs.getMat(); int type = w.type(), esz = (int)w.elemSize(); int m = u.rows, n = vt.cols, nb = rhs.data ? rhs.cols : m, nm = std::min(m, n); - size_t wstep = w.rows == 1 ? esz : w.cols == 1 ? (size_t)w.step : (size_t)w.step + esz; + size_t wstep = w.rows == 1 ? (size_t)esz : w.cols == 1 ? (size_t)w.step : (size_t)w.step + esz; AutoBuffer buffer(nb*sizeof(double) + 16); CV_Assert( w.type() == u.type() && u.type() == vt.type() && u.data && vt.data && w.data ); CV_Assert( u.cols >= nm && vt.rows >= nm && diff --git a/modules/imgproc/src/shapedescr.cpp b/modules/imgproc/src/shapedescr.cpp index 36c0c6c64f..9a27b9f177 100644 --- a/modules/imgproc/src/shapedescr.cpp +++ b/modules/imgproc/src/shapedescr.cpp @@ -951,9 +951,6 @@ cvBoundingRect( CvArr* array, int update ) if( ptseq->header_size < (int)sizeof(CvContour)) { - /*if( update == 1 ) - CV_Error( CV_StsBadArg, "The header is too small to fit the rectangle, " - "so it could not be updated" );*/ update = 0; calculate = 1; } @@ -1067,86 +1064,123 @@ cvBoundingRect( CvArr* array, int update ) if( xmin >= size.width ) xmin = ymin = 0; - } - else if( ptseq->total ) - { - int is_float = CV_SEQ_ELTYPE(ptseq) == CV_32FC2; - cvStartReadSeq( ptseq, &reader, 0 ); + } + else if( ptseq->total ) + { + int is_float = CV_SEQ_ELTYPE(ptseq) == CV_32FC2; + cvStartReadSeq( ptseq, &reader, 0 ); + CvPoint pt; + CV_READ_SEQ_ELEM( pt, reader ); + #if CV_SSE4_2 + if(cv::checkHardwareSupport(CV_CPU_SSE4_2)) + { + if( !is_float ) + { + __m128i minval, maxval; + minval = maxval = _mm_loadl_epi64((const __m128i*)(&pt)); //min[0]=pt.x, min[1]=pt.y + + for( i = 1; i < ptseq->total; i++) + { + __m128i ptXY = _mm_loadl_epi64((const __m128i*)(reader.ptr)); + CV_NEXT_SEQ_ELEM(sizeof(pt), reader); + minval = _mm_min_epi32(ptXY, minval); + maxval = _mm_max_epi32(ptXY, maxval); + } + xmin = _mm_cvtsi128_si32(minval); + ymin = _mm_cvtsi128_si32(_mm_srli_si128(minval, 4)); + xmax = _mm_cvtsi128_si32(maxval); + ymax = _mm_cvtsi128_si32(_mm_srli_si128(maxval, 4)); + } + else + { + __m128 minvalf, maxvalf, z = _mm_setzero_ps(), ptXY = _mm_setzero_ps(); + minvalf = maxvalf = _mm_loadl_pi(z, (const __m64*)(&pt)); - if( !is_float ) - { - CvPoint pt; - /* init values */ - CV_READ_SEQ_ELEM( pt, reader ); - xmin = xmax = pt.x; - ymin = ymax = pt.y; + for( i = 1; i < ptseq->total; i++ ) + { + ptXY = _mm_loadl_pi(ptXY, (const __m64*)reader.ptr); + CV_NEXT_SEQ_ELEM(sizeof(pt), reader); - for( i = 1; i < ptseq->total; i++ ) - { - CV_READ_SEQ_ELEM( pt, reader ); - - if( xmin > pt.x ) - xmin = pt.x; - - if( xmax < pt.x ) - xmax = pt.x; - - if( ymin > pt.y ) - ymin = pt.y; - - if( ymax < pt.y ) - ymax = pt.y; + minvalf = _mm_min_ps(minvalf, ptXY); + maxvalf = _mm_max_ps(maxvalf, ptXY); + } + + float xyminf[2], xymaxf[2]; + _mm_storel_pi((__m64*)xyminf, minvalf); + _mm_storel_pi((__m64*)xymaxf, maxvalf); + xmin = cvFloor(xyminf[0]); + ymin = cvFloor(xyminf[1]); + xmax = cvFloor(xymaxf[0]); + ymax = cvFloor(xymaxf[1]); } - } + } else - { - CvPoint pt; - Cv32suf v; - /* init values */ - CV_READ_SEQ_ELEM( pt, reader ); - xmin = xmax = CV_TOGGLE_FLT(pt.x); - ymin = ymax = CV_TOGGLE_FLT(pt.y); + #endif + { + if( !is_float ) + { + xmin = xmax = pt.x; + ymin = ymax = pt.y; - for( i = 1; i < ptseq->total; i++ ) - { - CV_READ_SEQ_ELEM( pt, reader ); - pt.x = CV_TOGGLE_FLT(pt.x); - pt.y = CV_TOGGLE_FLT(pt.y); + for( i = 1; i < ptseq->total; i++ ) + { + CV_READ_SEQ_ELEM( pt, reader ); - if( xmin > pt.x ) - xmin = pt.x; + if( xmin > pt.x ) + xmin = pt.x; - if( xmax < pt.x ) - xmax = pt.x; + if( xmax < pt.x ) + xmax = pt.x; - if( ymin > pt.y ) - ymin = pt.y; + if( ymin > pt.y ) + ymin = pt.y; - if( ymax < pt.y ) - ymax = pt.y; - } + if( ymax < pt.y ) + ymax = pt.y; + } + } + else + { + Cv32suf v; + // init values + xmin = xmax = CV_TOGGLE_FLT(pt.x); + ymin = ymax = CV_TOGGLE_FLT(pt.y); - v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f); - v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f); - /* because right and bottom sides of - the bounding rectangle are not inclusive - (note +1 in width and height calculation below), - cvFloor is used here instead of cvCeil */ - v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f); - v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f); - } - } + for( i = 1; i < ptseq->total; i++ ) + { + CV_READ_SEQ_ELEM( pt, reader ); + pt.x = CV_TOGGLE_FLT(pt.x); + pt.y = CV_TOGGLE_FLT(pt.y); - rect.x = xmin; - rect.y = ymin; - rect.width = xmax - xmin + 1; - rect.height = ymax - ymin + 1; + if( xmin > pt.x ) + xmin = pt.x; - if( update ) + if( xmax < pt.x ) + xmax = pt.x; + + if( ymin > pt.y ) + ymin = pt.y; + + if( ymax < pt.y ) + ymax = pt.y; + } + + v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f); + v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f); + // because right and bottom sides of the bounding rectangle are not inclusive + // (note +1 in width and height calculation below), cvFloor is used here instead of cvCeil + v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f); + v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f); + } + } + rect.x = xmin; + rect.y = ymin; + rect.width = xmax - xmin + 1; + rect.height = ymax - ymin + 1; + } + if( update ) ((CvContour*)ptseq)->rect = rect; - return rect; } - /* End of file. */ diff --git a/modules/objdetect/src/haar.cpp b/modules/objdetect/src/haar.cpp index 06e89e6263..983fdcf391 100644 --- a/modules/objdetect/src/haar.cpp +++ b/modules/objdetect/src/haar.cpp @@ -43,19 +43,26 @@ #include "precomp.hpp" #include - -/*#if CV_SSE2 -# if CV_SSE4 || defined __SSE4__ -# include -# else -# define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m)) -# define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m)) +/* +#if CV_SSE2 +# if !CV_SSE4_1 && !CV_SSE4_2 +# define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m)) +# define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m)) # endif -#if defined CV_ICC -# define CV_HAAR_USE_SSE 1 #endif -#endif*/ +#if defined CV_ICC +# if defined CV_AVX +# define CV_HAAR_USE_AVX 1 +# else +# if defined CV_SSE2 || defined CV_SSE4_1 || defined CV_SSE4_2 +# define CV_HAAR_USE_SSE 1 +# else +# define CV_HAAR_NO_SIMD 1 +# endif +# endif +#endif +*/ /* these settings affect the quality of detection: change with care */ #define CV_ADJUST_FEATURES 1 #define CV_ADJUST_WEIGHTS 0 @@ -730,6 +737,7 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, { CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; CvHidHaarTreeNode* node = classifier->node; + #ifndef CV_HAAR_USE_SSE double t = node->threshold*variance_norm_factor; double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; @@ -745,6 +753,7 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, t = _mm_cmpgt_sd(t, sum); stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); #endif + } } else