diff --git a/modules/objdetect/src/haar.avx.cpp b/modules/objdetect/src/haar.avx.cpp new file mode 100644 index 0000000000..23dddfa199 --- /dev/null +++ b/modules/objdetect/src/haar.avx.cpp @@ -0,0 +1,369 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// Intel License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of Intel Corporation may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* Haar features calculation */ + +#include "precomp.hpp" +#include "haar.hpp" + +namespace cv_haar_avx +{ + +// AVX version icvEvalHidHaarClassifier. Process 8 CvHidHaarClassifiers per call. Check AVX support before invocation!! +#if CV_HAAR_USE_AVX +double icvEvalHidHaarClassifierAVX(CvHidHaarClassifier* classifier, + double variance_norm_factor, size_t p_offset) +{ + int CV_DECL_ALIGNED(32) idxV[8] = { 0,0,0,0,0,0,0,0 }; + uchar flags[8] = { 0,0,0,0,0,0,0,0 }; + CvHidHaarTreeNode* nodes[8]; + double res = 0; + uchar exitConditionFlag = 0; + for (;;) + { + float CV_DECL_ALIGNED(32) tmp[8] = { 0,0,0,0,0,0,0,0 }; + nodes[0] = (classifier + 0)->node + idxV[0]; + nodes[1] = (classifier + 1)->node + idxV[1]; + nodes[2] = (classifier + 2)->node + idxV[2]; + nodes[3] = (classifier + 3)->node + idxV[3]; + nodes[4] = (classifier + 4)->node + idxV[4]; + nodes[5] = (classifier + 5)->node + idxV[5]; + nodes[6] = (classifier + 6)->node + idxV[6]; + nodes[7] = (classifier + 7)->node + idxV[7]; + + __m256 t = _mm256_set1_ps(static_cast(variance_norm_factor)); + + t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold, + nodes[6]->threshold, + nodes[5]->threshold, + nodes[4]->threshold, + nodes[3]->threshold, + nodes[2]->threshold, + nodes[1]->threshold, + nodes[0]->threshold)); + + __m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset), + calc_sumf(nodes[6]->feature.rect[0], p_offset), + calc_sumf(nodes[5]->feature.rect[0], p_offset), + calc_sumf(nodes[4]->feature.rect[0], p_offset), + calc_sumf(nodes[3]->feature.rect[0], p_offset), + calc_sumf(nodes[2]->feature.rect[0], p_offset), + calc_sumf(nodes[1]->feature.rect[0], p_offset), + calc_sumf(nodes[0]->feature.rect[0], p_offset)); + + __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, + nodes[6]->feature.rect[0].weight, + nodes[5]->feature.rect[0].weight, + nodes[4]->feature.rect[0].weight, + nodes[3]->feature.rect[0].weight, + nodes[2]->feature.rect[0].weight, + nodes[1]->feature.rect[0].weight, + nodes[0]->feature.rect[0].weight); + + __m256 sum = _mm256_mul_ps(offset, weight); + + offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset), + calc_sumf(nodes[6]->feature.rect[1], p_offset), + calc_sumf(nodes[5]->feature.rect[1], p_offset), + calc_sumf(nodes[4]->feature.rect[1], p_offset), + calc_sumf(nodes[3]->feature.rect[1], p_offset), + calc_sumf(nodes[2]->feature.rect[1], p_offset), + calc_sumf(nodes[1]->feature.rect[1], p_offset), + calc_sumf(nodes[0]->feature.rect[1], p_offset)); + + weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, + nodes[6]->feature.rect[1].weight, + nodes[5]->feature.rect[1].weight, + nodes[4]->feature.rect[1].weight, + nodes[3]->feature.rect[1].weight, + nodes[2]->feature.rect[1].weight, + nodes[1]->feature.rect[1].weight, + nodes[0]->feature.rect[1].weight); + + sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight)); + + if (nodes[0]->feature.rect[2].p0) + tmp[0] = calc_sumf(nodes[0]->feature.rect[2], p_offset) * nodes[0]->feature.rect[2].weight; + if (nodes[1]->feature.rect[2].p0) + tmp[1] = calc_sumf(nodes[1]->feature.rect[2], p_offset) * nodes[1]->feature.rect[2].weight; + if (nodes[2]->feature.rect[2].p0) + tmp[2] = calc_sumf(nodes[2]->feature.rect[2], p_offset) * nodes[2]->feature.rect[2].weight; + if (nodes[3]->feature.rect[2].p0) + tmp[3] = calc_sumf(nodes[3]->feature.rect[2], p_offset) * nodes[3]->feature.rect[2].weight; + if (nodes[4]->feature.rect[2].p0) + tmp[4] = calc_sumf(nodes[4]->feature.rect[2], p_offset) * nodes[4]->feature.rect[2].weight; + if (nodes[5]->feature.rect[2].p0) + tmp[5] = calc_sumf(nodes[5]->feature.rect[2], p_offset) * nodes[5]->feature.rect[2].weight; + if (nodes[6]->feature.rect[2].p0) + tmp[6] = calc_sumf(nodes[6]->feature.rect[2], p_offset) * nodes[6]->feature.rect[2].weight; + if (nodes[7]->feature.rect[2].p0) + tmp[7] = calc_sumf(nodes[7]->feature.rect[2], p_offset) * nodes[7]->feature.rect[2].weight; + + sum = _mm256_add_ps(sum, _mm256_load_ps(tmp)); + + __m256 left = _mm256_set_ps(static_cast(nodes[7]->left), static_cast(nodes[6]->left), + static_cast(nodes[5]->left), static_cast(nodes[4]->left), + static_cast(nodes[3]->left), static_cast(nodes[2]->left), + static_cast(nodes[1]->left), static_cast(nodes[0]->left)); + __m256 right = _mm256_set_ps(static_cast(nodes[7]->right), static_cast(nodes[6]->right), + static_cast(nodes[5]->right), static_cast(nodes[4]->right), + static_cast(nodes[3]->right), static_cast(nodes[2]->right), + static_cast(nodes[1]->right), static_cast(nodes[0]->right)); + + _mm256_store_si256((__m256i*)idxV, _mm256_cvttps_epi32(_mm256_blendv_ps(right, left, _mm256_cmp_ps(sum, t, _CMP_LT_OQ)))); + + for (int i = 0; i < 8; i++) + { + if (idxV[i] <= 0) + { + if (!flags[i]) + { + exitConditionFlag++; + flags[i] = 1; + res += (classifier + i)->alpha[-idxV[i]]; + } + idxV[i] = 0; + } + } + if (exitConditionFlag == 8) + return res; + } +} + +double icvEvalHidHaarStumpClassifierAVX(CvHidHaarClassifier* classifier, + double variance_norm_factor, size_t p_offset) +{ + float CV_DECL_ALIGNED(32) tmp[8] = { 0,0,0,0,0,0,0,0 }; + CvHidHaarTreeNode* nodes[8]; + + nodes[0] = classifier[0].node; + nodes[1] = classifier[1].node; + nodes[2] = classifier[2].node; + nodes[3] = classifier[3].node; + nodes[4] = classifier[4].node; + nodes[5] = classifier[5].node; + nodes[6] = classifier[6].node; + nodes[7] = classifier[7].node; + + __m256 t = _mm256_set1_ps(static_cast(variance_norm_factor)); + + t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold, + nodes[6]->threshold, + nodes[5]->threshold, + nodes[4]->threshold, + nodes[3]->threshold, + nodes[2]->threshold, + nodes[1]->threshold, + nodes[0]->threshold)); + + __m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset), + calc_sumf(nodes[6]->feature.rect[0], p_offset), + calc_sumf(nodes[5]->feature.rect[0], p_offset), + calc_sumf(nodes[4]->feature.rect[0], p_offset), + calc_sumf(nodes[3]->feature.rect[0], p_offset), + calc_sumf(nodes[2]->feature.rect[0], p_offset), + calc_sumf(nodes[1]->feature.rect[0], p_offset), + calc_sumf(nodes[0]->feature.rect[0], p_offset)); + + __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, + nodes[6]->feature.rect[0].weight, + nodes[5]->feature.rect[0].weight, + nodes[4]->feature.rect[0].weight, + nodes[3]->feature.rect[0].weight, + nodes[2]->feature.rect[0].weight, + nodes[1]->feature.rect[0].weight, + nodes[0]->feature.rect[0].weight); + + __m256 sum = _mm256_mul_ps(offset, weight); + + offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset), + calc_sumf(nodes[6]->feature.rect[1], p_offset), + calc_sumf(nodes[5]->feature.rect[1], p_offset), + calc_sumf(nodes[4]->feature.rect[1], p_offset), + calc_sumf(nodes[3]->feature.rect[1], p_offset), + calc_sumf(nodes[2]->feature.rect[1], p_offset), + calc_sumf(nodes[1]->feature.rect[1], p_offset), + calc_sumf(nodes[0]->feature.rect[1], p_offset)); + + weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, + nodes[6]->feature.rect[1].weight, + nodes[5]->feature.rect[1].weight, + nodes[4]->feature.rect[1].weight, + nodes[3]->feature.rect[1].weight, + nodes[2]->feature.rect[1].weight, + nodes[1]->feature.rect[1].weight, + nodes[0]->feature.rect[1].weight); + + sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight)); + + if (nodes[0]->feature.rect[2].p0) + tmp[0] = calc_sumf(nodes[0]->feature.rect[2], p_offset) * nodes[0]->feature.rect[2].weight; + if (nodes[1]->feature.rect[2].p0) + tmp[1] = calc_sumf(nodes[1]->feature.rect[2], p_offset) * nodes[1]->feature.rect[2].weight; + if (nodes[2]->feature.rect[2].p0) + tmp[2] = calc_sumf(nodes[2]->feature.rect[2], p_offset) * nodes[2]->feature.rect[2].weight; + if (nodes[3]->feature.rect[2].p0) + tmp[3] = calc_sumf(nodes[3]->feature.rect[2], p_offset) * nodes[3]->feature.rect[2].weight; + if (nodes[4]->feature.rect[2].p0) + tmp[4] = calc_sumf(nodes[4]->feature.rect[2], p_offset) * nodes[4]->feature.rect[2].weight; + if (nodes[5]->feature.rect[2].p0) + tmp[5] = calc_sumf(nodes[5]->feature.rect[2], p_offset) * nodes[5]->feature.rect[2].weight; + if (nodes[6]->feature.rect[2].p0) + tmp[6] = calc_sumf(nodes[6]->feature.rect[2], p_offset) * nodes[6]->feature.rect[2].weight; + if (nodes[7]->feature.rect[2].p0) + tmp[7] = calc_sumf(nodes[7]->feature.rect[2], p_offset) * nodes[7]->feature.rect[2].weight; + + sum = _mm256_add_ps(sum, _mm256_load_ps(tmp)); + + __m256 alpha0 = _mm256_set_ps(classifier[7].alpha[0], + classifier[6].alpha[0], + classifier[5].alpha[0], + classifier[4].alpha[0], + classifier[3].alpha[0], + classifier[2].alpha[0], + classifier[1].alpha[0], + classifier[0].alpha[0]); + __m256 alpha1 = _mm256_set_ps(classifier[7].alpha[1], + classifier[6].alpha[1], + classifier[5].alpha[1], + classifier[4].alpha[1], + classifier[3].alpha[1], + classifier[2].alpha[1], + classifier[1].alpha[1], + classifier[0].alpha[1]); + + __m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ)); + outBuf = _mm256_hadd_ps(outBuf, outBuf); + outBuf = _mm256_hadd_ps(outBuf, outBuf); + _mm256_store_ps(tmp, outBuf); + return (tmp[0] + tmp[4]); +} + +double icvEvalHidHaarStumpClassifierTwoRectAVX(CvHidHaarClassifier* classifier, + double variance_norm_factor, size_t p_offset) +{ + float CV_DECL_ALIGNED(32) buf[8]; + CvHidHaarTreeNode* nodes[8]; + nodes[0] = classifier[0].node; + nodes[1] = classifier[1].node; + nodes[2] = classifier[2].node; + nodes[3] = classifier[3].node; + nodes[4] = classifier[4].node; + nodes[5] = classifier[5].node; + nodes[6] = classifier[6].node; + nodes[7] = classifier[7].node; + + __m256 t = _mm256_set1_ps(static_cast(variance_norm_factor)); + t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold, + nodes[6]->threshold, + nodes[5]->threshold, + nodes[4]->threshold, + nodes[3]->threshold, + nodes[2]->threshold, + nodes[1]->threshold, + nodes[0]->threshold)); + + __m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset), + calc_sumf(nodes[6]->feature.rect[0], p_offset), + calc_sumf(nodes[5]->feature.rect[0], p_offset), + calc_sumf(nodes[4]->feature.rect[0], p_offset), + calc_sumf(nodes[3]->feature.rect[0], p_offset), + calc_sumf(nodes[2]->feature.rect[0], p_offset), + calc_sumf(nodes[1]->feature.rect[0], p_offset), + calc_sumf(nodes[0]->feature.rect[0], p_offset)); + + __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, + nodes[6]->feature.rect[0].weight, + nodes[5]->feature.rect[0].weight, + nodes[4]->feature.rect[0].weight, + nodes[3]->feature.rect[0].weight, + nodes[2]->feature.rect[0].weight, + nodes[1]->feature.rect[0].weight, + nodes[0]->feature.rect[0].weight); + + __m256 sum = _mm256_mul_ps(offset, weight); + + offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset), + calc_sumf(nodes[6]->feature.rect[1], p_offset), + calc_sumf(nodes[5]->feature.rect[1], p_offset), + calc_sumf(nodes[4]->feature.rect[1], p_offset), + calc_sumf(nodes[3]->feature.rect[1], p_offset), + calc_sumf(nodes[2]->feature.rect[1], p_offset), + calc_sumf(nodes[1]->feature.rect[1], p_offset), + calc_sumf(nodes[0]->feature.rect[1], p_offset)); + + weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, + nodes[6]->feature.rect[1].weight, + nodes[5]->feature.rect[1].weight, + nodes[4]->feature.rect[1].weight, + nodes[3]->feature.rect[1].weight, + nodes[2]->feature.rect[1].weight, + nodes[1]->feature.rect[1].weight, + nodes[0]->feature.rect[1].weight); + + sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight)); + + __m256 alpha0 = _mm256_set_ps(classifier[7].alpha[0], + classifier[6].alpha[0], + classifier[5].alpha[0], + classifier[4].alpha[0], + classifier[3].alpha[0], + classifier[2].alpha[0], + classifier[1].alpha[0], + classifier[0].alpha[0]); + __m256 alpha1 = _mm256_set_ps(classifier[7].alpha[1], + classifier[6].alpha[1], + classifier[5].alpha[1], + classifier[4].alpha[1], + classifier[3].alpha[1], + classifier[2].alpha[1], + classifier[1].alpha[1], + classifier[0].alpha[1]); + + _mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ))); + return (buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7]); +} + +#endif //CV_HAAR_USE_AVX + +} + +/* End of file. */ diff --git a/modules/objdetect/src/haar.cpp b/modules/objdetect/src/haar.cpp index 6b1c8b8ebf..eea71c4424 100644 --- a/modules/objdetect/src/haar.cpp +++ b/modules/objdetect/src/haar.cpp @@ -45,6 +45,10 @@ #include "opencv2/imgproc/imgproc_c.h" #include "opencv2/objdetect/objdetect_c.h" #include +#include "haar.hpp" +#if CV_HAAR_FEATURE_MAX_LOCAL != CV_HAAR_FEATURE_MAX + #error CV_HAAR_FEATURE_MAX definition changed. Adjust CV_HAAR_FEATURE_MAX_LOCAL value please. +#endif #if CV_SSE2 # if 1 /*!CV_SSE4_1 && !CV_SSE4_2*/ @@ -53,8 +57,7 @@ # endif #endif -#if 0 /*CV_AVX*/ -# define CV_HAAR_USE_AVX 1 +#if CV_HAAR_USE_AVX # if defined _MSC_VER # pragma warning( disable : 4752 ) # endif @@ -68,38 +71,6 @@ #define CV_ADJUST_FEATURES 1 #define CV_ADJUST_WEIGHTS 0 -typedef int sumtype; -typedef double sqsumtype; - -typedef struct CvHidHaarFeature -{ - struct - { - sumtype *p0, *p1, *p2, *p3; - float weight; - } - rect[CV_HAAR_FEATURE_MAX]; -} CvHidHaarFeature; - - -typedef struct CvHidHaarTreeNode -{ - CvHidHaarFeature feature; - float threshold; - int left; - int right; -} CvHidHaarTreeNode; - - -typedef struct CvHidHaarClassifier -{ - int count; - //CvHaarFeature* orig_feature; - CvHidHaarTreeNode* node; - float* alpha; -} CvHidHaarClassifier; - - typedef struct CvHidHaarStageClassifier { int count; @@ -420,10 +391,6 @@ icvCreateHidHaarClassifierCascade( CvHaarClassifierCascade* cascade ) #define calc_sum(rect,offset) \ ((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset]) -#define calc_sumf(rect,offset) \ - static_cast((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset]) - - CV_IMPL void cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade, const CvArr* _sum, @@ -640,129 +607,6 @@ cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade, } -// AVX version icvEvalHidHaarClassifier. Process 8 CvHidHaarClassifiers per call. Check AVX support before invocation!! -#ifdef CV_HAAR_USE_AVX -CV_INLINE -double icvEvalHidHaarClassifierAVX( CvHidHaarClassifier* classifier, - double variance_norm_factor, size_t p_offset ) -{ - int CV_DECL_ALIGNED(32) idxV[8] = {0,0,0,0,0,0,0,0}; - uchar flags[8] = {0,0,0,0,0,0,0,0}; - CvHidHaarTreeNode* nodes[8]; - double res = 0; - uchar exitConditionFlag = 0; - for(;;) - { - float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0}; - nodes[0] = (classifier+0)->node + idxV[0]; - nodes[1] = (classifier+1)->node + idxV[1]; - nodes[2] = (classifier+2)->node + idxV[2]; - nodes[3] = (classifier+3)->node + idxV[3]; - nodes[4] = (classifier+4)->node + idxV[4]; - nodes[5] = (classifier+5)->node + idxV[5]; - nodes[6] = (classifier+6)->node + idxV[6]; - nodes[7] = (classifier+7)->node + idxV[7]; - - __m256 t = _mm256_set1_ps(static_cast(variance_norm_factor)); - - t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold, - nodes[6]->threshold, - nodes[5]->threshold, - nodes[4]->threshold, - nodes[3]->threshold, - nodes[2]->threshold, - nodes[1]->threshold, - nodes[0]->threshold)); - - __m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset), - calc_sumf(nodes[6]->feature.rect[0], p_offset), - calc_sumf(nodes[5]->feature.rect[0], p_offset), - calc_sumf(nodes[4]->feature.rect[0], p_offset), - calc_sumf(nodes[3]->feature.rect[0], p_offset), - calc_sumf(nodes[2]->feature.rect[0], p_offset), - calc_sumf(nodes[1]->feature.rect[0], p_offset), - calc_sumf(nodes[0]->feature.rect[0], p_offset)); - - __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, - nodes[6]->feature.rect[0].weight, - nodes[5]->feature.rect[0].weight, - nodes[4]->feature.rect[0].weight, - nodes[3]->feature.rect[0].weight, - nodes[2]->feature.rect[0].weight, - nodes[1]->feature.rect[0].weight, - nodes[0]->feature.rect[0].weight); - - __m256 sum = _mm256_mul_ps(offset, weight); - - offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset), - calc_sumf(nodes[6]->feature.rect[1], p_offset), - calc_sumf(nodes[5]->feature.rect[1], p_offset), - calc_sumf(nodes[4]->feature.rect[1], p_offset), - calc_sumf(nodes[3]->feature.rect[1], p_offset), - calc_sumf(nodes[2]->feature.rect[1], p_offset), - calc_sumf(nodes[1]->feature.rect[1], p_offset), - calc_sumf(nodes[0]->feature.rect[1], p_offset)); - - weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, - nodes[6]->feature.rect[1].weight, - nodes[5]->feature.rect[1].weight, - nodes[4]->feature.rect[1].weight, - nodes[3]->feature.rect[1].weight, - nodes[2]->feature.rect[1].weight, - nodes[1]->feature.rect[1].weight, - nodes[0]->feature.rect[1].weight); - - sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight)); - - if( nodes[0]->feature.rect[2].p0 ) - tmp[0] = calc_sumf(nodes[0]->feature.rect[2], p_offset) * nodes[0]->feature.rect[2].weight; - if( nodes[1]->feature.rect[2].p0 ) - tmp[1] = calc_sumf(nodes[1]->feature.rect[2], p_offset) * nodes[1]->feature.rect[2].weight; - if( nodes[2]->feature.rect[2].p0 ) - tmp[2] = calc_sumf(nodes[2]->feature.rect[2], p_offset) * nodes[2]->feature.rect[2].weight; - if( nodes[3]->feature.rect[2].p0 ) - tmp[3] = calc_sumf(nodes[3]->feature.rect[2], p_offset) * nodes[3]->feature.rect[2].weight; - if( nodes[4]->feature.rect[2].p0 ) - tmp[4] = calc_sumf(nodes[4]->feature.rect[2], p_offset) * nodes[4]->feature.rect[2].weight; - if( nodes[5]->feature.rect[2].p0 ) - tmp[5] = calc_sumf(nodes[5]->feature.rect[2], p_offset) * nodes[5]->feature.rect[2].weight; - if( nodes[6]->feature.rect[2].p0 ) - tmp[6] = calc_sumf(nodes[6]->feature.rect[2], p_offset) * nodes[6]->feature.rect[2].weight; - if( nodes[7]->feature.rect[2].p0 ) - tmp[7] = calc_sumf(nodes[7]->feature.rect[2], p_offset) * nodes[7]->feature.rect[2].weight; - - sum = _mm256_add_ps(sum,_mm256_load_ps(tmp)); - - __m256 left = _mm256_set_ps(static_cast(nodes[7]->left), static_cast(nodes[6]->left), - static_cast(nodes[5]->left), static_cast(nodes[4]->left), - static_cast(nodes[3]->left), static_cast(nodes[2]->left), - static_cast(nodes[1]->left), static_cast(nodes[0]->left)); - __m256 right = _mm256_set_ps(static_cast(nodes[7]->right),static_cast(nodes[6]->right), - static_cast(nodes[5]->right),static_cast(nodes[4]->right), - static_cast(nodes[3]->right),static_cast(nodes[2]->right), - static_cast(nodes[1]->right),static_cast(nodes[0]->right)); - - _mm256_store_si256((__m256i*)idxV, _mm256_cvttps_epi32(_mm256_blendv_ps(right, left, _mm256_cmp_ps(sum, t, _CMP_LT_OQ)))); - - for(int i = 0; i < 8; i++) - { - if(idxV[i]<=0) - { - if(!flags[i]) - { - exitConditionFlag++; - flags[i] = 1; - res += (classifier+i)->alpha[-idxV[i]]; - } - idxV[i]=0; - } - } - if(exitConditionFlag == 8) - return res; - } -} -#endif //CV_HAAR_USE_AVX - CV_INLINE double icvEvalHidHaarClassifier( CvHidHaarClassifier* classifier, double variance_norm_factor, @@ -823,8 +667,8 @@ static int cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, CvPoint pt, double& stage_sum, int start_stage ) { -#ifdef CV_HAAR_USE_AVX - bool haveAVX = cv::checkHardwareSupport(CV_CPU_AVX); +#if CV_HAAR_USE_AVX + bool haveAVX = CV_CPU_HAS_SUPPORT_AVX; #else # ifdef CV_HAAR_USE_SSE bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2); @@ -870,14 +714,14 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, stage_sum = 0.0; j = 0; -#ifdef CV_HAAR_USE_AVX +#if CV_HAAR_USE_AVX if(haveAVX) { for( ; j <= ptr->count - 8; j += 8 ) { - stage_sum += icvEvalHidHaarClassifierAVX( - ptr->classifier + j, - variance_norm_factor, p_offset ); + stage_sum += cv_haar_avx::icvEvalHidHaarClassifierAVX( + ptr->classifier + j, + variance_norm_factor, p_offset ); } } #endif @@ -901,106 +745,20 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, } else if( cascade->isStumpBased ) { -#ifdef CV_HAAR_USE_AVX +#if CV_HAAR_USE_AVX if(haveAVX) { - CvHidHaarClassifier* classifiers[8]; - CvHidHaarTreeNode* nodes[8]; for( i = start_stage; i < cascade->count; i++ ) { stage_sum = 0.0; j = 0; - float CV_DECL_ALIGNED(32) buf[8]; if( cascade->stage_classifier[i].two_rects ) { for( ; j <= cascade->stage_classifier[i].count - 8; j += 8 ) { - classifiers[0] = cascade->stage_classifier[i].classifier + j; - nodes[0] = classifiers[0]->node; - classifiers[1] = cascade->stage_classifier[i].classifier + j + 1; - nodes[1] = classifiers[1]->node; - classifiers[2] = cascade->stage_classifier[i].classifier + j + 2; - nodes[2] = classifiers[2]->node; - classifiers[3] = cascade->stage_classifier[i].classifier + j + 3; - nodes[3] = classifiers[3]->node; - classifiers[4] = cascade->stage_classifier[i].classifier + j + 4; - nodes[4] = classifiers[4]->node; - classifiers[5] = cascade->stage_classifier[i].classifier + j + 5; - nodes[5] = classifiers[5]->node; - classifiers[6] = cascade->stage_classifier[i].classifier + j + 6; - nodes[6] = classifiers[6]->node; - classifiers[7] = cascade->stage_classifier[i].classifier + j + 7; - nodes[7] = classifiers[7]->node; - - __m256 t = _mm256_set1_ps(static_cast(variance_norm_factor)); - t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold, - nodes[6]->threshold, - nodes[5]->threshold, - nodes[4]->threshold, - nodes[3]->threshold, - nodes[2]->threshold, - nodes[1]->threshold, - nodes[0]->threshold)); - - __m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset), - calc_sumf(nodes[6]->feature.rect[0], p_offset), - calc_sumf(nodes[5]->feature.rect[0], p_offset), - calc_sumf(nodes[4]->feature.rect[0], p_offset), - calc_sumf(nodes[3]->feature.rect[0], p_offset), - calc_sumf(nodes[2]->feature.rect[0], p_offset), - calc_sumf(nodes[1]->feature.rect[0], p_offset), - calc_sumf(nodes[0]->feature.rect[0], p_offset)); - - __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, - nodes[6]->feature.rect[0].weight, - nodes[5]->feature.rect[0].weight, - nodes[4]->feature.rect[0].weight, - nodes[3]->feature.rect[0].weight, - nodes[2]->feature.rect[0].weight, - nodes[1]->feature.rect[0].weight, - nodes[0]->feature.rect[0].weight); - - __m256 sum = _mm256_mul_ps(offset, weight); - - offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset), - calc_sumf(nodes[6]->feature.rect[1], p_offset), - calc_sumf(nodes[5]->feature.rect[1], p_offset), - calc_sumf(nodes[4]->feature.rect[1], p_offset), - calc_sumf(nodes[3]->feature.rect[1], p_offset), - calc_sumf(nodes[2]->feature.rect[1], p_offset), - calc_sumf(nodes[1]->feature.rect[1], p_offset), - calc_sumf(nodes[0]->feature.rect[1], p_offset)); - - weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, - nodes[6]->feature.rect[1].weight, - nodes[5]->feature.rect[1].weight, - nodes[4]->feature.rect[1].weight, - nodes[3]->feature.rect[1].weight, - nodes[2]->feature.rect[1].weight, - nodes[1]->feature.rect[1].weight, - nodes[0]->feature.rect[1].weight); - - sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight)); - - __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0], - classifiers[6]->alpha[0], - classifiers[5]->alpha[0], - classifiers[4]->alpha[0], - classifiers[3]->alpha[0], - classifiers[2]->alpha[0], - classifiers[1]->alpha[0], - classifiers[0]->alpha[0]); - __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1], - classifiers[6]->alpha[1], - classifiers[5]->alpha[1], - classifiers[4]->alpha[1], - classifiers[3]->alpha[1], - classifiers[2]->alpha[1], - classifiers[1]->alpha[1], - classifiers[0]->alpha[1]); - - _mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ))); - stage_sum += (buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]); + stage_sum += cv_haar_avx::icvEvalHidHaarStumpClassifierTwoRectAVX( + cascade->stage_classifier[i].classifier + j, + variance_norm_factor, p_offset); } for( ; j < cascade->stage_classifier[i].count; j++ ) @@ -1018,117 +776,9 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, { for( ; j <= (cascade->stage_classifier[i].count)-8; j+=8 ) { - float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0}; - - classifiers[0] = cascade->stage_classifier[i].classifier + j; - nodes[0] = classifiers[0]->node; - classifiers[1] = cascade->stage_classifier[i].classifier + j + 1; - nodes[1] = classifiers[1]->node; - classifiers[2] = cascade->stage_classifier[i].classifier + j + 2; - nodes[2] = classifiers[2]->node; - classifiers[3] = cascade->stage_classifier[i].classifier + j + 3; - nodes[3] = classifiers[3]->node; - classifiers[4] = cascade->stage_classifier[i].classifier + j + 4; - nodes[4] = classifiers[4]->node; - classifiers[5] = cascade->stage_classifier[i].classifier + j + 5; - nodes[5] = classifiers[5]->node; - classifiers[6] = cascade->stage_classifier[i].classifier + j + 6; - nodes[6] = classifiers[6]->node; - classifiers[7] = cascade->stage_classifier[i].classifier + j + 7; - nodes[7] = classifiers[7]->node; - - __m256 t = _mm256_set1_ps(static_cast(variance_norm_factor)); - - t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold, - nodes[6]->threshold, - nodes[5]->threshold, - nodes[4]->threshold, - nodes[3]->threshold, - nodes[2]->threshold, - nodes[1]->threshold, - nodes[0]->threshold)); - - __m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset), - calc_sumf(nodes[6]->feature.rect[0], p_offset), - calc_sumf(nodes[5]->feature.rect[0], p_offset), - calc_sumf(nodes[4]->feature.rect[0], p_offset), - calc_sumf(nodes[3]->feature.rect[0], p_offset), - calc_sumf(nodes[2]->feature.rect[0], p_offset), - calc_sumf(nodes[1]->feature.rect[0], p_offset), - calc_sumf(nodes[0]->feature.rect[0], p_offset)); - - __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, - nodes[6]->feature.rect[0].weight, - nodes[5]->feature.rect[0].weight, - nodes[4]->feature.rect[0].weight, - nodes[3]->feature.rect[0].weight, - nodes[2]->feature.rect[0].weight, - nodes[1]->feature.rect[0].weight, - nodes[0]->feature.rect[0].weight); - - __m256 sum = _mm256_mul_ps(offset, weight); - - offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset), - calc_sumf(nodes[6]->feature.rect[1], p_offset), - calc_sumf(nodes[5]->feature.rect[1], p_offset), - calc_sumf(nodes[4]->feature.rect[1], p_offset), - calc_sumf(nodes[3]->feature.rect[1], p_offset), - calc_sumf(nodes[2]->feature.rect[1], p_offset), - calc_sumf(nodes[1]->feature.rect[1], p_offset), - calc_sumf(nodes[0]->feature.rect[1], p_offset)); - - weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, - nodes[6]->feature.rect[1].weight, - nodes[5]->feature.rect[1].weight, - nodes[4]->feature.rect[1].weight, - nodes[3]->feature.rect[1].weight, - nodes[2]->feature.rect[1].weight, - nodes[1]->feature.rect[1].weight, - nodes[0]->feature.rect[1].weight); - - sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight)); - - if( nodes[0]->feature.rect[2].p0 ) - tmp[0] = calc_sumf(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight; - if( nodes[1]->feature.rect[2].p0 ) - tmp[1] = calc_sumf(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight; - if( nodes[2]->feature.rect[2].p0 ) - tmp[2] = calc_sumf(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight; - if( nodes[3]->feature.rect[2].p0 ) - tmp[3] = calc_sumf(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight; - if( nodes[4]->feature.rect[2].p0 ) - tmp[4] = calc_sumf(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight; - if( nodes[5]->feature.rect[2].p0 ) - tmp[5] = calc_sumf(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight; - if( nodes[6]->feature.rect[2].p0 ) - tmp[6] = calc_sumf(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight; - if( nodes[7]->feature.rect[2].p0 ) - tmp[7] = calc_sumf(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight; - - sum = _mm256_add_ps(sum, _mm256_load_ps(tmp)); - - __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0], - classifiers[6]->alpha[0], - classifiers[5]->alpha[0], - classifiers[4]->alpha[0], - classifiers[3]->alpha[0], - classifiers[2]->alpha[0], - classifiers[1]->alpha[0], - classifiers[0]->alpha[0]); - __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1], - classifiers[6]->alpha[1], - classifiers[5]->alpha[1], - classifiers[4]->alpha[1], - classifiers[3]->alpha[1], - classifiers[2]->alpha[1], - classifiers[1]->alpha[1], - classifiers[0]->alpha[1]); - - __m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ )); - outBuf = _mm256_hadd_ps(outBuf, outBuf); - outBuf = _mm256_hadd_ps(outBuf, outBuf); - _mm256_store_ps(buf, outBuf); - stage_sum += (buf[0] + buf[4]); + stage_sum += cv_haar_avx::icvEvalHidHaarStumpClassifierAVX( + cascade->stage_classifier[i].classifier + j, + variance_norm_factor, p_offset); } for( ; j < cascade->stage_classifier[i].count; j++ ) @@ -1241,14 +891,14 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, stage_sum = 0.0; int k = 0; -#ifdef CV_HAAR_USE_AVX +#if CV_HAAR_USE_AVX if(haveAVX) { for( ; k < cascade->stage_classifier[i].count - 8; k += 8 ) { - stage_sum += icvEvalHidHaarClassifierAVX( - cascade->stage_classifier[i].classifier + k, - variance_norm_factor, p_offset ); + stage_sum += cv_haar_avx::icvEvalHidHaarClassifierAVX( + cascade->stage_classifier[i].classifier + k, + variance_norm_factor, p_offset ); } } #endif diff --git a/modules/objdetect/src/haar.hpp b/modules/objdetect/src/haar.hpp new file mode 100644 index 0000000000..72a0af451d --- /dev/null +++ b/modules/objdetect/src/haar.hpp @@ -0,0 +1,101 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// Intel License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of Intel Corporation may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* Haar features calculation */ + +#ifndef OPENCV_OBJDETECT_HAAR_HPP +#define OPENCV_OBJDETECT_HAAR_HPP + +#define CV_HAAR_FEATURE_MAX_LOCAL 3 + +typedef int sumtype; +typedef double sqsumtype; + +typedef struct CvHidHaarFeature +{ + struct + { + sumtype *p0, *p1, *p2, *p3; + float weight; + } + rect[CV_HAAR_FEATURE_MAX_LOCAL]; +} CvHidHaarFeature; + + +typedef struct CvHidHaarTreeNode +{ + CvHidHaarFeature feature; + float threshold; + int left; + int right; +} CvHidHaarTreeNode; + + +typedef struct CvHidHaarClassifier +{ + int count; + //CvHaarFeature* orig_feature; + CvHidHaarTreeNode* node; + float* alpha; +} CvHidHaarClassifier; + +#define calc_sumf(rect,offset) \ + static_cast((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset]) + +namespace cv_haar_avx +{ +#if 0 /*CV_TRY_AVX*/ + #define CV_HAAR_USE_AVX 1 +#else + #define CV_HAAR_USE_AVX 0 +#endif + +#if CV_HAAR_USE_AVX + // AVX version icvEvalHidHaarClassifier. Process 8 CvHidHaarClassifiers per call. Check AVX support before invocation!! + double icvEvalHidHaarClassifierAVX(CvHidHaarClassifier* classifier, double variance_norm_factor, size_t p_offset); + double icvEvalHidHaarStumpClassifierAVX(CvHidHaarClassifier* classifier, double variance_norm_factor, size_t p_offset); + double icvEvalHidHaarStumpClassifierTwoRectAVX(CvHidHaarClassifier* classifier, double variance_norm_factor, size_t p_offset); +#endif +} + +#endif + +/* End of file. */