Merge pull request #9042 from terfendail:haar_avx

AVX optimized implementation of haar migrated to separate file
2024-11-29 05:29:54 +08:00 · 2017-07-14 15:05:11 +00:00 · 2017-07-14 15:05:11 +00:00 · 431e2e6d68
commit 431e2e6d68
parent 9439872a62 77264dcca9
3 changed files with 492 additions and 372 deletions
--- a/modules/objdetect/src/haar.avx.cpp
+++ b/modules/objdetect/src/haar.avx.cpp
@ -0,0 +1,369 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                        Intel License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000, Intel Corporation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other materials provided with the distribution.
 //
 //   * The name of Intel Corporation may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 /* Haar features calculation */
 #include "precomp.hpp"
 #include "haar.hpp"
 namespace cv_haar_avx
 {
 // AVX version of icvEvalHidHaarClassifier. Processes 8 CvHidHaarClassifiers per call. Check for AVX support before invoking!
 #if CV_HAAR_USE_AVX
 // Evaluates the eight consecutive tree classifiers classifier[0..7] side by side,
 // one classifier per AVX float lane, and returns the sum of the alpha values of the
 // leaves they reach.
 //
 //   classifier           - first of eight classifiers; all eight are dereferenced.
 //   variance_norm_factor - narrowed to float and multiplied into every node threshold.
 //   p_offset             - element offset applied to every rectangle corner pointer.
 //
 // Each round visits the current node of every classifier, compares its weighted
 // rectangle sum with the scaled threshold and steps to the left child (sum < threshold)
 // or the right child (otherwise). A child index <= 0 is a leaf: its negation indexes
 // the classifier's alpha table. The caller is responsible for checking AVX support.
 double icvEvalHidHaarClassifierAVX(CvHidHaarClassifier* classifier,
    double variance_norm_factor, size_t p_offset)
 {
    // Node index each classifier is currently at; every tree starts at node 0.
    int CV_DECL_ALIGNED(32) cursor[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
    // done[k] is set once classifier k has added its leaf value to the total.
    uchar done[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
    uchar doneCount = 0;
    double total = 0;
    CvHidHaarTreeNode* cur[8];
    const __m256 normFactor = _mm256_set1_ps(static_cast<float>(variance_norm_factor));
    for (;;)
    {
        for (int k = 0; k < 8; ++k)
            cur[k] = classifier[k].node + cursor[k];
        // _mm256_set_ps takes the highest element first, hence the 7..0 ordering.
        const __m256 scaledThreshold = _mm256_mul_ps(normFactor, _mm256_set_ps(
            cur[7]->threshold, cur[6]->threshold, cur[5]->threshold, cur[4]->threshold,
            cur[3]->threshold, cur[2]->threshold, cur[1]->threshold, cur[0]->threshold));
        // First rectangle of every feature.
        const __m256 area0 = _mm256_set_ps(
            calc_sumf(cur[7]->feature.rect[0], p_offset), calc_sumf(cur[6]->feature.rect[0], p_offset),
            calc_sumf(cur[5]->feature.rect[0], p_offset), calc_sumf(cur[4]->feature.rect[0], p_offset),
            calc_sumf(cur[3]->feature.rect[0], p_offset), calc_sumf(cur[2]->feature.rect[0], p_offset),
            calc_sumf(cur[1]->feature.rect[0], p_offset), calc_sumf(cur[0]->feature.rect[0], p_offset));
        const __m256 weight0 = _mm256_set_ps(
            cur[7]->feature.rect[0].weight, cur[6]->feature.rect[0].weight,
            cur[5]->feature.rect[0].weight, cur[4]->feature.rect[0].weight,
            cur[3]->feature.rect[0].weight, cur[2]->feature.rect[0].weight,
            cur[1]->feature.rect[0].weight, cur[0]->feature.rect[0].weight);
        __m256 acc = _mm256_mul_ps(area0, weight0);
        // Second rectangle of every feature.
        const __m256 area1 = _mm256_set_ps(
            calc_sumf(cur[7]->feature.rect[1], p_offset), calc_sumf(cur[6]->feature.rect[1], p_offset),
            calc_sumf(cur[5]->feature.rect[1], p_offset), calc_sumf(cur[4]->feature.rect[1], p_offset),
            calc_sumf(cur[3]->feature.rect[1], p_offset), calc_sumf(cur[2]->feature.rect[1], p_offset),
            calc_sumf(cur[1]->feature.rect[1], p_offset), calc_sumf(cur[0]->feature.rect[1], p_offset));
        const __m256 weight1 = _mm256_set_ps(
            cur[7]->feature.rect[1].weight, cur[6]->feature.rect[1].weight,
            cur[5]->feature.rect[1].weight, cur[4]->feature.rect[1].weight,
            cur[3]->feature.rect[1].weight, cur[2]->feature.rect[1].weight,
            cur[1]->feature.rect[1].weight, cur[0]->feature.rect[1].weight);
        acc = _mm256_add_ps(acc, _mm256_mul_ps(area1, weight1));
        // The third rectangle is optional: it contributes only when its p0 pointer is set.
        float CV_DECL_ALIGNED(32) third[8];
        for (int k = 0; k < 8; ++k)
        {
            third[k] = cur[k]->feature.rect[2].p0
                ? calc_sumf(cur[k]->feature.rect[2], p_offset) * cur[k]->feature.rect[2].weight
                : 0.0f;
        }
        acc = _mm256_add_ps(acc, _mm256_load_ps(third));
        // Child indices travel through float lanes so that blendv can pick between them;
        // the truncating conversion turns the selected values back into ints.
        const __m256 leftIdx = _mm256_set_ps(
            static_cast<float>(cur[7]->left), static_cast<float>(cur[6]->left),
            static_cast<float>(cur[5]->left), static_cast<float>(cur[4]->left),
            static_cast<float>(cur[3]->left), static_cast<float>(cur[2]->left),
            static_cast<float>(cur[1]->left), static_cast<float>(cur[0]->left));
        const __m256 rightIdx = _mm256_set_ps(
            static_cast<float>(cur[7]->right), static_cast<float>(cur[6]->right),
            static_cast<float>(cur[5]->right), static_cast<float>(cur[4]->right),
            static_cast<float>(cur[3]->right), static_cast<float>(cur[2]->right),
            static_cast<float>(cur[1]->right), static_cast<float>(cur[0]->right));
        const __m256 goLeft = _mm256_cmp_ps(acc, scaledThreshold, _CMP_LT_OQ);
        _mm256_store_si256(reinterpret_cast<__m256i*>(cursor),
            _mm256_cvttps_epi32(_mm256_blendv_ps(rightIdx, leftIdx, goLeft)));
        for (int lane = 0; lane < 8; ++lane)
        {
            if (cursor[lane] > 0)
                continue;   // still on an inner node
            if (!done[lane])
            {
                // First time this classifier hits a leaf: account for it exactly once.
                total += classifier[lane].alpha[-cursor[lane]];
                done[lane] = 1;
                ++doneCount;
            }
            // Park the finished lane on node 0 so later rounds read a valid node;
            // whatever it computes from now on is ignored through done[].
            cursor[lane] = 0;
        }
        if (doneCount == 8)
            return total;
    }
 }
 // Evaluates the eight consecutive stump classifiers classifier[0..7] at once, one per
 // AVX float lane, and returns the sum of the eight selected alpha values.
 //
 //   classifier           - first of eight classifiers; only the node each `node`
 //                          pointer refers to is read.
 //   variance_norm_factor - narrowed to float and multiplied into every node threshold.
 //   p_offset             - element offset applied to every rectangle corner pointer.
 //
 // For each classifier the weighted sum of up to three rectangles is compared with the
 // scaled threshold: alpha[1] is taken when threshold <= sum, alpha[0] otherwise.
 // The caller is responsible for checking AVX support.
 double icvEvalHidHaarStumpClassifierAVX(CvHidHaarClassifier* classifier,
    double variance_norm_factor, size_t p_offset)
 {
    CvHidHaarTreeNode* stump[8];
    for (int k = 0; k < 8; ++k)
        stump[k] = classifier[k].node;
    // _mm256_set_ps takes the highest element first, hence the 7..0 ordering.
    const __m256 scaledThreshold = _mm256_mul_ps(
        _mm256_set1_ps(static_cast<float>(variance_norm_factor)),
        _mm256_set_ps(
            stump[7]->threshold, stump[6]->threshold, stump[5]->threshold, stump[4]->threshold,
            stump[3]->threshold, stump[2]->threshold, stump[1]->threshold, stump[0]->threshold));
    // First rectangle of every feature.
    const __m256 area0 = _mm256_set_ps(
        calc_sumf(stump[7]->feature.rect[0], p_offset), calc_sumf(stump[6]->feature.rect[0], p_offset),
        calc_sumf(stump[5]->feature.rect[0], p_offset), calc_sumf(stump[4]->feature.rect[0], p_offset),
        calc_sumf(stump[3]->feature.rect[0], p_offset), calc_sumf(stump[2]->feature.rect[0], p_offset),
        calc_sumf(stump[1]->feature.rect[0], p_offset), calc_sumf(stump[0]->feature.rect[0], p_offset));
    const __m256 weight0 = _mm256_set_ps(
        stump[7]->feature.rect[0].weight, stump[6]->feature.rect[0].weight,
        stump[5]->feature.rect[0].weight, stump[4]->feature.rect[0].weight,
        stump[3]->feature.rect[0].weight, stump[2]->feature.rect[0].weight,
        stump[1]->feature.rect[0].weight, stump[0]->feature.rect[0].weight);
    __m256 acc = _mm256_mul_ps(area0, weight0);
    // Second rectangle of every feature.
    const __m256 area1 = _mm256_set_ps(
        calc_sumf(stump[7]->feature.rect[1], p_offset), calc_sumf(stump[6]->feature.rect[1], p_offset),
        calc_sumf(stump[5]->feature.rect[1], p_offset), calc_sumf(stump[4]->feature.rect[1], p_offset),
        calc_sumf(stump[3]->feature.rect[1], p_offset), calc_sumf(stump[2]->feature.rect[1], p_offset),
        calc_sumf(stump[1]->feature.rect[1], p_offset), calc_sumf(stump[0]->feature.rect[1], p_offset));
    const __m256 weight1 = _mm256_set_ps(
        stump[7]->feature.rect[1].weight, stump[6]->feature.rect[1].weight,
        stump[5]->feature.rect[1].weight, stump[4]->feature.rect[1].weight,
        stump[3]->feature.rect[1].weight, stump[2]->feature.rect[1].weight,
        stump[1]->feature.rect[1].weight, stump[0]->feature.rect[1].weight);
    acc = _mm256_add_ps(acc, _mm256_mul_ps(area1, weight1));
    // The third rectangle is optional: it contributes only when its p0 pointer is set.
    float CV_DECL_ALIGNED(32) third[8];
    for (int k = 0; k < 8; ++k)
    {
        third[k] = stump[k]->feature.rect[2].p0
            ? calc_sumf(stump[k]->feature.rect[2], p_offset) * stump[k]->feature.rect[2].weight
            : 0.0f;
    }
    acc = _mm256_add_ps(acc, _mm256_load_ps(third));
    const __m256 alphaBelow = _mm256_set_ps(
        classifier[7].alpha[0], classifier[6].alpha[0], classifier[5].alpha[0], classifier[4].alpha[0],
        classifier[3].alpha[0], classifier[2].alpha[0], classifier[1].alpha[0], classifier[0].alpha[0]);
    const __m256 alphaAbove = _mm256_set_ps(
        classifier[7].alpha[1], classifier[6].alpha[1], classifier[5].alpha[1], classifier[4].alpha[1],
        classifier[3].alpha[1], classifier[2].alpha[1], classifier[1].alpha[1], classifier[0].alpha[1]);
    // Lanes where threshold <= sum take alpha[1]; all others keep alpha[0].
    __m256 picked = _mm256_blendv_ps(alphaBelow, alphaAbove,
        _mm256_cmp_ps(scaledThreshold, acc, _CMP_LE_OQ));
    // hadd works inside each 128-bit half, so after two passes elements 0 and 4 hold
    // the totals of the lower and the upper four lanes respectively.
    picked = _mm256_hadd_ps(picked, picked);
    picked = _mm256_hadd_ps(picked, picked);
    float CV_DECL_ALIGNED(32) halves[8];
    _mm256_store_ps(halves, picked);
    return (halves[0] + halves[4]);
 }
 // Two-rectangle variant of the stump evaluation: handles the eight consecutive
 // classifiers classifier[0..7] at once, one per AVX float lane, reading only
 // rect[0] and rect[1] of each feature, and returns the sum of the selected alphas.
 //
 //   classifier           - first of eight classifiers; only the node each `node`
 //                          pointer refers to is read.
 //   variance_norm_factor - narrowed to float and multiplied into every node threshold.
 //   p_offset             - element offset applied to every rectangle corner pointer.
 //
 // alpha[1] is taken when threshold <= sum, alpha[0] otherwise.
 // The caller is responsible for checking AVX support.
 double icvEvalHidHaarStumpClassifierTwoRectAVX(CvHidHaarClassifier* classifier,
    double variance_norm_factor, size_t p_offset)
 {
    CvHidHaarTreeNode* stump[8];
    for (int k = 0; k < 8; ++k)
        stump[k] = classifier[k].node;
    // _mm256_set_ps takes the highest element first, hence the 7..0 ordering.
    const __m256 scaledThreshold = _mm256_mul_ps(
        _mm256_set1_ps(static_cast<float>(variance_norm_factor)),
        _mm256_set_ps(
            stump[7]->threshold, stump[6]->threshold, stump[5]->threshold, stump[4]->threshold,
            stump[3]->threshold, stump[2]->threshold, stump[1]->threshold, stump[0]->threshold));
    // First rectangle of every feature.
    const __m256 area0 = _mm256_set_ps(
        calc_sumf(stump[7]->feature.rect[0], p_offset), calc_sumf(stump[6]->feature.rect[0], p_offset),
        calc_sumf(stump[5]->feature.rect[0], p_offset), calc_sumf(stump[4]->feature.rect[0], p_offset),
        calc_sumf(stump[3]->feature.rect[0], p_offset), calc_sumf(stump[2]->feature.rect[0], p_offset),
        calc_sumf(stump[1]->feature.rect[0], p_offset), calc_sumf(stump[0]->feature.rect[0], p_offset));
    const __m256 weight0 = _mm256_set_ps(
        stump[7]->feature.rect[0].weight, stump[6]->feature.rect[0].weight,
        stump[5]->feature.rect[0].weight, stump[4]->feature.rect[0].weight,
        stump[3]->feature.rect[0].weight, stump[2]->feature.rect[0].weight,
        stump[1]->feature.rect[0].weight, stump[0]->feature.rect[0].weight);
    __m256 acc = _mm256_mul_ps(area0, weight0);
    // Second rectangle of every feature.
    const __m256 area1 = _mm256_set_ps(
        calc_sumf(stump[7]->feature.rect[1], p_offset), calc_sumf(stump[6]->feature.rect[1], p_offset),
        calc_sumf(stump[5]->feature.rect[1], p_offset), calc_sumf(stump[4]->feature.rect[1], p_offset),
        calc_sumf(stump[3]->feature.rect[1], p_offset), calc_sumf(stump[2]->feature.rect[1], p_offset),
        calc_sumf(stump[1]->feature.rect[1], p_offset), calc_sumf(stump[0]->feature.rect[1], p_offset));
    const __m256 weight1 = _mm256_set_ps(
        stump[7]->feature.rect[1].weight, stump[6]->feature.rect[1].weight,
        stump[5]->feature.rect[1].weight, stump[4]->feature.rect[1].weight,
        stump[3]->feature.rect[1].weight, stump[2]->feature.rect[1].weight,
        stump[1]->feature.rect[1].weight, stump[0]->feature.rect[1].weight);
    acc = _mm256_add_ps(acc, _mm256_mul_ps(area1, weight1));
    const __m256 alphaBelow = _mm256_set_ps(
        classifier[7].alpha[0], classifier[6].alpha[0], classifier[5].alpha[0], classifier[4].alpha[0],
        classifier[3].alpha[0], classifier[2].alpha[0], classifier[1].alpha[0], classifier[0].alpha[0]);
    const __m256 alphaAbove = _mm256_set_ps(
        classifier[7].alpha[1], classifier[6].alpha[1], classifier[5].alpha[1], classifier[4].alpha[1],
        classifier[3].alpha[1], classifier[2].alpha[1], classifier[1].alpha[1], classifier[0].alpha[1]);
    // Lanes where threshold <= sum take alpha[1]; all others keep alpha[0].
    const __m256 picked = _mm256_blendv_ps(alphaBelow, alphaAbove,
        _mm256_cmp_ps(scaledThreshold, acc, _CMP_LE_OQ));
    float CV_DECL_ALIGNED(32) lanes[8];
    _mm256_store_ps(lanes, picked);
    // Summed left to right in float, then widened to double by the return.
    return (lanes[0] + lanes[1] + lanes[2] + lanes[3] + lanes[4] + lanes[5] + lanes[6] + lanes[7]);
 }
 #endif //CV_HAAR_USE_AVX
 }
 /* End of file. */
--- a/modules/objdetect/src/haar.cpp
+++ b/modules/objdetect/src/haar.cpp
@ -45,6 +45,10 @@
 #include "opencv2/imgproc/imgproc_c.h"
 #include "opencv2/objdetect/objdetect_c.h"
 #include <stdio.h>
 #include "haar.hpp"
 #if CV_HAAR_FEATURE_MAX_LOCAL != CV_HAAR_FEATURE_MAX
    #error CV_HAAR_FEATURE_MAX definition changed. Adjust CV_HAAR_FEATURE_MAX_LOCAL value please.
 #endif
 #if CV_SSE2
 #   if 1 /*!CV_SSE4_1 && !CV_SSE4_2*/
@ -53,8 +57,7 @@
 #   endif
 #endif
-#if 0 /*CV_AVX*/
+#if CV_HAAR_USE_AVX
 #  define CV_HAAR_USE_AVX 1
 #  if defined _MSC_VER
 #    pragma warning( disable : 4752 )
 #  endif
@ -68,38 +71,6 @@
 #define CV_ADJUST_FEATURES 1
 #define CV_ADJUST_WEIGHTS  0
 // Element type behind the p0..p3 rectangle corner pointers of CvHidHaarFeature.
 typedef int sumtype;
 // NOTE(review): not used in the visible part of this file; presumably the element
 // type of a squared-sum image - confirm against the code that uses it.
 typedef double sqsumtype;
 // One Haar feature: up to CV_HAAR_FEATURE_MAX weighted rectangles.
 typedef struct CvHidHaarFeature
 {
    struct
    {
        // Four corner pointers; calc_sum combines them as p0 - p1 - p2 + p3 at a given
        // element offset. The AVX evaluators treat a null p0 in rect[2] as "no third
        // rectangle".
        sumtype *p0, *p1, *p2, *p3;
        // Factor the rectangle sum is multiplied by before being added to the feature value.
        float weight;
    }
    rect[CV_HAAR_FEATURE_MAX];
 } CvHidHaarFeature;
 // One decision node of a classifier tree.
 typedef struct CvHidHaarTreeNode
 {
    // Feature whose weighted rectangle sum is tested at this node.
    CvHidHaarFeature feature;
    // Compared against the feature sum after being scaled by variance_norm_factor.
    float threshold;
    // Next node index when the sum is below the scaled threshold. In the AVX tree
    // evaluator a value <= 0 marks a leaf and its negation indexes the classifier's
    // alpha array.
    int left;
    // Next node index otherwise; same leaf encoding as `left`.
    int right;
 } CvHidHaarTreeNode;
 // One weak classifier: an array of tree nodes plus the values returned at its leaves.
 typedef struct CvHidHaarClassifier
 {
    // NOTE(review): presumably the number of entries in `node` - confirm where it is set.
    int count;
    //CvHaarFeature* orig_feature;
    // Tree nodes; evaluation starts at node[0] and child indices are relative to this pointer.
    CvHidHaarTreeNode* node;
    // Leaf values. The tree evaluator indexes them by the negated leaf index; the stump
    // evaluators use alpha[0] (sum below threshold) and alpha[1] (otherwise).
    float* alpha;
 } CvHidHaarClassifier;
 typedef struct CvHidHaarStageClassifier
 {
    int  count;
@ -420,10 +391,6 @@ icvCreateHidHaarClassifierCascade( CvHaarClassifierCascade* cascade )
 // calc_sum(rect, offset): combines the four corner pointers of `rect` at element
 // `offset` as p0 - p1 - p2 + p3. Both arguments are macro parameters and are expanded
 // four times, so they must be free of side effects.
 #define calc_sum(rect,offset) \
    ((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset])
 // calc_sumf(rect, offset): the same combination converted to float.
 #define calc_sumf(rect,offset) \
    static_cast<float>((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset])
 CV_IMPL void
 cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade,
                                     const CvArr* _sum,
@ -640,129 +607,6 @@ cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade,
 }
 // AVX version of icvEvalHidHaarClassifier. Processes 8 CvHidHaarClassifiers per call. Check for AVX support before invoking!
 #ifdef CV_HAAR_USE_AVX
 // Evaluates the eight consecutive tree classifiers classifier[0..7] side by side,
 // one classifier per AVX float lane, and returns the sum of the alpha values of the
 // leaves they reach.
 //
 //   classifier           - first of eight classifiers; all eight are dereferenced.
 //   variance_norm_factor - narrowed to float and multiplied into every node threshold.
 //   p_offset             - element offset applied to every rectangle corner pointer.
 //
 // Each round visits the current node of every classifier, compares its weighted
 // rectangle sum with the scaled threshold and steps to the left child (sum < threshold)
 // or the right child (otherwise). A child index <= 0 is a leaf: its negation indexes
 // the classifier's alpha table. The caller is responsible for checking AVX support.
 CV_INLINE
 double icvEvalHidHaarClassifierAVX( CvHidHaarClassifier* classifier,
                                    double variance_norm_factor, size_t p_offset )
 {
    // Node index each classifier is currently at; every tree starts at node 0.
    int CV_DECL_ALIGNED(32) cursor[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
    // done[k] is set once classifier k has added its leaf value to the total.
    uchar done[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
    uchar doneCount = 0;
    double total = 0;
    CvHidHaarTreeNode* cur[8];
    const __m256 normFactor = _mm256_set1_ps(static_cast<float>(variance_norm_factor));
    for(;;)
    {
        for( int k = 0; k < 8; ++k )
            cur[k] = classifier[k].node + cursor[k];
        // _mm256_set_ps takes the highest element first, hence the 7..0 ordering.
        const __m256 scaledThreshold = _mm256_mul_ps(normFactor, _mm256_set_ps(
            cur[7]->threshold, cur[6]->threshold, cur[5]->threshold, cur[4]->threshold,
            cur[3]->threshold, cur[2]->threshold, cur[1]->threshold, cur[0]->threshold));
        // First rectangle of every feature.
        const __m256 area0 = _mm256_set_ps(
            calc_sumf(cur[7]->feature.rect[0], p_offset), calc_sumf(cur[6]->feature.rect[0], p_offset),
            calc_sumf(cur[5]->feature.rect[0], p_offset), calc_sumf(cur[4]->feature.rect[0], p_offset),
            calc_sumf(cur[3]->feature.rect[0], p_offset), calc_sumf(cur[2]->feature.rect[0], p_offset),
            calc_sumf(cur[1]->feature.rect[0], p_offset), calc_sumf(cur[0]->feature.rect[0], p_offset));
        const __m256 weight0 = _mm256_set_ps(
            cur[7]->feature.rect[0].weight, cur[6]->feature.rect[0].weight,
            cur[5]->feature.rect[0].weight, cur[4]->feature.rect[0].weight,
            cur[3]->feature.rect[0].weight, cur[2]->feature.rect[0].weight,
            cur[1]->feature.rect[0].weight, cur[0]->feature.rect[0].weight);
        __m256 acc = _mm256_mul_ps(area0, weight0);
        // Second rectangle of every feature.
        const __m256 area1 = _mm256_set_ps(
            calc_sumf(cur[7]->feature.rect[1], p_offset), calc_sumf(cur[6]->feature.rect[1], p_offset),
            calc_sumf(cur[5]->feature.rect[1], p_offset), calc_sumf(cur[4]->feature.rect[1], p_offset),
            calc_sumf(cur[3]->feature.rect[1], p_offset), calc_sumf(cur[2]->feature.rect[1], p_offset),
            calc_sumf(cur[1]->feature.rect[1], p_offset), calc_sumf(cur[0]->feature.rect[1], p_offset));
        const __m256 weight1 = _mm256_set_ps(
            cur[7]->feature.rect[1].weight, cur[6]->feature.rect[1].weight,
            cur[5]->feature.rect[1].weight, cur[4]->feature.rect[1].weight,
            cur[3]->feature.rect[1].weight, cur[2]->feature.rect[1].weight,
            cur[1]->feature.rect[1].weight, cur[0]->feature.rect[1].weight);
        acc = _mm256_add_ps(acc, _mm256_mul_ps(area1, weight1));
        // The third rectangle is optional: it contributes only when its p0 pointer is set.
        float CV_DECL_ALIGNED(32) third[8];
        for( int k = 0; k < 8; ++k )
        {
            third[k] = cur[k]->feature.rect[2].p0
                ? calc_sumf(cur[k]->feature.rect[2], p_offset) * cur[k]->feature.rect[2].weight
                : 0.0f;
        }
        acc = _mm256_add_ps(acc, _mm256_load_ps(third));
        // Child indices travel through float lanes so that blendv can pick between them;
        // the truncating conversion turns the selected values back into ints.
        const __m256 leftIdx = _mm256_set_ps(
            static_cast<float>(cur[7]->left), static_cast<float>(cur[6]->left),
            static_cast<float>(cur[5]->left), static_cast<float>(cur[4]->left),
            static_cast<float>(cur[3]->left), static_cast<float>(cur[2]->left),
            static_cast<float>(cur[1]->left), static_cast<float>(cur[0]->left));
        const __m256 rightIdx = _mm256_set_ps(
            static_cast<float>(cur[7]->right), static_cast<float>(cur[6]->right),
            static_cast<float>(cur[5]->right), static_cast<float>(cur[4]->right),
            static_cast<float>(cur[3]->right), static_cast<float>(cur[2]->right),
            static_cast<float>(cur[1]->right), static_cast<float>(cur[0]->right));
        const __m256 goLeft = _mm256_cmp_ps(acc, scaledThreshold, _CMP_LT_OQ);
        _mm256_store_si256(reinterpret_cast<__m256i*>(cursor),
            _mm256_cvttps_epi32(_mm256_blendv_ps(rightIdx, leftIdx, goLeft)));
        for( int lane = 0; lane < 8; ++lane )
        {
            if( cursor[lane] > 0 )
                continue;   // still on an inner node
            if( !done[lane] )
            {
                // First time this classifier hits a leaf: account for it exactly once.
                total += classifier[lane].alpha[-cursor[lane]];
                done[lane] = 1;
                ++doneCount;
            }
            // Park the finished lane on node 0 so later rounds read a valid node;
            // whatever it computes from now on is ignored through done[].
            cursor[lane] = 0;
        }
        if( doneCount == 8 )
            return total;
    }
 }
 #endif //CV_HAAR_USE_AVX
 CV_INLINE
 double icvEvalHidHaarClassifier( CvHidHaarClassifier* classifier,
                                 double variance_norm_factor,
@ -823,8 +667,8 @@ static int
 cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
                               CvPoint pt, double& stage_sum, int start_stage )
 {
-#ifdef CV_HAAR_USE_AVX
+#if CV_HAAR_USE_AVX
-    bool haveAVX = cv::checkHardwareSupport(CV_CPU_AVX);
+    bool haveAVX = CV_CPU_HAS_SUPPORT_AVX;
 #else
 #  ifdef CV_HAAR_USE_SSE
    bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2);
@ -870,14 +714,14 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
            stage_sum = 0.0;
            j = 0;
-#ifdef CV_HAAR_USE_AVX
+#if CV_HAAR_USE_AVX
            if(haveAVX)
            {
                for( ; j <= ptr->count - 8; j += 8 )
                {
-                    stage_sum += icvEvalHidHaarClassifierAVX(
+                    stage_sum += cv_haar_avx::icvEvalHidHaarClassifierAVX(
-                        ptr->classifier + j,
+                                                     ptr->classifier + j,
-                        variance_norm_factor, p_offset );
+                                                     variance_norm_factor, p_offset );
                }
            }
 #endif
@ -901,106 +745,20 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
    }
    else if( cascade->isStumpBased )
    {
-#ifdef CV_HAAR_USE_AVX
+#if CV_HAAR_USE_AVX
        if(haveAVX)
        {
            CvHidHaarClassifier* classifiers[8];
            CvHidHaarTreeNode* nodes[8];
            for( i = start_stage; i < cascade->count; i++ )
            {
                stage_sum = 0.0;
                j = 0;
                float CV_DECL_ALIGNED(32) buf[8];
                if( cascade->stage_classifier[i].two_rects )
                {
                    for( ; j <= cascade->stage_classifier[i].count - 8; j += 8 )
                    {
-                        classifiers[0] = cascade->stage_classifier[i].classifier + j;
+                        stage_sum += cv_haar_avx::icvEvalHidHaarStumpClassifierTwoRectAVX(
-                        nodes[0] = classifiers[0]->node;
+                                                         cascade->stage_classifier[i].classifier + j,
-                        classifiers[1] = cascade->stage_classifier[i].classifier + j + 1;
+                                                         variance_norm_factor, p_offset);
                        nodes[1] = classifiers[1]->node;
                        classifiers[2] = cascade->stage_classifier[i].classifier + j + 2;
                        nodes[2] = classifiers[2]->node;
                        classifiers[3] = cascade->stage_classifier[i].classifier + j + 3;
                        nodes[3] = classifiers[3]->node;
                        classifiers[4] = cascade->stage_classifier[i].classifier + j + 4;
                        nodes[4] = classifiers[4]->node;
                        classifiers[5] = cascade->stage_classifier[i].classifier + j + 5;
                        nodes[5] = classifiers[5]->node;
                        classifiers[6] = cascade->stage_classifier[i].classifier + j + 6;
                        nodes[6] = classifiers[6]->node;
                        classifiers[7] = cascade->stage_classifier[i].classifier + j + 7;
                        nodes[7] = classifiers[7]->node;
                        __m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));
                        t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
                                                           nodes[6]->threshold,
                                                           nodes[5]->threshold,
                                                           nodes[4]->threshold,
                                                           nodes[3]->threshold,
                                                           nodes[2]->threshold,
                                                           nodes[1]->threshold,
                                                           nodes[0]->threshold));
                        __m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),
                                                      calc_sumf(nodes[6]->feature.rect[0], p_offset),
                                                      calc_sumf(nodes[5]->feature.rect[0], p_offset),
                                                      calc_sumf(nodes[4]->feature.rect[0], p_offset),
                                                      calc_sumf(nodes[3]->feature.rect[0], p_offset),
                                                      calc_sumf(nodes[2]->feature.rect[0], p_offset),
                                                      calc_sumf(nodes[1]->feature.rect[0], p_offset),
                                                      calc_sumf(nodes[0]->feature.rect[0], p_offset));
                        __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
                                                      nodes[6]->feature.rect[0].weight,
                                                      nodes[5]->feature.rect[0].weight,
                                                      nodes[4]->feature.rect[0].weight,
                                                      nodes[3]->feature.rect[0].weight,
                                                      nodes[2]->feature.rect[0].weight,
                                                      nodes[1]->feature.rect[0].weight,
                                                      nodes[0]->feature.rect[0].weight);
                        __m256 sum = _mm256_mul_ps(offset, weight);
                        offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),
                                               calc_sumf(nodes[6]->feature.rect[1], p_offset),
                                               calc_sumf(nodes[5]->feature.rect[1], p_offset),
                                               calc_sumf(nodes[4]->feature.rect[1], p_offset),
                                               calc_sumf(nodes[3]->feature.rect[1], p_offset),
                                               calc_sumf(nodes[2]->feature.rect[1], p_offset),
                                               calc_sumf(nodes[1]->feature.rect[1], p_offset),
                                               calc_sumf(nodes[0]->feature.rect[1], p_offset));
                        weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
                                               nodes[6]->feature.rect[1].weight,
                                               nodes[5]->feature.rect[1].weight,
                                               nodes[4]->feature.rect[1].weight,
                                               nodes[3]->feature.rect[1].weight,
                                               nodes[2]->feature.rect[1].weight,
                                               nodes[1]->feature.rect[1].weight,
                                               nodes[0]->feature.rect[1].weight);
                        sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight));
                        __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],
                                                      classifiers[6]->alpha[0],
                                                      classifiers[5]->alpha[0],
                                                      classifiers[4]->alpha[0],
                                                      classifiers[3]->alpha[0],
                                                      classifiers[2]->alpha[0],
                                                      classifiers[1]->alpha[0],
                                                      classifiers[0]->alpha[0]);
                        __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],
                                                      classifiers[6]->alpha[1],
                                                      classifiers[5]->alpha[1],
                                                      classifiers[4]->alpha[1],
                                                      classifiers[3]->alpha[1],
                                                      classifiers[2]->alpha[1],
                                                      classifiers[1]->alpha[1],
                                                      classifiers[0]->alpha[1]);
                        _mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ)));
                        stage_sum += (buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]);
                    }
                    for( ; j < cascade->stage_classifier[i].count; j++ )
@ -1018,117 +776,9 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
                {
                    for( ; j <= (cascade->stage_classifier[i].count)-8; j+=8 )
                    {
-                        float  CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0};
+                        stage_sum += cv_haar_avx::icvEvalHidHaarStumpClassifierAVX(
-
+                                                         cascade->stage_classifier[i].classifier + j,
-                        classifiers[0] = cascade->stage_classifier[i].classifier + j;
+                                                         variance_norm_factor, p_offset);
                        nodes[0] = classifiers[0]->node;
                        classifiers[1] = cascade->stage_classifier[i].classifier + j + 1;
                        nodes[1] = classifiers[1]->node;
                        classifiers[2] = cascade->stage_classifier[i].classifier + j + 2;
                        nodes[2] = classifiers[2]->node;
                        classifiers[3] = cascade->stage_classifier[i].classifier + j + 3;
                        nodes[3] = classifiers[3]->node;
                        classifiers[4] = cascade->stage_classifier[i].classifier + j + 4;
                        nodes[4] = classifiers[4]->node;
                        classifiers[5] = cascade->stage_classifier[i].classifier + j + 5;
                        nodes[5] = classifiers[5]->node;
                        classifiers[6] = cascade->stage_classifier[i].classifier + j + 6;
                        nodes[6] = classifiers[6]->node;
                        classifiers[7] = cascade->stage_classifier[i].classifier + j + 7;
                        nodes[7] = classifiers[7]->node;
                        __m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));
                        t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
                                                           nodes[6]->threshold,
                                                           nodes[5]->threshold,
                                                           nodes[4]->threshold,
                                                           nodes[3]->threshold,
                                                           nodes[2]->threshold,
                                                           nodes[1]->threshold,
                                                           nodes[0]->threshold));
                        __m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),
                                                      calc_sumf(nodes[6]->feature.rect[0], p_offset),
                                                      calc_sumf(nodes[5]->feature.rect[0], p_offset),
                                                      calc_sumf(nodes[4]->feature.rect[0], p_offset),
                                                      calc_sumf(nodes[3]->feature.rect[0], p_offset),
                                                      calc_sumf(nodes[2]->feature.rect[0], p_offset),
                                                      calc_sumf(nodes[1]->feature.rect[0], p_offset),
                                                      calc_sumf(nodes[0]->feature.rect[0], p_offset));
                        __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
                                                      nodes[6]->feature.rect[0].weight,
                                                      nodes[5]->feature.rect[0].weight,
                                                      nodes[4]->feature.rect[0].weight,
                                                      nodes[3]->feature.rect[0].weight,
                                                      nodes[2]->feature.rect[0].weight,
                                                      nodes[1]->feature.rect[0].weight,
                                                      nodes[0]->feature.rect[0].weight);
                        __m256 sum = _mm256_mul_ps(offset, weight);
                        offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),
                                               calc_sumf(nodes[6]->feature.rect[1], p_offset),
                                               calc_sumf(nodes[5]->feature.rect[1], p_offset),
                                               calc_sumf(nodes[4]->feature.rect[1], p_offset),
                                               calc_sumf(nodes[3]->feature.rect[1], p_offset),
                                               calc_sumf(nodes[2]->feature.rect[1], p_offset),
                                               calc_sumf(nodes[1]->feature.rect[1], p_offset),
                                               calc_sumf(nodes[0]->feature.rect[1], p_offset));
                        weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
                                               nodes[6]->feature.rect[1].weight,
                                               nodes[5]->feature.rect[1].weight,
                                               nodes[4]->feature.rect[1].weight,
                                               nodes[3]->feature.rect[1].weight,
                                               nodes[2]->feature.rect[1].weight,
                                               nodes[1]->feature.rect[1].weight,
                                               nodes[0]->feature.rect[1].weight);
                        sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));
                        if( nodes[0]->feature.rect[2].p0 )
                            tmp[0] = calc_sumf(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight;
                        if( nodes[1]->feature.rect[2].p0 )
                            tmp[1] = calc_sumf(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight;
                        if( nodes[2]->feature.rect[2].p0 )
                            tmp[2] = calc_sumf(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight;
                        if( nodes[3]->feature.rect[2].p0 )
                            tmp[3] = calc_sumf(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight;
                        if( nodes[4]->feature.rect[2].p0 )
                            tmp[4] = calc_sumf(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight;
                        if( nodes[5]->feature.rect[2].p0 )
                            tmp[5] = calc_sumf(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight;
                        if( nodes[6]->feature.rect[2].p0 )
                            tmp[6] = calc_sumf(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight;
                        if( nodes[7]->feature.rect[2].p0 )
                            tmp[7] = calc_sumf(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight;
                        sum = _mm256_add_ps(sum, _mm256_load_ps(tmp));
                        __m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],
                                                      classifiers[6]->alpha[0],
                                                      classifiers[5]->alpha[0],
                                                      classifiers[4]->alpha[0],
                                                      classifiers[3]->alpha[0],
                                                      classifiers[2]->alpha[0],
                                                      classifiers[1]->alpha[0],
                                                      classifiers[0]->alpha[0]);
                        __m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],
                                                      classifiers[6]->alpha[1],
                                                      classifiers[5]->alpha[1],
                                                      classifiers[4]->alpha[1],
                                                      classifiers[3]->alpha[1],
                                                      classifiers[2]->alpha[1],
                                                      classifiers[1]->alpha[1],
                                                      classifiers[0]->alpha[1]);
                        __m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ ));
                        outBuf = _mm256_hadd_ps(outBuf, outBuf);
                        outBuf = _mm256_hadd_ps(outBuf, outBuf);
                        _mm256_store_ps(buf, outBuf);
                        stage_sum += (buf[0] + buf[4]);
                    }
                    for( ; j < cascade->stage_classifier[i].count; j++ )
@ -1241,14 +891,14 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
            stage_sum = 0.0;
            int k = 0;
-#ifdef CV_HAAR_USE_AVX
+#if CV_HAAR_USE_AVX
            if(haveAVX)
            {
                for( ; k < cascade->stage_classifier[i].count - 8; k += 8 )
                {
-                    stage_sum += icvEvalHidHaarClassifierAVX(
+                    stage_sum += cv_haar_avx::icvEvalHidHaarClassifierAVX(
-                        cascade->stage_classifier[i].classifier + k,
+                                                     cascade->stage_classifier[i].classifier + k,
-                        variance_norm_factor, p_offset );
+                                                     variance_norm_factor, p_offset );
                }
            }
 #endif
--- a/modules/objdetect/src/haar.hpp
+++ b/modules/objdetect/src/haar.hpp
@ -0,0 +1,101 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                        Intel License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000, Intel Corporation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other materials provided with the distribution.
 //
 //   * The name of Intel Corporation may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 /* Haar features calculation */
 #ifndef OPENCV_OBJDETECT_HAAR_HPP
 #define OPENCV_OBJDETECT_HAAR_HPP
 // Number of rectangle slots in one CvHidHaarFeature (size of its rect[] array).
 #define CV_HAAR_FEATURE_MAX_LOCAL 3
 // Element type behind the p0..p3 pointers of a feature rectangle; calc_sumf()
 // does its arithmetic in this type before converting the result to float.
 typedef int sumtype;
 // NOTE(review): presumably the element type of the squared-sum image; it is not
 // used anywhere in this header -- confirm against haar.cpp.
 typedef double sqsumtype;
 // A Haar feature prepared for evaluation: CV_HAAR_FEATURE_MAX_LOCAL rectangle
 // slots, each holding four data pointers and a weight. The evaluation code
 // treats a slot whose p0 is null as unused (it checks this for rect[2]).
 typedef struct CvHidHaarFeature
 {
    struct
    {
        // calc_sumf() reads these as p0[off] - p1[off] - p2[off] + p3[off].
        // NOTE(review): presumably the four corners of the rectangle in the
        // integral image -- confirm where the pointers are set up.
        sumtype* p0;
        sumtype* p1;
        sumtype* p2;
        sumtype* p3;
        // Multiplier applied to the rectangle sum by the evaluation code.
        float weight;
    } rect[CV_HAAR_FEATURE_MAX_LOCAL];
 } CvHidHaarFeature;
 // One decision node of a classifier: a feature plus the threshold that the
 // feature's weighted rectangle sum is tested against.
 typedef struct CvHidHaarTreeNode
 {
    CvHidHaarFeature feature;
    // Multiplied by variance_norm_factor before it is compared with the feature sum.
    float threshold;
    // NOTE(review): left/right are presumably the indices of the nodes to visit
    // for the two outcomes of the threshold test; the code visible alongside
    // this header never reads them -- confirm against the tree evaluator.
    int left;
    int right;
 } CvHidHaarTreeNode;
 // One classifier of a cascade stage, as consumed by the evaluation routines.
 typedef struct CvHidHaarClassifier
 {
    // NOTE(review): presumably the number of entries in node[] -- confirm.
    int count;
    //CvHaarFeature* orig_feature;
    // Node storage; the stump evaluation code only dereferences the first node.
    CvHidHaarTreeNode* node;
    // Output values. In the stump code alpha[1] is selected when
    // threshold * variance_norm_factor <= feature sum, alpha[0] otherwise.
    float* alpha;
 } CvHidHaarClassifier;
 // Combines the four pointers of one feature rectangle at position `offset`
 // (p0 - p1 - p2 + p3, evaluated left to right in the pointers' element type)
 // and converts the result to float. Being a macro, it evaluates `rect` and
 // `offset` four times each, so pass expressions without side effects.
 #define calc_sumf(rect,offset) \
    static_cast<float>( \
        (rect).p0[offset] \
        - (rect).p1[offset] \
        - (rect).p2[offset] \
        + (rect).p3[offset])
 // Compile-time switch for the AVX code path. The CV_TRY_AVX condition is
 // commented out and replaced by a literal 0, so CV_HAAR_USE_AVX is always 0
 // here. Macros are not scoped by namespaces, so the switch is a file-level
 // definition regardless of where it is written.
 #if 0 /*CV_TRY_AVX*/
 #  define CV_HAAR_USE_AVX 1
 #else
 #  define CV_HAAR_USE_AVX 0
 #endif
 namespace cv_haar_avx
 {
 #if CV_HAAR_USE_AVX
    // AVX counterparts of the classifier evaluation routines. Each call
    // processes 8 CvHidHaarClassifiers starting at `classifier` and returns the
    // value the caller adds to its stage sum.
    // Check AVX support before invocation!!
    double icvEvalHidHaarClassifierAVX(
        CvHidHaarClassifier* classifier,
        double variance_norm_factor,
        size_t p_offset);
    double icvEvalHidHaarStumpClassifierAVX(
        CvHidHaarClassifier* classifier,
        double variance_norm_factor,
        size_t p_offset);
    // NOTE(review): presumably the stump variant for features that use only
    // rect[0] and rect[1] -- confirm against haar.avx.cpp.
    double icvEvalHidHaarStumpClassifierTwoRectAVX(
        CvHidHaarClassifier* classifier,
        double variance_norm_factor,
        size_t p_offset);
 #endif
 } // namespace cv_haar_avx
 #endif
 /* End of file. */