mirror of
https://github.com/opencv/opencv.git
synced 2024-11-29 05:29:54 +08:00
Merge pull request #9042 from terfendail:haar_avx
AVX optimized implementation of haar migrated to separate file
This commit is contained in:
commit
431e2e6d68
369
modules/objdetect/src/haar.avx.cpp
Normal file
369
modules/objdetect/src/haar.avx.cpp
Normal file
@ -0,0 +1,369 @@
|
|||||||
|
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//
|
||||||
|
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||||
|
//
|
||||||
|
// By downloading, copying, installing or using the software you agree to this license.
|
||||||
|
// If you do not agree to this license, do not download, install,
|
||||||
|
// copy or use the software.
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// Intel License Agreement
|
||||||
|
// For Open Source Computer Vision Library
|
||||||
|
//
|
||||||
|
// Copyright (C) 2000, Intel Corporation, all rights reserved.
|
||||||
|
// Third party copyrights are property of their respective owners.
|
||||||
|
//
|
||||||
|
// Redistribution and use in source and binary forms, with or without modification,
|
||||||
|
// are permitted provided that the following conditions are met:
|
||||||
|
//
|
||||||
|
// * Redistribution's of source code must retain the above copyright notice,
|
||||||
|
// this list of conditions and the following disclaimer.
|
||||||
|
//
|
||||||
|
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||||
|
// this list of conditions and the following disclaimer in the documentation
|
||||||
|
// and/or other materials provided with the distribution.
|
||||||
|
//
|
||||||
|
// * The name of Intel Corporation may not be used to endorse or promote products
|
||||||
|
// derived from this software without specific prior written permission.
|
||||||
|
//
|
||||||
|
// This software is provided by the copyright holders and contributors "as is" and
|
||||||
|
// any express or implied warranties, including, but not limited to, the implied
|
||||||
|
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||||
|
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||||
|
// indirect, incidental, special, exemplary, or consequential damages
|
||||||
|
// (including, but not limited to, procurement of substitute goods or services;
|
||||||
|
// loss of use, data, or profits; or business interruption) however caused
|
||||||
|
// and on any theory of liability, whether in contract, strict liability,
|
||||||
|
// or tort (including negligence or otherwise) arising in any way out of
|
||||||
|
// the use of this software, even if advised of the possibility of such damage.
|
||||||
|
//
|
||||||
|
//M*/
|
||||||
|
|
||||||
|
/* Haar features calculation */
|
||||||
|
|
||||||
|
#include "precomp.hpp"
|
||||||
|
#include "haar.hpp"
|
||||||
|
|
||||||
|
namespace cv_haar_avx
|
||||||
|
{
|
||||||
|
|
||||||
|
// AVX version icvEvalHidHaarClassifier. Process 8 CvHidHaarClassifiers per call. Check AVX support before invocation!!
|
||||||
|
#if CV_HAAR_USE_AVX
|
||||||
|
double icvEvalHidHaarClassifierAVX(CvHidHaarClassifier* classifier,
|
||||||
|
double variance_norm_factor, size_t p_offset)
|
||||||
|
{
|
||||||
|
int CV_DECL_ALIGNED(32) idxV[8] = { 0,0,0,0,0,0,0,0 };
|
||||||
|
uchar flags[8] = { 0,0,0,0,0,0,0,0 };
|
||||||
|
CvHidHaarTreeNode* nodes[8];
|
||||||
|
double res = 0;
|
||||||
|
uchar exitConditionFlag = 0;
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
float CV_DECL_ALIGNED(32) tmp[8] = { 0,0,0,0,0,0,0,0 };
|
||||||
|
nodes[0] = (classifier + 0)->node + idxV[0];
|
||||||
|
nodes[1] = (classifier + 1)->node + idxV[1];
|
||||||
|
nodes[2] = (classifier + 2)->node + idxV[2];
|
||||||
|
nodes[3] = (classifier + 3)->node + idxV[3];
|
||||||
|
nodes[4] = (classifier + 4)->node + idxV[4];
|
||||||
|
nodes[5] = (classifier + 5)->node + idxV[5];
|
||||||
|
nodes[6] = (classifier + 6)->node + idxV[6];
|
||||||
|
nodes[7] = (classifier + 7)->node + idxV[7];
|
||||||
|
|
||||||
|
__m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));
|
||||||
|
|
||||||
|
t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
|
||||||
|
nodes[6]->threshold,
|
||||||
|
nodes[5]->threshold,
|
||||||
|
nodes[4]->threshold,
|
||||||
|
nodes[3]->threshold,
|
||||||
|
nodes[2]->threshold,
|
||||||
|
nodes[1]->threshold,
|
||||||
|
nodes[0]->threshold));
|
||||||
|
|
||||||
|
__m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),
|
||||||
|
calc_sumf(nodes[6]->feature.rect[0], p_offset),
|
||||||
|
calc_sumf(nodes[5]->feature.rect[0], p_offset),
|
||||||
|
calc_sumf(nodes[4]->feature.rect[0], p_offset),
|
||||||
|
calc_sumf(nodes[3]->feature.rect[0], p_offset),
|
||||||
|
calc_sumf(nodes[2]->feature.rect[0], p_offset),
|
||||||
|
calc_sumf(nodes[1]->feature.rect[0], p_offset),
|
||||||
|
calc_sumf(nodes[0]->feature.rect[0], p_offset));
|
||||||
|
|
||||||
|
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
|
||||||
|
nodes[6]->feature.rect[0].weight,
|
||||||
|
nodes[5]->feature.rect[0].weight,
|
||||||
|
nodes[4]->feature.rect[0].weight,
|
||||||
|
nodes[3]->feature.rect[0].weight,
|
||||||
|
nodes[2]->feature.rect[0].weight,
|
||||||
|
nodes[1]->feature.rect[0].weight,
|
||||||
|
nodes[0]->feature.rect[0].weight);
|
||||||
|
|
||||||
|
__m256 sum = _mm256_mul_ps(offset, weight);
|
||||||
|
|
||||||
|
offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),
|
||||||
|
calc_sumf(nodes[6]->feature.rect[1], p_offset),
|
||||||
|
calc_sumf(nodes[5]->feature.rect[1], p_offset),
|
||||||
|
calc_sumf(nodes[4]->feature.rect[1], p_offset),
|
||||||
|
calc_sumf(nodes[3]->feature.rect[1], p_offset),
|
||||||
|
calc_sumf(nodes[2]->feature.rect[1], p_offset),
|
||||||
|
calc_sumf(nodes[1]->feature.rect[1], p_offset),
|
||||||
|
calc_sumf(nodes[0]->feature.rect[1], p_offset));
|
||||||
|
|
||||||
|
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
|
||||||
|
nodes[6]->feature.rect[1].weight,
|
||||||
|
nodes[5]->feature.rect[1].weight,
|
||||||
|
nodes[4]->feature.rect[1].weight,
|
||||||
|
nodes[3]->feature.rect[1].weight,
|
||||||
|
nodes[2]->feature.rect[1].weight,
|
||||||
|
nodes[1]->feature.rect[1].weight,
|
||||||
|
nodes[0]->feature.rect[1].weight);
|
||||||
|
|
||||||
|
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));
|
||||||
|
|
||||||
|
if (nodes[0]->feature.rect[2].p0)
|
||||||
|
tmp[0] = calc_sumf(nodes[0]->feature.rect[2], p_offset) * nodes[0]->feature.rect[2].weight;
|
||||||
|
if (nodes[1]->feature.rect[2].p0)
|
||||||
|
tmp[1] = calc_sumf(nodes[1]->feature.rect[2], p_offset) * nodes[1]->feature.rect[2].weight;
|
||||||
|
if (nodes[2]->feature.rect[2].p0)
|
||||||
|
tmp[2] = calc_sumf(nodes[2]->feature.rect[2], p_offset) * nodes[2]->feature.rect[2].weight;
|
||||||
|
if (nodes[3]->feature.rect[2].p0)
|
||||||
|
tmp[3] = calc_sumf(nodes[3]->feature.rect[2], p_offset) * nodes[3]->feature.rect[2].weight;
|
||||||
|
if (nodes[4]->feature.rect[2].p0)
|
||||||
|
tmp[4] = calc_sumf(nodes[4]->feature.rect[2], p_offset) * nodes[4]->feature.rect[2].weight;
|
||||||
|
if (nodes[5]->feature.rect[2].p0)
|
||||||
|
tmp[5] = calc_sumf(nodes[5]->feature.rect[2], p_offset) * nodes[5]->feature.rect[2].weight;
|
||||||
|
if (nodes[6]->feature.rect[2].p0)
|
||||||
|
tmp[6] = calc_sumf(nodes[6]->feature.rect[2], p_offset) * nodes[6]->feature.rect[2].weight;
|
||||||
|
if (nodes[7]->feature.rect[2].p0)
|
||||||
|
tmp[7] = calc_sumf(nodes[7]->feature.rect[2], p_offset) * nodes[7]->feature.rect[2].weight;
|
||||||
|
|
||||||
|
sum = _mm256_add_ps(sum, _mm256_load_ps(tmp));
|
||||||
|
|
||||||
|
__m256 left = _mm256_set_ps(static_cast<float>(nodes[7]->left), static_cast<float>(nodes[6]->left),
|
||||||
|
static_cast<float>(nodes[5]->left), static_cast<float>(nodes[4]->left),
|
||||||
|
static_cast<float>(nodes[3]->left), static_cast<float>(nodes[2]->left),
|
||||||
|
static_cast<float>(nodes[1]->left), static_cast<float>(nodes[0]->left));
|
||||||
|
__m256 right = _mm256_set_ps(static_cast<float>(nodes[7]->right), static_cast<float>(nodes[6]->right),
|
||||||
|
static_cast<float>(nodes[5]->right), static_cast<float>(nodes[4]->right),
|
||||||
|
static_cast<float>(nodes[3]->right), static_cast<float>(nodes[2]->right),
|
||||||
|
static_cast<float>(nodes[1]->right), static_cast<float>(nodes[0]->right));
|
||||||
|
|
||||||
|
_mm256_store_si256((__m256i*)idxV, _mm256_cvttps_epi32(_mm256_blendv_ps(right, left, _mm256_cmp_ps(sum, t, _CMP_LT_OQ))));
|
||||||
|
|
||||||
|
for (int i = 0; i < 8; i++)
|
||||||
|
{
|
||||||
|
if (idxV[i] <= 0)
|
||||||
|
{
|
||||||
|
if (!flags[i])
|
||||||
|
{
|
||||||
|
exitConditionFlag++;
|
||||||
|
flags[i] = 1;
|
||||||
|
res += (classifier + i)->alpha[-idxV[i]];
|
||||||
|
}
|
||||||
|
idxV[i] = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (exitConditionFlag == 8)
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
double icvEvalHidHaarStumpClassifierAVX(CvHidHaarClassifier* classifier,
|
||||||
|
double variance_norm_factor, size_t p_offset)
|
||||||
|
{
|
||||||
|
float CV_DECL_ALIGNED(32) tmp[8] = { 0,0,0,0,0,0,0,0 };
|
||||||
|
CvHidHaarTreeNode* nodes[8];
|
||||||
|
|
||||||
|
nodes[0] = classifier[0].node;
|
||||||
|
nodes[1] = classifier[1].node;
|
||||||
|
nodes[2] = classifier[2].node;
|
||||||
|
nodes[3] = classifier[3].node;
|
||||||
|
nodes[4] = classifier[4].node;
|
||||||
|
nodes[5] = classifier[5].node;
|
||||||
|
nodes[6] = classifier[6].node;
|
||||||
|
nodes[7] = classifier[7].node;
|
||||||
|
|
||||||
|
__m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));
|
||||||
|
|
||||||
|
t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
|
||||||
|
nodes[6]->threshold,
|
||||||
|
nodes[5]->threshold,
|
||||||
|
nodes[4]->threshold,
|
||||||
|
nodes[3]->threshold,
|
||||||
|
nodes[2]->threshold,
|
||||||
|
nodes[1]->threshold,
|
||||||
|
nodes[0]->threshold));
|
||||||
|
|
||||||
|
__m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),
|
||||||
|
calc_sumf(nodes[6]->feature.rect[0], p_offset),
|
||||||
|
calc_sumf(nodes[5]->feature.rect[0], p_offset),
|
||||||
|
calc_sumf(nodes[4]->feature.rect[0], p_offset),
|
||||||
|
calc_sumf(nodes[3]->feature.rect[0], p_offset),
|
||||||
|
calc_sumf(nodes[2]->feature.rect[0], p_offset),
|
||||||
|
calc_sumf(nodes[1]->feature.rect[0], p_offset),
|
||||||
|
calc_sumf(nodes[0]->feature.rect[0], p_offset));
|
||||||
|
|
||||||
|
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
|
||||||
|
nodes[6]->feature.rect[0].weight,
|
||||||
|
nodes[5]->feature.rect[0].weight,
|
||||||
|
nodes[4]->feature.rect[0].weight,
|
||||||
|
nodes[3]->feature.rect[0].weight,
|
||||||
|
nodes[2]->feature.rect[0].weight,
|
||||||
|
nodes[1]->feature.rect[0].weight,
|
||||||
|
nodes[0]->feature.rect[0].weight);
|
||||||
|
|
||||||
|
__m256 sum = _mm256_mul_ps(offset, weight);
|
||||||
|
|
||||||
|
offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),
|
||||||
|
calc_sumf(nodes[6]->feature.rect[1], p_offset),
|
||||||
|
calc_sumf(nodes[5]->feature.rect[1], p_offset),
|
||||||
|
calc_sumf(nodes[4]->feature.rect[1], p_offset),
|
||||||
|
calc_sumf(nodes[3]->feature.rect[1], p_offset),
|
||||||
|
calc_sumf(nodes[2]->feature.rect[1], p_offset),
|
||||||
|
calc_sumf(nodes[1]->feature.rect[1], p_offset),
|
||||||
|
calc_sumf(nodes[0]->feature.rect[1], p_offset));
|
||||||
|
|
||||||
|
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
|
||||||
|
nodes[6]->feature.rect[1].weight,
|
||||||
|
nodes[5]->feature.rect[1].weight,
|
||||||
|
nodes[4]->feature.rect[1].weight,
|
||||||
|
nodes[3]->feature.rect[1].weight,
|
||||||
|
nodes[2]->feature.rect[1].weight,
|
||||||
|
nodes[1]->feature.rect[1].weight,
|
||||||
|
nodes[0]->feature.rect[1].weight);
|
||||||
|
|
||||||
|
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));
|
||||||
|
|
||||||
|
if (nodes[0]->feature.rect[2].p0)
|
||||||
|
tmp[0] = calc_sumf(nodes[0]->feature.rect[2], p_offset) * nodes[0]->feature.rect[2].weight;
|
||||||
|
if (nodes[1]->feature.rect[2].p0)
|
||||||
|
tmp[1] = calc_sumf(nodes[1]->feature.rect[2], p_offset) * nodes[1]->feature.rect[2].weight;
|
||||||
|
if (nodes[2]->feature.rect[2].p0)
|
||||||
|
tmp[2] = calc_sumf(nodes[2]->feature.rect[2], p_offset) * nodes[2]->feature.rect[2].weight;
|
||||||
|
if (nodes[3]->feature.rect[2].p0)
|
||||||
|
tmp[3] = calc_sumf(nodes[3]->feature.rect[2], p_offset) * nodes[3]->feature.rect[2].weight;
|
||||||
|
if (nodes[4]->feature.rect[2].p0)
|
||||||
|
tmp[4] = calc_sumf(nodes[4]->feature.rect[2], p_offset) * nodes[4]->feature.rect[2].weight;
|
||||||
|
if (nodes[5]->feature.rect[2].p0)
|
||||||
|
tmp[5] = calc_sumf(nodes[5]->feature.rect[2], p_offset) * nodes[5]->feature.rect[2].weight;
|
||||||
|
if (nodes[6]->feature.rect[2].p0)
|
||||||
|
tmp[6] = calc_sumf(nodes[6]->feature.rect[2], p_offset) * nodes[6]->feature.rect[2].weight;
|
||||||
|
if (nodes[7]->feature.rect[2].p0)
|
||||||
|
tmp[7] = calc_sumf(nodes[7]->feature.rect[2], p_offset) * nodes[7]->feature.rect[2].weight;
|
||||||
|
|
||||||
|
sum = _mm256_add_ps(sum, _mm256_load_ps(tmp));
|
||||||
|
|
||||||
|
__m256 alpha0 = _mm256_set_ps(classifier[7].alpha[0],
|
||||||
|
classifier[6].alpha[0],
|
||||||
|
classifier[5].alpha[0],
|
||||||
|
classifier[4].alpha[0],
|
||||||
|
classifier[3].alpha[0],
|
||||||
|
classifier[2].alpha[0],
|
||||||
|
classifier[1].alpha[0],
|
||||||
|
classifier[0].alpha[0]);
|
||||||
|
__m256 alpha1 = _mm256_set_ps(classifier[7].alpha[1],
|
||||||
|
classifier[6].alpha[1],
|
||||||
|
classifier[5].alpha[1],
|
||||||
|
classifier[4].alpha[1],
|
||||||
|
classifier[3].alpha[1],
|
||||||
|
classifier[2].alpha[1],
|
||||||
|
classifier[1].alpha[1],
|
||||||
|
classifier[0].alpha[1]);
|
||||||
|
|
||||||
|
__m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ));
|
||||||
|
outBuf = _mm256_hadd_ps(outBuf, outBuf);
|
||||||
|
outBuf = _mm256_hadd_ps(outBuf, outBuf);
|
||||||
|
_mm256_store_ps(tmp, outBuf);
|
||||||
|
return (tmp[0] + tmp[4]);
|
||||||
|
}
|
||||||
|
|
||||||
|
double icvEvalHidHaarStumpClassifierTwoRectAVX(CvHidHaarClassifier* classifier,
|
||||||
|
double variance_norm_factor, size_t p_offset)
|
||||||
|
{
|
||||||
|
float CV_DECL_ALIGNED(32) buf[8];
|
||||||
|
CvHidHaarTreeNode* nodes[8];
|
||||||
|
nodes[0] = classifier[0].node;
|
||||||
|
nodes[1] = classifier[1].node;
|
||||||
|
nodes[2] = classifier[2].node;
|
||||||
|
nodes[3] = classifier[3].node;
|
||||||
|
nodes[4] = classifier[4].node;
|
||||||
|
nodes[5] = classifier[5].node;
|
||||||
|
nodes[6] = classifier[6].node;
|
||||||
|
nodes[7] = classifier[7].node;
|
||||||
|
|
||||||
|
__m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));
|
||||||
|
t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
|
||||||
|
nodes[6]->threshold,
|
||||||
|
nodes[5]->threshold,
|
||||||
|
nodes[4]->threshold,
|
||||||
|
nodes[3]->threshold,
|
||||||
|
nodes[2]->threshold,
|
||||||
|
nodes[1]->threshold,
|
||||||
|
nodes[0]->threshold));
|
||||||
|
|
||||||
|
__m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),
|
||||||
|
calc_sumf(nodes[6]->feature.rect[0], p_offset),
|
||||||
|
calc_sumf(nodes[5]->feature.rect[0], p_offset),
|
||||||
|
calc_sumf(nodes[4]->feature.rect[0], p_offset),
|
||||||
|
calc_sumf(nodes[3]->feature.rect[0], p_offset),
|
||||||
|
calc_sumf(nodes[2]->feature.rect[0], p_offset),
|
||||||
|
calc_sumf(nodes[1]->feature.rect[0], p_offset),
|
||||||
|
calc_sumf(nodes[0]->feature.rect[0], p_offset));
|
||||||
|
|
||||||
|
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
|
||||||
|
nodes[6]->feature.rect[0].weight,
|
||||||
|
nodes[5]->feature.rect[0].weight,
|
||||||
|
nodes[4]->feature.rect[0].weight,
|
||||||
|
nodes[3]->feature.rect[0].weight,
|
||||||
|
nodes[2]->feature.rect[0].weight,
|
||||||
|
nodes[1]->feature.rect[0].weight,
|
||||||
|
nodes[0]->feature.rect[0].weight);
|
||||||
|
|
||||||
|
__m256 sum = _mm256_mul_ps(offset, weight);
|
||||||
|
|
||||||
|
offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),
|
||||||
|
calc_sumf(nodes[6]->feature.rect[1], p_offset),
|
||||||
|
calc_sumf(nodes[5]->feature.rect[1], p_offset),
|
||||||
|
calc_sumf(nodes[4]->feature.rect[1], p_offset),
|
||||||
|
calc_sumf(nodes[3]->feature.rect[1], p_offset),
|
||||||
|
calc_sumf(nodes[2]->feature.rect[1], p_offset),
|
||||||
|
calc_sumf(nodes[1]->feature.rect[1], p_offset),
|
||||||
|
calc_sumf(nodes[0]->feature.rect[1], p_offset));
|
||||||
|
|
||||||
|
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
|
||||||
|
nodes[6]->feature.rect[1].weight,
|
||||||
|
nodes[5]->feature.rect[1].weight,
|
||||||
|
nodes[4]->feature.rect[1].weight,
|
||||||
|
nodes[3]->feature.rect[1].weight,
|
||||||
|
nodes[2]->feature.rect[1].weight,
|
||||||
|
nodes[1]->feature.rect[1].weight,
|
||||||
|
nodes[0]->feature.rect[1].weight);
|
||||||
|
|
||||||
|
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));
|
||||||
|
|
||||||
|
__m256 alpha0 = _mm256_set_ps(classifier[7].alpha[0],
|
||||||
|
classifier[6].alpha[0],
|
||||||
|
classifier[5].alpha[0],
|
||||||
|
classifier[4].alpha[0],
|
||||||
|
classifier[3].alpha[0],
|
||||||
|
classifier[2].alpha[0],
|
||||||
|
classifier[1].alpha[0],
|
||||||
|
classifier[0].alpha[0]);
|
||||||
|
__m256 alpha1 = _mm256_set_ps(classifier[7].alpha[1],
|
||||||
|
classifier[6].alpha[1],
|
||||||
|
classifier[5].alpha[1],
|
||||||
|
classifier[4].alpha[1],
|
||||||
|
classifier[3].alpha[1],
|
||||||
|
classifier[2].alpha[1],
|
||||||
|
classifier[1].alpha[1],
|
||||||
|
classifier[0].alpha[1]);
|
||||||
|
|
||||||
|
_mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ)));
|
||||||
|
return (buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif //CV_HAAR_USE_AVX
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/* End of file. */
|
@ -45,6 +45,10 @@
|
|||||||
#include "opencv2/imgproc/imgproc_c.h"
|
#include "opencv2/imgproc/imgproc_c.h"
|
||||||
#include "opencv2/objdetect/objdetect_c.h"
|
#include "opencv2/objdetect/objdetect_c.h"
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#include "haar.hpp"
|
||||||
|
#if CV_HAAR_FEATURE_MAX_LOCAL != CV_HAAR_FEATURE_MAX
|
||||||
|
#error CV_HAAR_FEATURE_MAX definition changed. Adjust CV_HAAR_FEATURE_MAX_LOCAL value please.
|
||||||
|
#endif
|
||||||
|
|
||||||
#if CV_SSE2
|
#if CV_SSE2
|
||||||
# if 1 /*!CV_SSE4_1 && !CV_SSE4_2*/
|
# if 1 /*!CV_SSE4_1 && !CV_SSE4_2*/
|
||||||
@ -53,8 +57,7 @@
|
|||||||
# endif
|
# endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if 0 /*CV_AVX*/
|
#if CV_HAAR_USE_AVX
|
||||||
# define CV_HAAR_USE_AVX 1
|
|
||||||
# if defined _MSC_VER
|
# if defined _MSC_VER
|
||||||
# pragma warning( disable : 4752 )
|
# pragma warning( disable : 4752 )
|
||||||
# endif
|
# endif
|
||||||
@ -68,38 +71,6 @@
|
|||||||
#define CV_ADJUST_FEATURES 1
|
#define CV_ADJUST_FEATURES 1
|
||||||
#define CV_ADJUST_WEIGHTS 0
|
#define CV_ADJUST_WEIGHTS 0
|
||||||
|
|
||||||
typedef int sumtype;
|
|
||||||
typedef double sqsumtype;
|
|
||||||
|
|
||||||
typedef struct CvHidHaarFeature
|
|
||||||
{
|
|
||||||
struct
|
|
||||||
{
|
|
||||||
sumtype *p0, *p1, *p2, *p3;
|
|
||||||
float weight;
|
|
||||||
}
|
|
||||||
rect[CV_HAAR_FEATURE_MAX];
|
|
||||||
} CvHidHaarFeature;
|
|
||||||
|
|
||||||
|
|
||||||
typedef struct CvHidHaarTreeNode
|
|
||||||
{
|
|
||||||
CvHidHaarFeature feature;
|
|
||||||
float threshold;
|
|
||||||
int left;
|
|
||||||
int right;
|
|
||||||
} CvHidHaarTreeNode;
|
|
||||||
|
|
||||||
|
|
||||||
typedef struct CvHidHaarClassifier
|
|
||||||
{
|
|
||||||
int count;
|
|
||||||
//CvHaarFeature* orig_feature;
|
|
||||||
CvHidHaarTreeNode* node;
|
|
||||||
float* alpha;
|
|
||||||
} CvHidHaarClassifier;
|
|
||||||
|
|
||||||
|
|
||||||
typedef struct CvHidHaarStageClassifier
|
typedef struct CvHidHaarStageClassifier
|
||||||
{
|
{
|
||||||
int count;
|
int count;
|
||||||
@ -420,10 +391,6 @@ icvCreateHidHaarClassifierCascade( CvHaarClassifierCascade* cascade )
|
|||||||
#define calc_sum(rect,offset) \
|
#define calc_sum(rect,offset) \
|
||||||
((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset])
|
((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset])
|
||||||
|
|
||||||
#define calc_sumf(rect,offset) \
|
|
||||||
static_cast<float>((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset])
|
|
||||||
|
|
||||||
|
|
||||||
CV_IMPL void
|
CV_IMPL void
|
||||||
cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade,
|
cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade,
|
||||||
const CvArr* _sum,
|
const CvArr* _sum,
|
||||||
@ -640,129 +607,6 @@ cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// AVX version icvEvalHidHaarClassifier. Process 8 CvHidHaarClassifiers per call. Check AVX support before invocation!!
|
|
||||||
#ifdef CV_HAAR_USE_AVX
|
|
||||||
CV_INLINE
|
|
||||||
double icvEvalHidHaarClassifierAVX( CvHidHaarClassifier* classifier,
|
|
||||||
double variance_norm_factor, size_t p_offset )
|
|
||||||
{
|
|
||||||
int CV_DECL_ALIGNED(32) idxV[8] = {0,0,0,0,0,0,0,0};
|
|
||||||
uchar flags[8] = {0,0,0,0,0,0,0,0};
|
|
||||||
CvHidHaarTreeNode* nodes[8];
|
|
||||||
double res = 0;
|
|
||||||
uchar exitConditionFlag = 0;
|
|
||||||
for(;;)
|
|
||||||
{
|
|
||||||
float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0};
|
|
||||||
nodes[0] = (classifier+0)->node + idxV[0];
|
|
||||||
nodes[1] = (classifier+1)->node + idxV[1];
|
|
||||||
nodes[2] = (classifier+2)->node + idxV[2];
|
|
||||||
nodes[3] = (classifier+3)->node + idxV[3];
|
|
||||||
nodes[4] = (classifier+4)->node + idxV[4];
|
|
||||||
nodes[5] = (classifier+5)->node + idxV[5];
|
|
||||||
nodes[6] = (classifier+6)->node + idxV[6];
|
|
||||||
nodes[7] = (classifier+7)->node + idxV[7];
|
|
||||||
|
|
||||||
__m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));
|
|
||||||
|
|
||||||
t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
|
|
||||||
nodes[6]->threshold,
|
|
||||||
nodes[5]->threshold,
|
|
||||||
nodes[4]->threshold,
|
|
||||||
nodes[3]->threshold,
|
|
||||||
nodes[2]->threshold,
|
|
||||||
nodes[1]->threshold,
|
|
||||||
nodes[0]->threshold));
|
|
||||||
|
|
||||||
__m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),
|
|
||||||
calc_sumf(nodes[6]->feature.rect[0], p_offset),
|
|
||||||
calc_sumf(nodes[5]->feature.rect[0], p_offset),
|
|
||||||
calc_sumf(nodes[4]->feature.rect[0], p_offset),
|
|
||||||
calc_sumf(nodes[3]->feature.rect[0], p_offset),
|
|
||||||
calc_sumf(nodes[2]->feature.rect[0], p_offset),
|
|
||||||
calc_sumf(nodes[1]->feature.rect[0], p_offset),
|
|
||||||
calc_sumf(nodes[0]->feature.rect[0], p_offset));
|
|
||||||
|
|
||||||
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
|
|
||||||
nodes[6]->feature.rect[0].weight,
|
|
||||||
nodes[5]->feature.rect[0].weight,
|
|
||||||
nodes[4]->feature.rect[0].weight,
|
|
||||||
nodes[3]->feature.rect[0].weight,
|
|
||||||
nodes[2]->feature.rect[0].weight,
|
|
||||||
nodes[1]->feature.rect[0].weight,
|
|
||||||
nodes[0]->feature.rect[0].weight);
|
|
||||||
|
|
||||||
__m256 sum = _mm256_mul_ps(offset, weight);
|
|
||||||
|
|
||||||
offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),
|
|
||||||
calc_sumf(nodes[6]->feature.rect[1], p_offset),
|
|
||||||
calc_sumf(nodes[5]->feature.rect[1], p_offset),
|
|
||||||
calc_sumf(nodes[4]->feature.rect[1], p_offset),
|
|
||||||
calc_sumf(nodes[3]->feature.rect[1], p_offset),
|
|
||||||
calc_sumf(nodes[2]->feature.rect[1], p_offset),
|
|
||||||
calc_sumf(nodes[1]->feature.rect[1], p_offset),
|
|
||||||
calc_sumf(nodes[0]->feature.rect[1], p_offset));
|
|
||||||
|
|
||||||
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
|
|
||||||
nodes[6]->feature.rect[1].weight,
|
|
||||||
nodes[5]->feature.rect[1].weight,
|
|
||||||
nodes[4]->feature.rect[1].weight,
|
|
||||||
nodes[3]->feature.rect[1].weight,
|
|
||||||
nodes[2]->feature.rect[1].weight,
|
|
||||||
nodes[1]->feature.rect[1].weight,
|
|
||||||
nodes[0]->feature.rect[1].weight);
|
|
||||||
|
|
||||||
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));
|
|
||||||
|
|
||||||
if( nodes[0]->feature.rect[2].p0 )
|
|
||||||
tmp[0] = calc_sumf(nodes[0]->feature.rect[2], p_offset) * nodes[0]->feature.rect[2].weight;
|
|
||||||
if( nodes[1]->feature.rect[2].p0 )
|
|
||||||
tmp[1] = calc_sumf(nodes[1]->feature.rect[2], p_offset) * nodes[1]->feature.rect[2].weight;
|
|
||||||
if( nodes[2]->feature.rect[2].p0 )
|
|
||||||
tmp[2] = calc_sumf(nodes[2]->feature.rect[2], p_offset) * nodes[2]->feature.rect[2].weight;
|
|
||||||
if( nodes[3]->feature.rect[2].p0 )
|
|
||||||
tmp[3] = calc_sumf(nodes[3]->feature.rect[2], p_offset) * nodes[3]->feature.rect[2].weight;
|
|
||||||
if( nodes[4]->feature.rect[2].p0 )
|
|
||||||
tmp[4] = calc_sumf(nodes[4]->feature.rect[2], p_offset) * nodes[4]->feature.rect[2].weight;
|
|
||||||
if( nodes[5]->feature.rect[2].p0 )
|
|
||||||
tmp[5] = calc_sumf(nodes[5]->feature.rect[2], p_offset) * nodes[5]->feature.rect[2].weight;
|
|
||||||
if( nodes[6]->feature.rect[2].p0 )
|
|
||||||
tmp[6] = calc_sumf(nodes[6]->feature.rect[2], p_offset) * nodes[6]->feature.rect[2].weight;
|
|
||||||
if( nodes[7]->feature.rect[2].p0 )
|
|
||||||
tmp[7] = calc_sumf(nodes[7]->feature.rect[2], p_offset) * nodes[7]->feature.rect[2].weight;
|
|
||||||
|
|
||||||
sum = _mm256_add_ps(sum,_mm256_load_ps(tmp));
|
|
||||||
|
|
||||||
__m256 left = _mm256_set_ps(static_cast<float>(nodes[7]->left), static_cast<float>(nodes[6]->left),
|
|
||||||
static_cast<float>(nodes[5]->left), static_cast<float>(nodes[4]->left),
|
|
||||||
static_cast<float>(nodes[3]->left), static_cast<float>(nodes[2]->left),
|
|
||||||
static_cast<float>(nodes[1]->left), static_cast<float>(nodes[0]->left));
|
|
||||||
__m256 right = _mm256_set_ps(static_cast<float>(nodes[7]->right),static_cast<float>(nodes[6]->right),
|
|
||||||
static_cast<float>(nodes[5]->right),static_cast<float>(nodes[4]->right),
|
|
||||||
static_cast<float>(nodes[3]->right),static_cast<float>(nodes[2]->right),
|
|
||||||
static_cast<float>(nodes[1]->right),static_cast<float>(nodes[0]->right));
|
|
||||||
|
|
||||||
_mm256_store_si256((__m256i*)idxV, _mm256_cvttps_epi32(_mm256_blendv_ps(right, left, _mm256_cmp_ps(sum, t, _CMP_LT_OQ))));
|
|
||||||
|
|
||||||
for(int i = 0; i < 8; i++)
|
|
||||||
{
|
|
||||||
if(idxV[i]<=0)
|
|
||||||
{
|
|
||||||
if(!flags[i])
|
|
||||||
{
|
|
||||||
exitConditionFlag++;
|
|
||||||
flags[i] = 1;
|
|
||||||
res += (classifier+i)->alpha[-idxV[i]];
|
|
||||||
}
|
|
||||||
idxV[i]=0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if(exitConditionFlag == 8)
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif //CV_HAAR_USE_AVX
|
|
||||||
|
|
||||||
CV_INLINE
|
CV_INLINE
|
||||||
double icvEvalHidHaarClassifier( CvHidHaarClassifier* classifier,
|
double icvEvalHidHaarClassifier( CvHidHaarClassifier* classifier,
|
||||||
double variance_norm_factor,
|
double variance_norm_factor,
|
||||||
@ -823,8 +667,8 @@ static int
|
|||||||
cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
|
cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
|
||||||
CvPoint pt, double& stage_sum, int start_stage )
|
CvPoint pt, double& stage_sum, int start_stage )
|
||||||
{
|
{
|
||||||
#ifdef CV_HAAR_USE_AVX
|
#if CV_HAAR_USE_AVX
|
||||||
bool haveAVX = cv::checkHardwareSupport(CV_CPU_AVX);
|
bool haveAVX = CV_CPU_HAS_SUPPORT_AVX;
|
||||||
#else
|
#else
|
||||||
# ifdef CV_HAAR_USE_SSE
|
# ifdef CV_HAAR_USE_SSE
|
||||||
bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2);
|
bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2);
|
||||||
@ -870,14 +714,14 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
|
|||||||
stage_sum = 0.0;
|
stage_sum = 0.0;
|
||||||
j = 0;
|
j = 0;
|
||||||
|
|
||||||
#ifdef CV_HAAR_USE_AVX
|
#if CV_HAAR_USE_AVX
|
||||||
if(haveAVX)
|
if(haveAVX)
|
||||||
{
|
{
|
||||||
for( ; j <= ptr->count - 8; j += 8 )
|
for( ; j <= ptr->count - 8; j += 8 )
|
||||||
{
|
{
|
||||||
stage_sum += icvEvalHidHaarClassifierAVX(
|
stage_sum += cv_haar_avx::icvEvalHidHaarClassifierAVX(
|
||||||
ptr->classifier + j,
|
ptr->classifier + j,
|
||||||
variance_norm_factor, p_offset );
|
variance_norm_factor, p_offset );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -901,106 +745,20 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
|
|||||||
}
|
}
|
||||||
else if( cascade->isStumpBased )
|
else if( cascade->isStumpBased )
|
||||||
{
|
{
|
||||||
#ifdef CV_HAAR_USE_AVX
|
#if CV_HAAR_USE_AVX
|
||||||
if(haveAVX)
|
if(haveAVX)
|
||||||
{
|
{
|
||||||
CvHidHaarClassifier* classifiers[8];
|
|
||||||
CvHidHaarTreeNode* nodes[8];
|
|
||||||
for( i = start_stage; i < cascade->count; i++ )
|
for( i = start_stage; i < cascade->count; i++ )
|
||||||
{
|
{
|
||||||
stage_sum = 0.0;
|
stage_sum = 0.0;
|
||||||
j = 0;
|
j = 0;
|
||||||
float CV_DECL_ALIGNED(32) buf[8];
|
|
||||||
if( cascade->stage_classifier[i].two_rects )
|
if( cascade->stage_classifier[i].two_rects )
|
||||||
{
|
{
|
||||||
for( ; j <= cascade->stage_classifier[i].count - 8; j += 8 )
|
for( ; j <= cascade->stage_classifier[i].count - 8; j += 8 )
|
||||||
{
|
{
|
||||||
classifiers[0] = cascade->stage_classifier[i].classifier + j;
|
stage_sum += cv_haar_avx::icvEvalHidHaarStumpClassifierTwoRectAVX(
|
||||||
nodes[0] = classifiers[0]->node;
|
cascade->stage_classifier[i].classifier + j,
|
||||||
classifiers[1] = cascade->stage_classifier[i].classifier + j + 1;
|
variance_norm_factor, p_offset);
|
||||||
nodes[1] = classifiers[1]->node;
|
|
||||||
classifiers[2] = cascade->stage_classifier[i].classifier + j + 2;
|
|
||||||
nodes[2] = classifiers[2]->node;
|
|
||||||
classifiers[3] = cascade->stage_classifier[i].classifier + j + 3;
|
|
||||||
nodes[3] = classifiers[3]->node;
|
|
||||||
classifiers[4] = cascade->stage_classifier[i].classifier + j + 4;
|
|
||||||
nodes[4] = classifiers[4]->node;
|
|
||||||
classifiers[5] = cascade->stage_classifier[i].classifier + j + 5;
|
|
||||||
nodes[5] = classifiers[5]->node;
|
|
||||||
classifiers[6] = cascade->stage_classifier[i].classifier + j + 6;
|
|
||||||
nodes[6] = classifiers[6]->node;
|
|
||||||
classifiers[7] = cascade->stage_classifier[i].classifier + j + 7;
|
|
||||||
nodes[7] = classifiers[7]->node;
|
|
||||||
|
|
||||||
__m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));
|
|
||||||
t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
|
|
||||||
nodes[6]->threshold,
|
|
||||||
nodes[5]->threshold,
|
|
||||||
nodes[4]->threshold,
|
|
||||||
nodes[3]->threshold,
|
|
||||||
nodes[2]->threshold,
|
|
||||||
nodes[1]->threshold,
|
|
||||||
nodes[0]->threshold));
|
|
||||||
|
|
||||||
__m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),
|
|
||||||
calc_sumf(nodes[6]->feature.rect[0], p_offset),
|
|
||||||
calc_sumf(nodes[5]->feature.rect[0], p_offset),
|
|
||||||
calc_sumf(nodes[4]->feature.rect[0], p_offset),
|
|
||||||
calc_sumf(nodes[3]->feature.rect[0], p_offset),
|
|
||||||
calc_sumf(nodes[2]->feature.rect[0], p_offset),
|
|
||||||
calc_sumf(nodes[1]->feature.rect[0], p_offset),
|
|
||||||
calc_sumf(nodes[0]->feature.rect[0], p_offset));
|
|
||||||
|
|
||||||
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
|
|
||||||
nodes[6]->feature.rect[0].weight,
|
|
||||||
nodes[5]->feature.rect[0].weight,
|
|
||||||
nodes[4]->feature.rect[0].weight,
|
|
||||||
nodes[3]->feature.rect[0].weight,
|
|
||||||
nodes[2]->feature.rect[0].weight,
|
|
||||||
nodes[1]->feature.rect[0].weight,
|
|
||||||
nodes[0]->feature.rect[0].weight);
|
|
||||||
|
|
||||||
__m256 sum = _mm256_mul_ps(offset, weight);
|
|
||||||
|
|
||||||
offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),
|
|
||||||
calc_sumf(nodes[6]->feature.rect[1], p_offset),
|
|
||||||
calc_sumf(nodes[5]->feature.rect[1], p_offset),
|
|
||||||
calc_sumf(nodes[4]->feature.rect[1], p_offset),
|
|
||||||
calc_sumf(nodes[3]->feature.rect[1], p_offset),
|
|
||||||
calc_sumf(nodes[2]->feature.rect[1], p_offset),
|
|
||||||
calc_sumf(nodes[1]->feature.rect[1], p_offset),
|
|
||||||
calc_sumf(nodes[0]->feature.rect[1], p_offset));
|
|
||||||
|
|
||||||
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
|
|
||||||
nodes[6]->feature.rect[1].weight,
|
|
||||||
nodes[5]->feature.rect[1].weight,
|
|
||||||
nodes[4]->feature.rect[1].weight,
|
|
||||||
nodes[3]->feature.rect[1].weight,
|
|
||||||
nodes[2]->feature.rect[1].weight,
|
|
||||||
nodes[1]->feature.rect[1].weight,
|
|
||||||
nodes[0]->feature.rect[1].weight);
|
|
||||||
|
|
||||||
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight));
|
|
||||||
|
|
||||||
__m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],
|
|
||||||
classifiers[6]->alpha[0],
|
|
||||||
classifiers[5]->alpha[0],
|
|
||||||
classifiers[4]->alpha[0],
|
|
||||||
classifiers[3]->alpha[0],
|
|
||||||
classifiers[2]->alpha[0],
|
|
||||||
classifiers[1]->alpha[0],
|
|
||||||
classifiers[0]->alpha[0]);
|
|
||||||
__m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],
|
|
||||||
classifiers[6]->alpha[1],
|
|
||||||
classifiers[5]->alpha[1],
|
|
||||||
classifiers[4]->alpha[1],
|
|
||||||
classifiers[3]->alpha[1],
|
|
||||||
classifiers[2]->alpha[1],
|
|
||||||
classifiers[1]->alpha[1],
|
|
||||||
classifiers[0]->alpha[1]);
|
|
||||||
|
|
||||||
_mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ)));
|
|
||||||
stage_sum += (buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for( ; j < cascade->stage_classifier[i].count; j++ )
|
for( ; j < cascade->stage_classifier[i].count; j++ )
|
||||||
@ -1018,117 +776,9 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
|
|||||||
{
|
{
|
||||||
for( ; j <= (cascade->stage_classifier[i].count)-8; j+=8 )
|
for( ; j <= (cascade->stage_classifier[i].count)-8; j+=8 )
|
||||||
{
|
{
|
||||||
float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0};
|
stage_sum += cv_haar_avx::icvEvalHidHaarStumpClassifierAVX(
|
||||||
|
cascade->stage_classifier[i].classifier + j,
|
||||||
classifiers[0] = cascade->stage_classifier[i].classifier + j;
|
variance_norm_factor, p_offset);
|
||||||
nodes[0] = classifiers[0]->node;
|
|
||||||
classifiers[1] = cascade->stage_classifier[i].classifier + j + 1;
|
|
||||||
nodes[1] = classifiers[1]->node;
|
|
||||||
classifiers[2] = cascade->stage_classifier[i].classifier + j + 2;
|
|
||||||
nodes[2] = classifiers[2]->node;
|
|
||||||
classifiers[3] = cascade->stage_classifier[i].classifier + j + 3;
|
|
||||||
nodes[3] = classifiers[3]->node;
|
|
||||||
classifiers[4] = cascade->stage_classifier[i].classifier + j + 4;
|
|
||||||
nodes[4] = classifiers[4]->node;
|
|
||||||
classifiers[5] = cascade->stage_classifier[i].classifier + j + 5;
|
|
||||||
nodes[5] = classifiers[5]->node;
|
|
||||||
classifiers[6] = cascade->stage_classifier[i].classifier + j + 6;
|
|
||||||
nodes[6] = classifiers[6]->node;
|
|
||||||
classifiers[7] = cascade->stage_classifier[i].classifier + j + 7;
|
|
||||||
nodes[7] = classifiers[7]->node;
|
|
||||||
|
|
||||||
__m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));
|
|
||||||
|
|
||||||
t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
|
|
||||||
nodes[6]->threshold,
|
|
||||||
nodes[5]->threshold,
|
|
||||||
nodes[4]->threshold,
|
|
||||||
nodes[3]->threshold,
|
|
||||||
nodes[2]->threshold,
|
|
||||||
nodes[1]->threshold,
|
|
||||||
nodes[0]->threshold));
|
|
||||||
|
|
||||||
__m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),
|
|
||||||
calc_sumf(nodes[6]->feature.rect[0], p_offset),
|
|
||||||
calc_sumf(nodes[5]->feature.rect[0], p_offset),
|
|
||||||
calc_sumf(nodes[4]->feature.rect[0], p_offset),
|
|
||||||
calc_sumf(nodes[3]->feature.rect[0], p_offset),
|
|
||||||
calc_sumf(nodes[2]->feature.rect[0], p_offset),
|
|
||||||
calc_sumf(nodes[1]->feature.rect[0], p_offset),
|
|
||||||
calc_sumf(nodes[0]->feature.rect[0], p_offset));
|
|
||||||
|
|
||||||
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
|
|
||||||
nodes[6]->feature.rect[0].weight,
|
|
||||||
nodes[5]->feature.rect[0].weight,
|
|
||||||
nodes[4]->feature.rect[0].weight,
|
|
||||||
nodes[3]->feature.rect[0].weight,
|
|
||||||
nodes[2]->feature.rect[0].weight,
|
|
||||||
nodes[1]->feature.rect[0].weight,
|
|
||||||
nodes[0]->feature.rect[0].weight);
|
|
||||||
|
|
||||||
__m256 sum = _mm256_mul_ps(offset, weight);
|
|
||||||
|
|
||||||
offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),
|
|
||||||
calc_sumf(nodes[6]->feature.rect[1], p_offset),
|
|
||||||
calc_sumf(nodes[5]->feature.rect[1], p_offset),
|
|
||||||
calc_sumf(nodes[4]->feature.rect[1], p_offset),
|
|
||||||
calc_sumf(nodes[3]->feature.rect[1], p_offset),
|
|
||||||
calc_sumf(nodes[2]->feature.rect[1], p_offset),
|
|
||||||
calc_sumf(nodes[1]->feature.rect[1], p_offset),
|
|
||||||
calc_sumf(nodes[0]->feature.rect[1], p_offset));
|
|
||||||
|
|
||||||
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
|
|
||||||
nodes[6]->feature.rect[1].weight,
|
|
||||||
nodes[5]->feature.rect[1].weight,
|
|
||||||
nodes[4]->feature.rect[1].weight,
|
|
||||||
nodes[3]->feature.rect[1].weight,
|
|
||||||
nodes[2]->feature.rect[1].weight,
|
|
||||||
nodes[1]->feature.rect[1].weight,
|
|
||||||
nodes[0]->feature.rect[1].weight);
|
|
||||||
|
|
||||||
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));
|
|
||||||
|
|
||||||
if( nodes[0]->feature.rect[2].p0 )
|
|
||||||
tmp[0] = calc_sumf(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight;
|
|
||||||
if( nodes[1]->feature.rect[2].p0 )
|
|
||||||
tmp[1] = calc_sumf(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight;
|
|
||||||
if( nodes[2]->feature.rect[2].p0 )
|
|
||||||
tmp[2] = calc_sumf(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight;
|
|
||||||
if( nodes[3]->feature.rect[2].p0 )
|
|
||||||
tmp[3] = calc_sumf(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight;
|
|
||||||
if( nodes[4]->feature.rect[2].p0 )
|
|
||||||
tmp[4] = calc_sumf(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight;
|
|
||||||
if( nodes[5]->feature.rect[2].p0 )
|
|
||||||
tmp[5] = calc_sumf(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight;
|
|
||||||
if( nodes[6]->feature.rect[2].p0 )
|
|
||||||
tmp[6] = calc_sumf(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight;
|
|
||||||
if( nodes[7]->feature.rect[2].p0 )
|
|
||||||
tmp[7] = calc_sumf(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight;
|
|
||||||
|
|
||||||
sum = _mm256_add_ps(sum, _mm256_load_ps(tmp));
|
|
||||||
|
|
||||||
__m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],
|
|
||||||
classifiers[6]->alpha[0],
|
|
||||||
classifiers[5]->alpha[0],
|
|
||||||
classifiers[4]->alpha[0],
|
|
||||||
classifiers[3]->alpha[0],
|
|
||||||
classifiers[2]->alpha[0],
|
|
||||||
classifiers[1]->alpha[0],
|
|
||||||
classifiers[0]->alpha[0]);
|
|
||||||
__m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],
|
|
||||||
classifiers[6]->alpha[1],
|
|
||||||
classifiers[5]->alpha[1],
|
|
||||||
classifiers[4]->alpha[1],
|
|
||||||
classifiers[3]->alpha[1],
|
|
||||||
classifiers[2]->alpha[1],
|
|
||||||
classifiers[1]->alpha[1],
|
|
||||||
classifiers[0]->alpha[1]);
|
|
||||||
|
|
||||||
__m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ ));
|
|
||||||
outBuf = _mm256_hadd_ps(outBuf, outBuf);
|
|
||||||
outBuf = _mm256_hadd_ps(outBuf, outBuf);
|
|
||||||
_mm256_store_ps(buf, outBuf);
|
|
||||||
stage_sum += (buf[0] + buf[4]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for( ; j < cascade->stage_classifier[i].count; j++ )
|
for( ; j < cascade->stage_classifier[i].count; j++ )
|
||||||
@ -1241,14 +891,14 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
|
|||||||
stage_sum = 0.0;
|
stage_sum = 0.0;
|
||||||
int k = 0;
|
int k = 0;
|
||||||
|
|
||||||
#ifdef CV_HAAR_USE_AVX
|
#if CV_HAAR_USE_AVX
|
||||||
if(haveAVX)
|
if(haveAVX)
|
||||||
{
|
{
|
||||||
for( ; k < cascade->stage_classifier[i].count - 8; k += 8 )
|
for( ; k < cascade->stage_classifier[i].count - 8; k += 8 )
|
||||||
{
|
{
|
||||||
stage_sum += icvEvalHidHaarClassifierAVX(
|
stage_sum += cv_haar_avx::icvEvalHidHaarClassifierAVX(
|
||||||
cascade->stage_classifier[i].classifier + k,
|
cascade->stage_classifier[i].classifier + k,
|
||||||
variance_norm_factor, p_offset );
|
variance_norm_factor, p_offset );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
101
modules/objdetect/src/haar.hpp
Normal file
101
modules/objdetect/src/haar.hpp
Normal file
@ -0,0 +1,101 @@
|
|||||||
|
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//
|
||||||
|
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||||
|
//
|
||||||
|
// By downloading, copying, installing or using the software you agree to this license.
|
||||||
|
// If you do not agree to this license, do not download, install,
|
||||||
|
// copy or use the software.
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// Intel License Agreement
|
||||||
|
// For Open Source Computer Vision Library
|
||||||
|
//
|
||||||
|
// Copyright (C) 2000, Intel Corporation, all rights reserved.
|
||||||
|
// Third party copyrights are property of their respective owners.
|
||||||
|
//
|
||||||
|
// Redistribution and use in source and binary forms, with or without modification,
|
||||||
|
// are permitted provided that the following conditions are met:
|
||||||
|
//
|
||||||
|
// * Redistribution's of source code must retain the above copyright notice,
|
||||||
|
// this list of conditions and the following disclaimer.
|
||||||
|
//
|
||||||
|
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||||
|
// this list of conditions and the following disclaimer in the documentation
|
||||||
|
// and/or other materials provided with the distribution.
|
||||||
|
//
|
||||||
|
// * The name of Intel Corporation may not be used to endorse or promote products
|
||||||
|
// derived from this software without specific prior written permission.
|
||||||
|
//
|
||||||
|
// This software is provided by the copyright holders and contributors "as is" and
|
||||||
|
// any express or implied warranties, including, but not limited to, the implied
|
||||||
|
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||||
|
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||||
|
// indirect, incidental, special, exemplary, or consequential damages
|
||||||
|
// (including, but not limited to, procurement of substitute goods or services;
|
||||||
|
// loss of use, data, or profits; or business interruption) however caused
|
||||||
|
// and on any theory of liability, whether in contract, strict liability,
|
||||||
|
// or tort (including negligence or otherwise) arising in any way out of
|
||||||
|
// the use of this software, even if advised of the possibility of such damage.
|
||||||
|
//
|
||||||
|
//M*/
|
||||||
|
|
||||||
|
/* Haar features calculation */
|
||||||
|
|
||||||
|
#ifndef OPENCV_OBJDETECT_HAAR_HPP
|
||||||
|
#define OPENCV_OBJDETECT_HAAR_HPP
|
||||||
|
|
||||||
|
#define CV_HAAR_FEATURE_MAX_LOCAL 3
|
||||||
|
|
||||||
|
typedef int sumtype;
|
||||||
|
typedef double sqsumtype;
|
||||||
|
|
||||||
|
typedef struct CvHidHaarFeature
|
||||||
|
{
|
||||||
|
struct
|
||||||
|
{
|
||||||
|
sumtype *p0, *p1, *p2, *p3;
|
||||||
|
float weight;
|
||||||
|
}
|
||||||
|
rect[CV_HAAR_FEATURE_MAX_LOCAL];
|
||||||
|
} CvHidHaarFeature;
|
||||||
|
|
||||||
|
|
||||||
|
typedef struct CvHidHaarTreeNode
|
||||||
|
{
|
||||||
|
CvHidHaarFeature feature;
|
||||||
|
float threshold;
|
||||||
|
int left;
|
||||||
|
int right;
|
||||||
|
} CvHidHaarTreeNode;
|
||||||
|
|
||||||
|
|
||||||
|
typedef struct CvHidHaarClassifier
|
||||||
|
{
|
||||||
|
int count;
|
||||||
|
//CvHaarFeature* orig_feature;
|
||||||
|
CvHidHaarTreeNode* node;
|
||||||
|
float* alpha;
|
||||||
|
} CvHidHaarClassifier;
|
||||||
|
|
||||||
|
#define calc_sumf(rect,offset) \
|
||||||
|
static_cast<float>((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset])
|
||||||
|
|
||||||
|
namespace cv_haar_avx
|
||||||
|
{
|
||||||
|
#if 0 /*CV_TRY_AVX*/
|
||||||
|
#define CV_HAAR_USE_AVX 1
|
||||||
|
#else
|
||||||
|
#define CV_HAAR_USE_AVX 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if CV_HAAR_USE_AVX
|
||||||
|
// AVX version icvEvalHidHaarClassifier. Process 8 CvHidHaarClassifiers per call. Check AVX support before invocation!!
|
||||||
|
double icvEvalHidHaarClassifierAVX(CvHidHaarClassifier* classifier, double variance_norm_factor, size_t p_offset);
|
||||||
|
double icvEvalHidHaarStumpClassifierAVX(CvHidHaarClassifier* classifier, double variance_norm_factor, size_t p_offset);
|
||||||
|
double icvEvalHidHaarStumpClassifierTwoRectAVX(CvHidHaarClassifier* classifier, double variance_norm_factor, size_t p_offset);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* End of file. */
|
Loading…
Reference in New Issue
Block a user