mirror of
https://github.com/opencv/opencv.git
synced 2025-01-21 08:37:57 +08:00
Merge pull request #9875 from terfendail:fast_avx
This commit is contained in:
commit
1a495a5817
184
modules/features2d/src/fast.avx2.cpp
Normal file
184
modules/features2d/src/fast.avx2.cpp
Normal file
@ -0,0 +1,184 @@
|
||||
/* This is FAST corner detector, contributed to OpenCV by the author, Edward Rosten.
|
||||
Below is the original copyright and the references */
|
||||
|
||||
/*
|
||||
Copyright (c) 2006, 2008 Edward Rosten
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
*Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
*Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
*Neither the name of the University of Cambridge nor the names of
|
||||
its contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
|
||||
The references are:
|
||||
* Machine learning for high-speed corner detection,
|
||||
E. Rosten and T. Drummond, ECCV 2006
|
||||
* Faster and better: A machine learning approach to corner detection
|
||||
E. Rosten, R. Porter and T. Drummond, PAMI, 2009
|
||||
*/
|
||||
|
||||
#include "precomp.hpp"
|
||||
#include "fast.hpp"
|
||||
#include "opencv2/core/hal/intrin.hpp"
|
||||
|
||||
namespace cv
|
||||
{
|
||||
namespace opt_AVX2
|
||||
{
|
||||
|
||||
// AVX2 implementation of the FAST_t_patternSize16_AVX2 interface.
// process() scans one image row 32 pixels at a time, records the columns that pass
// the FAST segment test in cornerpos and, when non-max suppression is enabled,
// writes a per-pixel corner score into curr.
class FAST_t_patternSize16_AVX2_Impl: public FAST_t_patternSize16_AVX2
|
||||
{
|
||||
public:
|
||||
// _cols               - row width in pixels; bounds the vectorized loop in process().
// _threshold          - FAST intensity threshold.
// _nonmax_suppression - when true, process() also writes corner scores into curr.
// _pixel              - table of byte offsets of the circle samples relative to the
//                       centre pixel; indices 0..24 are read. The pointer is stored,
//                       not copied, so the table must outlive this object.
FAST_t_patternSize16_AVX2_Impl(int _cols, int _threshold, bool _nonmax_suppression, const int* _pixel):
|
||||
cols(_cols), nonmax_suppression(_nonmax_suppression), pixel(_pixel)
|
||||
{
|
||||
//patternSize = 16
|
||||
// NOTE(review): the byte threshold used by the SIMD code is taken from the raw
// _threshold (truncating cast) *before* the clamp on the next line, so values
// outside [0, 255] wrap instead of saturating - confirm callers pass a sane value.
t256c = (char)_threshold;
|
||||
// Clamped copy of the threshold. No method visible in this class reads it.
threshold = std::min(std::max(_threshold, 0), 255);
|
||||
}
|
||||
|
||||
// Processes the row starting at column j / address ptr in 32-pixel steps.
//   j, ptr    - in/out: advanced in place; on return j is the first column that
//               was not handled here (the loop stops once j >= cols - 35).
//   curr      - score row; curr[column] is written only for detected corners and
//               only when nonmax_suppression is set.
//   cornerpos - output list of corner columns; ncorners is its running length.
virtual void process(int &j, const uchar* &ptr, uchar* curr, int* cornerpos, int &ncorners)
|
||||
{
|
||||
// delta256: every byte 0x80 (used to bias unsigned bytes for signed compares).
// K16_256:  every byte 8 (a run must be longer than this to count as a corner).
static const __m256i delta256 = _mm256_broadcastsi128_si256(_mm_set1_epi8((char)(-128))), K16_256 = _mm256_broadcastsi128_si256(_mm_set1_epi8((char)8));
|
||||
// Threshold replicated into all 32 byte lanes.
const __m256i t256 = _mm256_broadcastsi128_si256(_mm_set1_epi8(t256c));
|
||||
for (; j < cols - 32 - 3; j += 32, ptr += 32)
|
||||
{
|
||||
__m256i m0, m1;
|
||||
// 32 candidate centre pixels.
__m256i v0 = _mm256_loadu_si256((const __m256i*)ptr);
|
||||
|
||||
// v1 = centre - t and v0 = centre + t (unsigned saturating), then flipped by 0x80
// so that the signed byte compares below order them like unsigned values.
__m256i v1 = _mm256_xor_si256(_mm256_subs_epu8(v0, t256), delta256);
|
||||
v0 = _mm256_xor_si256(_mm256_adds_epu8(v0, t256), delta256);
|
||||
|
||||
// Four circle samples a quarter of the pattern apart (offsets 0, 4, 8, 12).
// The wrapping subtract of 0x80 applies the same bias as the XOR above.
__m256i x0 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[0])), delta256);
|
||||
__m256i x1 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[4])), delta256);
|
||||
__m256i x2 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[8])), delta256);
|
||||
__m256i x3 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[12])), delta256);
|
||||
|
||||
// Quick rejection: a lane stays a candidate if some pair of neighbouring quarter
// samples is brighter than centre + t (m0) or darker than centre - t (m1).
m0 = _mm256_and_si256(_mm256_cmpgt_epi8(x0, v0), _mm256_cmpgt_epi8(x1, v0));
|
||||
m1 = _mm256_and_si256(_mm256_cmpgt_epi8(v1, x0), _mm256_cmpgt_epi8(v1, x1));
|
||||
m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x1, v0), _mm256_cmpgt_epi8(x2, v0)));
|
||||
m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x1), _mm256_cmpgt_epi8(v1, x2)));
|
||||
m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x2, v0), _mm256_cmpgt_epi8(x3, v0)));
|
||||
m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x2), _mm256_cmpgt_epi8(v1, x3)));
|
||||
m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x3, v0), _mm256_cmpgt_epi8(x0, v0)));
|
||||
m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x3), _mm256_cmpgt_epi8(v1, x0)));
|
||||
m0 = _mm256_or_si256(m0, m1);
|
||||
|
||||
// One bit per byte lane; bit i is set when pixel j + i is still a candidate.
unsigned int mask = _mm256_movemask_epi8(m0); //unsigned is important!
|
||||
// No candidate among the 32 pixels: skip the whole block.
if (mask == 0){
|
||||
continue;
|
||||
}
|
||||
// Candidates only in the upper 16 lanes: step back 16 so that, after the loop's
// += 32, the net advance is 16 and those lanes become the lower half next time.
if ((mask & 0xffff) == 0)
|
||||
{
|
||||
j -= 16;
|
||||
ptr -= 16;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Full segment test: c0/c1 hold, per lane, the current run length of consecutive
// brighter/darker samples; max0/max1 keep the longest run seen so far.
__m256i c0 = _mm256_setzero_si256(), c1 = c0, max0 = c0, max1 = c0;
|
||||
for (int k = 0; k < 25; k++)
|
||||
{
|
||||
__m256i x = _mm256_xor_si256(_mm256_loadu_si256((const __m256i*)(ptr + pixel[k])), delta256);
|
||||
m0 = _mm256_cmpgt_epi8(x, v0);
|
||||
m1 = _mm256_cmpgt_epi8(v1, x);
|
||||
|
||||
// A true lane is 0xFF (-1): subtracting it increments the counter, and the AND
// resets the counter to 0 in lanes where the comparison failed.
c0 = _mm256_and_si256(_mm256_sub_epi8(c0, m0), m0);
|
||||
c1 = _mm256_and_si256(_mm256_sub_epi8(c1, m1), m1);
|
||||
|
||||
max0 = _mm256_max_epu8(max0, c0);
|
||||
max1 = _mm256_max_epu8(max1, c1);
|
||||
}
|
||||
|
||||
max0 = _mm256_max_epu8(max0, max1);
|
||||
// Bit i is set when lane i has a run longer than 8 samples, i.e. a corner.
unsigned int m = _mm256_movemask_epi8(_mm256_cmpgt_epi8(max0, K16_256));
|
||||
|
||||
// Walk the set bits; stops early once no bits remain.
for (int k = 0; m > 0 && k < 32; k++, m >>= 1)
|
||||
if (m & 1)
|
||||
{
|
||||
cornerpos[ncorners++] = j + k;
|
||||
if (nonmax_suppression)
|
||||
{
|
||||
// Corner score for pixel j + k, computed with 128-bit universal intrinsics.
// d[q] = centre minus the q-th circle sample.
short d[25];
|
||||
for (int q = 0; q < 25; q++)
|
||||
d[q] = (short)(ptr[k] - ptr[k + pixel[q]]);
|
||||
// q0 tracks the largest window minimum, q1 the smallest window maximum.
v_int16x8 q0 = v_setall_s16(-1000), q1 = v_setall_s16(1000);
|
||||
for (int q = 0; q < 16; q += 8)
|
||||
{
|
||||
// Lane i of a / b accumulates the min / max of d[q+i+1 .. q+i+8].
v_int16x8 v0_ = v_load(d + q + 1);
|
||||
v_int16x8 v1_ = v_load(d + q + 2);
|
||||
v_int16x8 a = v_min(v0_, v1_);
|
||||
v_int16x8 b = v_max(v0_, v1_);
|
||||
v0_ = v_load(d + q + 3);
|
||||
a = v_min(a, v0_);
|
||||
b = v_max(b, v0_);
|
||||
v0_ = v_load(d + q + 4);
|
||||
a = v_min(a, v0_);
|
||||
b = v_max(b, v0_);
|
||||
v0_ = v_load(d + q + 5);
|
||||
a = v_min(a, v0_);
|
||||
b = v_max(b, v0_);
|
||||
v0_ = v_load(d + q + 6);
|
||||
a = v_min(a, v0_);
|
||||
b = v_max(b, v0_);
|
||||
v0_ = v_load(d + q + 7);
|
||||
a = v_min(a, v0_);
|
||||
b = v_max(b, v0_);
|
||||
v0_ = v_load(d + q + 8);
|
||||
a = v_min(a, v0_);
|
||||
b = v_max(b, v0_);
|
||||
// Extend the 8-sample window by one sample on the low side (d[q+i]) ...
v0_ = v_load(d + q);
|
||||
q0 = v_max(q0, v_min(a, v0_));
|
||||
q1 = v_min(q1, v_max(b, v0_));
|
||||
// ... and on the high side (d[q+i+9]); with q == 8 this reads d[17..24].
v0_ = v_load(d + q + 9);
|
||||
q0 = v_max(q0, v_min(a, v0_));
|
||||
q1 = v_min(q1, v_max(b, v0_));
|
||||
}
|
||||
// Fold the "darker" case in by negating q1, then take the best lane.
q0 = v_max(q0, v_setzero_s16() - q1);
|
||||
curr[j + k] = (uchar)(v_reduce_max(q0) - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Zero the upper halves of the YMM registers on exit - presumably to avoid
// AVX/SSE transition penalties in the non-AVX code that runs next.
_mm256_zeroupper();
|
||||
}
|
||||
|
||||
virtual ~FAST_t_patternSize16_AVX2_Impl() {};
|
||||
|
||||
private:
|
||||
// Row width in pixels.
int cols;
|
||||
// Threshold as a byte, broadcast into t256 inside process().
char t256c;
|
||||
// Threshold clamped to [0, 255]; not read by any method visible in this class.
int threshold;
|
||||
// Whether process() writes corner scores into curr.
bool nonmax_suppression;
|
||||
// Non-owning pointer to the circle offset table (indices 0..24 are used).
const int* pixel;
|
||||
};
|
||||
|
||||
// Factory for the AVX2 row processor: allocates a FAST_t_patternSize16_AVX2_Impl
// with the given arguments and returns it wrapped in a Ptr to the interface type.
// _pixel is forwarded as a raw pointer and is not copied.
Ptr<FAST_t_patternSize16_AVX2> FAST_t_patternSize16_AVX2::getImpl(int _cols, int _threshold, bool _nonmax_suppression, const int* _pixel)
|
||||
{
|
||||
return Ptr<FAST_t_patternSize16_AVX2>(new FAST_t_patternSize16_AVX2_Impl(_cols, _threshold, _nonmax_suppression, _pixel));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
@ -42,6 +42,7 @@ The references are:
|
||||
*/
|
||||
|
||||
#include "precomp.hpp"
|
||||
#include "fast.hpp"
|
||||
#include "fast_score.hpp"
|
||||
#include "opencl_kernels_features2d.hpp"
|
||||
#include "opencv2/core/hal/intrin.hpp"
|
||||
@ -59,13 +60,20 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
|
||||
{
|
||||
Mat img = _img.getMat();
|
||||
const int K = patternSize/2, N = patternSize + K + 1;
|
||||
int i, j, k, pixel[25];
|
||||
makeOffsets(pixel, (int)img.step, patternSize);
|
||||
|
||||
#if CV_SIMD128
|
||||
const int quarterPatternSize = patternSize/4;
|
||||
v_uint8x16 delta = v_setall_u8(0x80), t = v_setall_u8((char)threshold), K16 = v_setall_u8((char)K);
|
||||
bool hasSimd = hasSIMD128();
|
||||
#if CV_TRY_AVX2
|
||||
Ptr<opt_AVX2::FAST_t_patternSize16_AVX2> fast_t_impl_avx2;
|
||||
if(CV_CPU_HAS_SUPPORT_AVX2)
|
||||
fast_t_impl_avx2 = opt_AVX2::FAST_t_patternSize16_AVX2::getImpl(img.cols, threshold, nonmax_suppression, pixel);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
int i, j, k, pixel[25];
|
||||
makeOffsets(pixel, (int)img.step, patternSize);
|
||||
|
||||
keypoints.clear();
|
||||
|
||||
@ -100,65 +108,72 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
|
||||
{
|
||||
if( patternSize == 16 )
|
||||
{
|
||||
for(; j < img.cols - 16 - 3; j += 16, ptr += 16)
|
||||
#if CV_TRY_AVX2
|
||||
if (fast_t_impl_avx2)
|
||||
fast_t_impl_avx2->process(j, ptr, curr, cornerpos, ncorners);
|
||||
#endif
|
||||
//vz if (j <= (img.cols - 27)) //it doesn't make sense using vectors for less than 8 elements
|
||||
{
|
||||
v_uint8x16 v = v_load(ptr);
|
||||
v_int8x16 v0 = v_reinterpret_as_s8((v + t) ^ delta);
|
||||
v_int8x16 v1 = v_reinterpret_as_s8((v - t) ^ delta);
|
||||
|
||||
v_int8x16 x0 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[0]), delta));
|
||||
v_int8x16 x1 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[quarterPatternSize]), delta));
|
||||
v_int8x16 x2 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[2*quarterPatternSize]), delta));
|
||||
v_int8x16 x3 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[3*quarterPatternSize]), delta));
|
||||
|
||||
v_int8x16 m0, m1;
|
||||
m0 = (v0 < x0) & (v0 < x1);
|
||||
m1 = (x0 < v1) & (x1 < v1);
|
||||
m0 = m0 | ((v0 < x1) & (v0 < x2));
|
||||
m1 = m1 | ((x1 < v1) & (x2 < v1));
|
||||
m0 = m0 | ((v0 < x2) & (v0 < x3));
|
||||
m1 = m1 | ((x2 < v1) & (x3 < v1));
|
||||
m0 = m0 | ((v0 < x3) & (v0 < x0));
|
||||
m1 = m1 | ((x3 < v1) & (x0 < v1));
|
||||
m0 = m0 | m1;
|
||||
|
||||
int mask = v_signmask(m0);
|
||||
if( mask == 0 )
|
||||
continue;
|
||||
if( (mask & 255) == 0 )
|
||||
for (; j < img.cols - 16 - 3; j += 16, ptr += 16)
|
||||
{
|
||||
j -= 8;
|
||||
ptr -= 8;
|
||||
continue;
|
||||
}
|
||||
v_uint8x16 v = v_load(ptr);
|
||||
v_int8x16 v0 = v_reinterpret_as_s8((v + t) ^ delta);
|
||||
v_int8x16 v1 = v_reinterpret_as_s8((v - t) ^ delta);
|
||||
|
||||
v_int8x16 c0 = v_setzero_s8();
|
||||
v_int8x16 c1 = v_setzero_s8();
|
||||
v_uint8x16 max0 = v_setzero_u8();
|
||||
v_uint8x16 max1 = v_setzero_u8();
|
||||
for( k = 0; k < N; k++ )
|
||||
{
|
||||
v_int8x16 x = v_reinterpret_as_s8(v_load((ptr + pixel[k])) ^ delta);
|
||||
m0 = v0 < x;
|
||||
m1 = x < v1;
|
||||
v_int8x16 x0 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[0]), delta));
|
||||
v_int8x16 x1 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[quarterPatternSize]), delta));
|
||||
v_int8x16 x2 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[2*quarterPatternSize]), delta));
|
||||
v_int8x16 x3 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[3*quarterPatternSize]), delta));
|
||||
|
||||
c0 = v_sub_wrap(c0, m0) & m0;
|
||||
c1 = v_sub_wrap(c1, m1) & m1;
|
||||
v_int8x16 m0, m1;
|
||||
m0 = (v0 < x0) & (v0 < x1);
|
||||
m1 = (x0 < v1) & (x1 < v1);
|
||||
m0 = m0 | ((v0 < x1) & (v0 < x2));
|
||||
m1 = m1 | ((x1 < v1) & (x2 < v1));
|
||||
m0 = m0 | ((v0 < x2) & (v0 < x3));
|
||||
m1 = m1 | ((x2 < v1) & (x3 < v1));
|
||||
m0 = m0 | ((v0 < x3) & (v0 < x0));
|
||||
m1 = m1 | ((x3 < v1) & (x0 < v1));
|
||||
m0 = m0 | m1;
|
||||
|
||||
max0 = v_max(max0, v_reinterpret_as_u8(c0));
|
||||
max1 = v_max(max1, v_reinterpret_as_u8(c1));
|
||||
}
|
||||
|
||||
max0 = v_max(max0, max1);
|
||||
int m = v_signmask(K16 < max0);
|
||||
|
||||
for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
|
||||
{
|
||||
if(m & 1)
|
||||
int mask = v_signmask(m0);
|
||||
if( mask == 0 )
|
||||
continue;
|
||||
if( (mask & 255) == 0 )
|
||||
{
|
||||
cornerpos[ncorners++] = j+k;
|
||||
if(nonmax_suppression)
|
||||
curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
|
||||
j -= 8;
|
||||
ptr -= 8;
|
||||
continue;
|
||||
}
|
||||
|
||||
v_int8x16 c0 = v_setzero_s8();
|
||||
v_int8x16 c1 = v_setzero_s8();
|
||||
v_uint8x16 max0 = v_setzero_u8();
|
||||
v_uint8x16 max1 = v_setzero_u8();
|
||||
for( k = 0; k < N; k++ )
|
||||
{
|
||||
v_int8x16 x = v_reinterpret_as_s8(v_load((ptr + pixel[k])) ^ delta);
|
||||
m0 = v0 < x;
|
||||
m1 = x < v1;
|
||||
|
||||
c0 = v_sub_wrap(c0, m0) & m0;
|
||||
c1 = v_sub_wrap(c1, m1) & m1;
|
||||
|
||||
max0 = v_max(max0, v_reinterpret_as_u8(c0));
|
||||
max1 = v_max(max1, v_reinterpret_as_u8(c1));
|
||||
}
|
||||
|
||||
max0 = v_max(max0, max1);
|
||||
int m = v_signmask(K16 < max0);
|
||||
|
||||
for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
|
||||
{
|
||||
if(m & 1)
|
||||
{
|
||||
cornerpos[ncorners++] = j+k;
|
||||
if(nonmax_suppression)
|
||||
curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
62
modules/features2d/src/fast.hpp
Normal file
62
modules/features2d/src/fast.hpp
Normal file
@ -0,0 +1,62 @@
|
||||
/* This is FAST corner detector, contributed to OpenCV by the author, Edward Rosten.
|
||||
Below is the original copyright and the references */
|
||||
|
||||
/*
|
||||
Copyright (c) 2006, 2008 Edward Rosten
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
*Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
*Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
*Neither the name of the University of Cambridge nor the names of
|
||||
its contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
|
||||
The references are:
|
||||
* Machine learning for high-speed corner detection,
|
||||
E. Rosten and T. Drummond, ECCV 2006
|
||||
* Faster and better: A machine learning approach to corner detection
|
||||
E. Rosten, R. Porter and T. Drummond, PAMI, 2009
|
||||
*/
|
||||
|
||||
#ifndef OPENCV_FEATURES2D_FAST_HPP
|
||||
#define OPENCV_FEATURES2D_FAST_HPP
|
||||
|
||||
namespace cv
|
||||
{
|
||||
namespace opt_AVX2
|
||||
{
|
||||
#if CV_TRY_AVX2
|
||||
// Abstract interface to the AVX2-accelerated row processor of the FAST detector
// (pattern size 16). Instances are obtained through getImpl().
class FAST_t_patternSize16_AVX2
|
||||
{
|
||||
public:
|
||||
// Creates an implementation object.
//   _cols               - image row width in pixels
//   _threshold          - FAST intensity threshold
//   _nonmax_suppression - whether corner scores should be produced
//   _pixel              - table of circle sample offsets relative to the centre pixel
static Ptr<FAST_t_patternSize16_AVX2> getImpl(int _cols, int _threshold, bool _nonmax_suppression, const int* _pixel);
|
||||
// Processes part of one row. j and ptr are taken by reference so the
// implementation can advance them; cornerpos / ncorners receive the detected
// corner columns and curr receives corner scores.
virtual void process(int &j, const uchar* &ptr, uchar* curr, int* cornerpos, int &ncorners) = 0;
|
||||
// Virtual so that implementations can be destroyed through a pointer to this interface.
virtual ~FAST_t_patternSize16_AVX2() {};
|
||||
};
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user