use universal intrinsic in FAST

Tomoaki Teshima 2017-09-21 18:28:15 +09:00
parent 6a5298a532
commit bf718b0865
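Note: this commit rewrites the three cornerScore<N> kernels (in what appears to be fast_score.cpp, given the include) from raw SSE2 intrinsics to OpenCV's universal intrinsics from opencv2/core/hal/intrin.hpp, so the same source vectorizes on any backend that provides CV_SIMD128 (SSE2, NEON, VSX) rather than on x86 only. As a rough sketch of the mapping, not part of the commit (the helper function and its inputs are hypothetical):

    #include "opencv2/core/hal/intrin.hpp"

    using namespace cv;

    // Illustrative only: how the removed SSE2 intrinsics correspond to the
    // universal-intrinsic API used in this commit. `d` must hold >= 8 shorts.
    static short sketch_mapping(const short* d)
    {
        v_int16x8 v0 = v_load(d);               // _mm_loadu_si128((__m128i*)d)
        v_int16x8 q0 = v_setall_s16(-1000);     // _mm_set1_epi16(-1000)
        v_int16x8 q1 = v_setall_s16(1000);      // _mm_set1_epi16(1000)
        q0 = v_max(q0, v_min(q1, v0));          // _mm_max_epi16 / _mm_min_epi16
        q1 = v_min(q1, v_max(q0, v0));
        q0 = v_max(q0, v_setzero_s16() - q1);   // _mm_sub_epi16(_mm_setzero_si128(), q1)
        return (short)(v_reduce_max(q0) - 1);   // replaces the unpackhi/srli/cvtsi ladder
    }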


@@ -42,7 +42,7 @@ The references are:
 */
 #include "fast_score.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 #define VERIFY_CORNERS 0
 
 namespace cv {
@@ -125,45 +125,48 @@ int cornerScore<16>(const uchar* ptr, const int pixel[], int threshold)
     for( k = 0; k < N; k++ )
         d[k] = (short)(v - ptr[pixel[k]]);
 
-#if CV_SSE2
-    __m128i q0 = _mm_set1_epi16(-1000), q1 = _mm_set1_epi16(1000);
-    for (k = 0; k < 16; k += 8)
-    {
-        __m128i v0 = _mm_loadu_si128((__m128i*)(d+k+1));
-        __m128i v1 = _mm_loadu_si128((__m128i*)(d+k+2));
-        __m128i a = _mm_min_epi16(v0, v1);
-        __m128i b = _mm_max_epi16(v0, v1);
-        v0 = _mm_loadu_si128((__m128i*)(d+k+3));
-        a = _mm_min_epi16(a, v0);
-        b = _mm_max_epi16(b, v0);
-        v0 = _mm_loadu_si128((__m128i*)(d+k+4));
-        a = _mm_min_epi16(a, v0);
-        b = _mm_max_epi16(b, v0);
-        v0 = _mm_loadu_si128((__m128i*)(d+k+5));
-        a = _mm_min_epi16(a, v0);
-        b = _mm_max_epi16(b, v0);
-        v0 = _mm_loadu_si128((__m128i*)(d+k+6));
-        a = _mm_min_epi16(a, v0);
-        b = _mm_max_epi16(b, v0);
-        v0 = _mm_loadu_si128((__m128i*)(d+k+7));
-        a = _mm_min_epi16(a, v0);
-        b = _mm_max_epi16(b, v0);
-        v0 = _mm_loadu_si128((__m128i*)(d+k+8));
-        a = _mm_min_epi16(a, v0);
-        b = _mm_max_epi16(b, v0);
-        v0 = _mm_loadu_si128((__m128i*)(d+k));
-        q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0));
-        q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0));
-        v0 = _mm_loadu_si128((__m128i*)(d+k+9));
-        q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0));
-        q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0));
-    }
-    q0 = _mm_max_epi16(q0, _mm_sub_epi16(_mm_setzero_si128(), q1));
-    q0 = _mm_max_epi16(q0, _mm_unpackhi_epi64(q0, q0));
-    q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 4));
-    q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 2));
-    threshold = (short)_mm_cvtsi128_si32(q0) - 1;
-#else
+#if CV_SIMD128
+    if (hasSIMD128())
+    {
+        v_int16x8 q0 = v_setall_s16(-1000), q1 = v_setall_s16(1000);
+        for (k = 0; k < 16; k += 8)
+        {
+            v_int16x8 v0 = v_load(d + k + 1);
+            v_int16x8 v1 = v_load(d + k + 2);
+            v_int16x8 a = v_min(v0, v1);
+            v_int16x8 b = v_max(v0, v1);
+            v0 = v_load(d + k + 3);
+            a = v_min(a, v0);
+            b = v_max(b, v0);
+            v0 = v_load(d + k + 4);
+            a = v_min(a, v0);
+            b = v_max(b, v0);
+            v0 = v_load(d + k + 5);
+            a = v_min(a, v0);
+            b = v_max(b, v0);
+            v0 = v_load(d + k + 6);
+            a = v_min(a, v0);
+            b = v_max(b, v0);
+            v0 = v_load(d + k + 7);
+            a = v_min(a, v0);
+            b = v_max(b, v0);
+            v0 = v_load(d + k + 8);
+            a = v_min(a, v0);
+            b = v_max(b, v0);
+            v0 = v_load(d + k);
+            q0 = v_max(q0, v_min(a, v0));
+            q1 = v_min(q1, v_max(b, v0));
+            v0 = v_load(d + k + 9);
+            q0 = v_max(q0, v_min(a, v0));
+            q1 = v_min(q1, v_max(b, v0));
+        }
+        q0 = v_max(q0, v_setzero_s16() - q1);
+        threshold = v_reduce_max(q0) - 1;
+    }
+    else
+#endif
+    {
     int a0 = threshold;
     for( k = 0; k < 16; k += 2 )
     {
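Worth noting in the hunk above: the removed four-instruction SSE2 tail (_mm_unpackhi_epi64, two _mm_srli_si128 shifts, _mm_cvtsi128_si32) was a horizontal maximum over the eight 16-bit lanes; the universal API expresses it as a single v_reduce_max call. A scalar sketch of what that reduction computes (hypothetical helper, not part of the commit):

    #include <algorithm>
    #include "opencv2/core/hal/intrin.hpp"

    // Scalar equivalent of v_reduce_max(q0) for a v_int16x8 (illustrative).
    static short reduce_max_scalar(const cv::v_int16x8& q0)
    {
        short lanes[8];
        cv::v_store(lanes, q0);        // spill the eight 16-bit lanes
        short m = lanes[0];
        for (int i = 1; i < 8; i++)
            m = std::max(m, lanes[i]); // same result as the removed shuffle/max ladder
        return m;
    }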
@@ -198,7 +201,7 @@ int cornerScore<16>(const uchar* ptr, const int pixel[], int threshold)
     }
     threshold = -b0 - 1;
-#endif
+    }
 
 #if VERIFY_CORNERS
     testCorner(ptr, pixel, K, N, threshold);
@@ -214,44 +217,46 @@ int cornerScore<12>(const uchar* ptr, const int pixel[], int threshold)
     short d[N + 4];
     for( k = 0; k < N; k++ )
         d[k] = (short)(v - ptr[pixel[k]]);
-#if CV_SSE2
+#if CV_SIMD128
     for( k = 0; k < 4; k++ )
         d[N+k] = d[k];
 #endif
 
-#if CV_SSE2
-    __m128i q0 = _mm_set1_epi16(-1000), q1 = _mm_set1_epi16(1000);
-    for (k = 0; k < 16; k += 8)
-    {
-        __m128i v0 = _mm_loadu_si128((__m128i*)(d+k+1));
-        __m128i v1 = _mm_loadu_si128((__m128i*)(d+k+2));
-        __m128i a = _mm_min_epi16(v0, v1);
-        __m128i b = _mm_max_epi16(v0, v1);
-        v0 = _mm_loadu_si128((__m128i*)(d+k+3));
-        a = _mm_min_epi16(a, v0);
-        b = _mm_max_epi16(b, v0);
-        v0 = _mm_loadu_si128((__m128i*)(d+k+4));
-        a = _mm_min_epi16(a, v0);
-        b = _mm_max_epi16(b, v0);
-        v0 = _mm_loadu_si128((__m128i*)(d+k+5));
-        a = _mm_min_epi16(a, v0);
-        b = _mm_max_epi16(b, v0);
-        v0 = _mm_loadu_si128((__m128i*)(d+k+6));
-        a = _mm_min_epi16(a, v0);
-        b = _mm_max_epi16(b, v0);
-        v0 = _mm_loadu_si128((__m128i*)(d+k));
-        q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0));
-        q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0));
-        v0 = _mm_loadu_si128((__m128i*)(d+k+7));
-        q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0));
-        q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0));
-    }
-    q0 = _mm_max_epi16(q0, _mm_sub_epi16(_mm_setzero_si128(), q1));
-    q0 = _mm_max_epi16(q0, _mm_unpackhi_epi64(q0, q0));
-    q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 4));
-    q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 2));
-    threshold = (short)_mm_cvtsi128_si32(q0) - 1;
-#else
+#if CV_SIMD128
+    if (hasSIMD128())
+    {
+        v_int16x8 q0 = v_setall_s16(-1000), q1 = v_setall_s16(1000);
+        for (k = 0; k < 16; k += 8)
+        {
+            v_int16x8 v0 = v_load(d + k + 1);
+            v_int16x8 v1 = v_load(d + k + 2);
+            v_int16x8 a = v_min(v0, v1);
+            v_int16x8 b = v_max(v0, v1);
+            v0 = v_load(d + k + 3);
+            a = v_min(a, v0);
+            b = v_max(b, v0);
+            v0 = v_load(d + k + 4);
+            a = v_min(a, v0);
+            b = v_max(b, v0);
+            v0 = v_load(d + k + 5);
+            a = v_min(a, v0);
+            b = v_max(b, v0);
+            v0 = v_load(d + k + 6);
+            a = v_min(a, v0);
+            b = v_max(b, v0);
+            v0 = v_load(d + k);
+            q0 = v_max(q0, v_min(a, v0));
+            q1 = v_min(q1, v_max(b, v0));
+            v0 = v_load(d + k + 7);
+            q0 = v_max(q0, v_min(a, v0));
+            q1 = v_min(q1, v_max(b, v0));
+        }
+        q0 = v_max(q0, v_setzero_s16() - q1);
+        threshold = v_reduce_max(q0) - 1;
+    }
+    else
+#endif
+    {
     int a0 = threshold;
     for( k = 0; k < 12; k += 2 )
     {
@@ -282,8 +287,7 @@ int cornerScore<12>(const uchar* ptr, const int pixel[], int threshold)
     }
     threshold = -b0-1;
-#endif
+    }
 #if VERIFY_CORNERS
     testCorner(ptr, pixel, K, N, threshold);
 #endif
@@ -299,29 +303,31 @@ int cornerScore<8>(const uchar* ptr, const int pixel[], int threshold)
     for (k = 0; k < N; k++)
         d[k] = (short)(v - ptr[pixel[k]]);
 
-#if CV_SSE2
-    __m128i v0 = _mm_loadu_si128((__m128i*)(d+1));
-    __m128i v1 = _mm_loadu_si128((__m128i*)(d+2));
-    __m128i a = _mm_min_epi16(v0, v1);
-    __m128i b = _mm_max_epi16(v0, v1);
-    v0 = _mm_loadu_si128((__m128i*)(d+3));
-    a = _mm_min_epi16(a, v0);
-    b = _mm_max_epi16(b, v0);
-    v0 = _mm_loadu_si128((__m128i*)(d+4));
-    a = _mm_min_epi16(a, v0);
-    b = _mm_max_epi16(b, v0);
-    v0 = _mm_loadu_si128((__m128i*)(d));
-    __m128i q0 = _mm_min_epi16(a, v0);
-    __m128i q1 = _mm_max_epi16(b, v0);
-    v0 = _mm_loadu_si128((__m128i*)(d+5));
-    q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0));
-    q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0));
-    q0 = _mm_max_epi16(q0, _mm_sub_epi16(_mm_setzero_si128(), q1));
-    q0 = _mm_max_epi16(q0, _mm_unpackhi_epi64(q0, q0));
-    q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 4));
-    q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 2));
-    threshold = (short)_mm_cvtsi128_si32(q0) - 1;
-#else
+#if CV_SIMD128
+    if (hasSIMD128())
+    {
+        v_int16x8 v0 = v_load(d + 1);
+        v_int16x8 v1 = v_load(d + 2);
+        v_int16x8 a = v_min(v0, v1);
+        v_int16x8 b = v_max(v0, v1);
+        v0 = v_load(d + 3);
+        a = v_min(a, v0);
+        b = v_max(b, v0);
+        v0 = v_load(d + 4);
+        a = v_min(a, v0);
+        b = v_max(b, v0);
+        v0 = v_load(d);
+        v_int16x8 q0 = v_min(a, v0);
+        v_int16x8 q1 = v_max(b, v0);
+        v0 = v_load(d + 5);
+        q0 = v_max(q0, v_min(a, v0));
+        q1 = v_min(q1, v_max(b, v0));
+        q0 = v_max(q0, v_setzero_s16() - q1);
+        threshold = v_reduce_max(q0) - 1;
+    }
+    else
+#endif
+    {
     int a0 = threshold;
     for( k = 0; k < 8; k += 2 )
     {
@@ -348,7 +354,7 @@ int cornerScore<8>(const uchar* ptr, const int pixel[], int threshold)
     }
     threshold = -b0-1;
-#endif
+    }
 
 #if VERIFY_CORNERS
     testCorner(ptr, pixel, K, N, threshold);
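Across all three kernels the commit uses the same dispatch shape: a compile-time CV_SIMD128 guard plus a runtime hasSIMD128() check, with one scalar block serving as the fallback in both cases. Reduced to its skeleton (illustrative sketch; the function and its body are placeholders, only the guard structure comes from the diff):

    #include "opencv2/core/hal/intrin.hpp"

    static int dispatchShape()
    {
        int threshold = 0;
    #if CV_SIMD128
        if (cv::hasSIMD128())   // runtime check: SIMD actually usable on this CPU
        {
            // ... universal-intrinsic path (v_int16x8, v_load, v_reduce_max) ...
            threshold = 1;
        }
        else                    // no runtime SIMD: fall through to the scalar block
    #endif
        {                       // scalar path; the only path when CV_SIMD128 is 0
            threshold = 2;
        }
        return threshold;
    }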