mirror of
https://github.com/opencv/opencv.git
synced 2025-06-07 17:44:04 +08:00
neon: add dotprod dispatch implementation
* read vector at runtime * add enum
This commit is contained in:
parent
2a82467a6f
commit
b3269b08a1
@ -46,7 +46,7 @@
|
|||||||
|
|
||||||
set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F")
|
set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F")
|
||||||
list(APPEND CPU_ALL_OPTIMIZATIONS "AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CLX;AVX512_ICL")
|
list(APPEND CPU_ALL_OPTIMIZATIONS "AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CLX;AVX512_ICL")
|
||||||
list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16)
|
list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD)
|
||||||
list(APPEND CPU_ALL_OPTIMIZATIONS MSA)
|
list(APPEND CPU_ALL_OPTIMIZATIONS MSA)
|
||||||
list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
|
list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
|
||||||
list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)
|
list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)
|
||||||
@ -326,6 +326,7 @@ if(X86 OR X86_64)
|
|||||||
elseif(ARM OR AARCH64)
|
elseif(ARM OR AARCH64)
|
||||||
ocv_update(CPU_NEON_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon.cpp")
|
ocv_update(CPU_NEON_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon.cpp")
|
||||||
ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp")
|
ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp")
|
||||||
|
ocv_update(CPU_NEON_DOTPROD_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_dotprod.cpp")
|
||||||
if(NOT AARCH64)
|
if(NOT AARCH64)
|
||||||
ocv_update(CPU_KNOWN_OPTIMIZATIONS "VFPV3;NEON;FP16")
|
ocv_update(CPU_KNOWN_OPTIMIZATIONS "VFPV3;NEON;FP16")
|
||||||
if(NOT MSVC)
|
if(NOT MSVC)
|
||||||
@ -337,9 +338,11 @@ elseif(ARM OR AARCH64)
|
|||||||
endif()
|
endif()
|
||||||
ocv_update(CPU_FP16_IMPLIES "NEON")
|
ocv_update(CPU_FP16_IMPLIES "NEON")
|
||||||
else()
|
else()
|
||||||
ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16")
|
ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16;NEON_DOTPROD")
|
||||||
ocv_update(CPU_NEON_FLAGS_ON "")
|
ocv_update(CPU_NEON_FLAGS_ON "")
|
||||||
ocv_update(CPU_FP16_IMPLIES "NEON")
|
ocv_update(CPU_FP16_IMPLIES "NEON")
|
||||||
|
ocv_update(CPU_NEON_DOTPROD_FLAGS_ON "-march=armv8.2-a+dotprod")
|
||||||
|
ocv_update(CPU_NEON_DOTPROD_IMPLIES "NEON")
|
||||||
set(CPU_BASELINE "NEON;FP16" CACHE STRING "${HELP_CPU_BASELINE}")
|
set(CPU_BASELINE "NEON;FP16" CACHE STRING "${HELP_CPU_BASELINE}")
|
||||||
endif()
|
endif()
|
||||||
elseif(MIPS)
|
elseif(MIPS)
|
||||||
|
24
cmake/checks/cpu_dotprod.cpp
Normal file
24
cmake/checks/cpu_dotprod.cpp
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
|
||||||
|
#include "arm_neon.h"
|
||||||
|
int test()
|
||||||
|
{
|
||||||
|
const unsigned int src[] = { 0, 0, 0, 0 };
|
||||||
|
unsigned int dst[4];
|
||||||
|
uint32x4_t v_src = *(uint32x4_t*)src;
|
||||||
|
uint8x16_t v_m0 = *(uint8x16_t*)src;
|
||||||
|
uint8x16_t v_m1 = *(uint8x16_t*)src;
|
||||||
|
uint32x4_t v_dst = vdotq_u32(v_src, v_m0, v_m1);
|
||||||
|
*(uint32x4_t*)dst = v_dst;
|
||||||
|
return (int)dst[0];
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
#error "DOTPROD is not supported"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int main()
|
||||||
|
{
|
||||||
|
printf("%d\n", test());
|
||||||
|
return 0;
|
||||||
|
}
|
@ -6,7 +6,7 @@ ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3)
|
|||||||
ocv_add_dispatched_file(convert SSE2 AVX2 VSX3)
|
ocv_add_dispatched_file(convert SSE2 AVX2 VSX3)
|
||||||
ocv_add_dispatched_file(convert_scale SSE2 AVX2)
|
ocv_add_dispatched_file(convert_scale SSE2 AVX2)
|
||||||
ocv_add_dispatched_file(count_non_zero SSE2 AVX2)
|
ocv_add_dispatched_file(count_non_zero SSE2 AVX2)
|
||||||
ocv_add_dispatched_file(matmul SSE2 SSE4_1 AVX2 AVX512_SKX)
|
ocv_add_dispatched_file(matmul SSE2 SSE4_1 AVX2 AVX512_SKX NEON_DOTPROD)
|
||||||
ocv_add_dispatched_file(mean SSE2 AVX2)
|
ocv_add_dispatched_file(mean SSE2 AVX2)
|
||||||
ocv_add_dispatched_file(merge SSE2 AVX2)
|
ocv_add_dispatched_file(merge SSE2 AVX2)
|
||||||
ocv_add_dispatched_file(split SSE2 AVX2)
|
ocv_add_dispatched_file(split SSE2 AVX2)
|
||||||
|
@ -79,6 +79,10 @@
|
|||||||
# endif
|
# endif
|
||||||
# define CV_FP16 1
|
# define CV_FP16 1
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef CV_CPU_COMPILE_NEON_DOTPROD
|
||||||
|
# include <arm_neon.h>
|
||||||
|
# define CV_NEON_DOT 1
|
||||||
|
#endif
|
||||||
#ifdef CV_CPU_COMPILE_AVX2
|
#ifdef CV_CPU_COMPILE_AVX2
|
||||||
# include <immintrin.h>
|
# include <immintrin.h>
|
||||||
# define CV_AVX2 1
|
# define CV_AVX2 1
|
||||||
|
@ -420,6 +420,27 @@
|
|||||||
#endif
|
#endif
|
||||||
#define __CV_CPU_DISPATCH_CHAIN_NEON(fn, args, mode, ...) CV_CPU_CALL_NEON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
|
#define __CV_CPU_DISPATCH_CHAIN_NEON(fn, args, mode, ...) CV_CPU_CALL_NEON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
|
||||||
|
|
||||||
|
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON_DOTPROD
|
||||||
|
# define CV_TRY_NEON_DOTPROD 1
|
||||||
|
# define CV_CPU_FORCE_NEON_DOTPROD 1
|
||||||
|
# define CV_CPU_HAS_SUPPORT_NEON_DOTPROD 1
|
||||||
|
# define CV_CPU_CALL_NEON_DOTPROD(fn, args) return (cpu_baseline::fn args)
|
||||||
|
# define CV_CPU_CALL_NEON_DOTPROD_(fn, args) return (opt_NEON_DOTPROD::fn args)
|
||||||
|
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON_DOTPROD
|
||||||
|
# define CV_TRY_NEON_DOTPROD 1
|
||||||
|
# define CV_CPU_FORCE_NEON_DOTPROD 0
|
||||||
|
# define CV_CPU_HAS_SUPPORT_NEON_DOTPROD (cv::checkHardwareSupport(CV_CPU_NEON_DOTPROD))
|
||||||
|
# define CV_CPU_CALL_NEON_DOTPROD(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_DOTPROD) return (opt_NEON_DOTPROD::fn args)
|
||||||
|
# define CV_CPU_CALL_NEON_DOTPROD_(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_DOTPROD) return (opt_NEON_DOTPROD::fn args)
|
||||||
|
#else
|
||||||
|
# define CV_TRY_NEON_DOTPROD 0
|
||||||
|
# define CV_CPU_FORCE_NEON_DOTPROD 0
|
||||||
|
# define CV_CPU_HAS_SUPPORT_NEON_DOTPROD 0
|
||||||
|
# define CV_CPU_CALL_NEON_DOTPROD(fn, args)
|
||||||
|
# define CV_CPU_CALL_NEON_DOTPROD_(fn, args)
|
||||||
|
#endif
|
||||||
|
#define __CV_CPU_DISPATCH_CHAIN_NEON_DOTPROD(fn, args, mode, ...) CV_CPU_CALL_NEON_DOTPROD(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
|
||||||
|
|
||||||
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_MSA
|
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_MSA
|
||||||
# define CV_TRY_MSA 1
|
# define CV_TRY_MSA 1
|
||||||
# define CV_CPU_FORCE_MSA 1
|
# define CV_CPU_FORCE_MSA 1
|
||||||
|
@ -282,6 +282,7 @@ namespace cv {
|
|||||||
#define CV_CPU_AVX_5124FMAPS 27
|
#define CV_CPU_AVX_5124FMAPS 27
|
||||||
|
|
||||||
#define CV_CPU_NEON 100
|
#define CV_CPU_NEON 100
|
||||||
|
#define CV_CPU_NEON_DOTPROD 101
|
||||||
|
|
||||||
#define CV_CPU_MSA 150
|
#define CV_CPU_MSA 150
|
||||||
|
|
||||||
@ -334,6 +335,7 @@ enum CpuFeatures {
|
|||||||
CPU_AVX_5124FMAPS = 27,
|
CPU_AVX_5124FMAPS = 27,
|
||||||
|
|
||||||
CPU_NEON = 100,
|
CPU_NEON = 100,
|
||||||
|
CPU_NEON_DOTPROD = 101,
|
||||||
|
|
||||||
CPU_MSA = 150,
|
CPU_MSA = 150,
|
||||||
|
|
||||||
|
@ -78,8 +78,6 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
|
|||||||
#define CV_NEON_AARCH64 0
|
#define CV_NEON_AARCH64 0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// TODO
|
|
||||||
#define CV_NEON_DOT 0
|
|
||||||
|
|
||||||
//////////// Utils ////////////
|
//////////// Utils ////////////
|
||||||
|
|
||||||
@ -667,11 +665,22 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 8 >> 32
|
// 8 >> 32
|
||||||
|
#ifdef CV_NEON_DOT
|
||||||
|
#define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(_Tpvec1, _Tpvec2, suffix) \
|
||||||
|
inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b) \
|
||||||
|
{ \
|
||||||
|
return _Tpvec1(vdotq_##suffix(vdupq_n_##suffix(0), a.val, b.val));\
|
||||||
|
} \
|
||||||
|
inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \
|
||||||
|
{ \
|
||||||
|
return _Tpvec1(vdotq_##suffix(c.val, a.val, b.val)); \
|
||||||
|
}
|
||||||
|
|
||||||
|
OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(v_uint32x4, v_uint8x16, u32)
|
||||||
|
OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(v_int32x4, v_int8x16, s32)
|
||||||
|
#else
|
||||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
|
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
|
||||||
{
|
{
|
||||||
#if CV_NEON_DOT
|
|
||||||
return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
|
|
||||||
#else
|
|
||||||
const uint8x16_t zero = vreinterpretq_u8_u32(vdupq_n_u32(0));
|
const uint8x16_t zero = vreinterpretq_u8_u32(vdupq_n_u32(0));
|
||||||
const uint8x16_t mask = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF));
|
const uint8x16_t mask = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF));
|
||||||
const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0));
|
const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0));
|
||||||
@ -687,23 +696,15 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
|
|||||||
uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16),
|
uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16),
|
||||||
vshrq_n_u32(vreinterpretq_u32_u16(odd), 16));
|
vshrq_n_u32(vreinterpretq_u32_u16(odd), 16));
|
||||||
return v_uint32x4(vaddq_u32(s0, s1));
|
return v_uint32x4(vaddq_u32(s0, s1));
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
|
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
|
||||||
const v_uint32x4& c)
|
const v_uint32x4& c)
|
||||||
{
|
{
|
||||||
#if CV_NEON_DOT
|
|
||||||
return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
|
|
||||||
#else
|
|
||||||
return v_dotprod_expand(a, b) + c;
|
return v_dotprod_expand(a, b) + c;
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
|
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
|
||||||
{
|
{
|
||||||
#if CV_NEON_DOT
|
|
||||||
return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
|
|
||||||
#else
|
|
||||||
int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
|
int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
|
||||||
int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
|
int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
|
||||||
int16x8_t uzp1, uzp2;
|
int16x8_t uzp1, uzp2;
|
||||||
@ -712,18 +713,13 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
|
|||||||
int16x4_t uzpl1, uzpl2;
|
int16x4_t uzpl1, uzpl2;
|
||||||
_v128_unzip(vget_low_s16(sum), vget_high_s16(sum), uzpl1, uzpl2);
|
_v128_unzip(vget_low_s16(sum), vget_high_s16(sum), uzpl1, uzpl2);
|
||||||
return v_int32x4(vaddl_s16(uzpl1, uzpl2));
|
return v_int32x4(vaddl_s16(uzpl1, uzpl2));
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
|
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
|
||||||
const v_int32x4& c)
|
const v_int32x4& c)
|
||||||
{
|
{
|
||||||
#if CV_NEON_DOT
|
|
||||||
return v_int32x4(vdotq_s32(c.val, a.val, b.val));
|
|
||||||
#else
|
|
||||||
return v_dotprod_expand(a, b) + c;
|
return v_dotprod_expand(a, b) + c;
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
// 16 >> 64
|
// 16 >> 64
|
||||||
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
|
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
|
||||||
{
|
{
|
||||||
@ -832,45 +828,44 @@ inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 8 >> 32
|
// 8 >> 32
|
||||||
|
#ifdef CV_NEON_DOT
|
||||||
|
#define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(_Tpvec1, _Tpvec2, suffix) \
|
||||||
|
inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b) \
|
||||||
|
{ \
|
||||||
|
return v_dotprod_expand(a, b); \
|
||||||
|
} \
|
||||||
|
inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \
|
||||||
|
{ \
|
||||||
|
return v_dotprod_expand(a, b, c); \
|
||||||
|
}
|
||||||
|
|
||||||
|
OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(v_uint32x4, v_uint8x16, u32)
|
||||||
|
OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(v_int32x4, v_int8x16, s32)
|
||||||
|
#else
|
||||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
|
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
|
||||||
{
|
{
|
||||||
#if CV_NEON_DOT
|
|
||||||
return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
|
|
||||||
#else
|
|
||||||
uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
|
uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
|
||||||
uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
|
uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
|
||||||
uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1));
|
uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1));
|
||||||
uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1));
|
uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1));
|
||||||
return v_uint32x4(vaddq_u32(s0, s1));
|
return v_uint32x4(vaddq_u32(s0, s1));
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
|
||||||
{
|
{
|
||||||
#if CV_NEON_DOT
|
|
||||||
return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
|
|
||||||
#else
|
|
||||||
return v_dotprod_expand_fast(a, b) + c;
|
return v_dotprod_expand_fast(a, b) + c;
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
|
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
|
||||||
{
|
{
|
||||||
#if CV_NEON_DOT
|
|
||||||
return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
|
|
||||||
#else
|
|
||||||
int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
|
int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
|
||||||
prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val));
|
prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val));
|
||||||
return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod)));
|
return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod)));
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||||
{
|
{
|
||||||
#if CV_NEON_DOT
|
|
||||||
return v_int32x4(vdotq_s32(c.val, a.val, b.val));
|
|
||||||
#else
|
|
||||||
return v_dotprod_expand_fast(a, b) + c;
|
return v_dotprod_expand_fast(a, b) + c;
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// 16 >> 64
|
// 16 >> 64
|
||||||
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
|
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
|
||||||
|
@ -411,6 +411,7 @@ struct HWFeatures
|
|||||||
g_hwFeatureNames[CPU_AVX_5124FMAPS] = "AVX5124FMAPS";
|
g_hwFeatureNames[CPU_AVX_5124FMAPS] = "AVX5124FMAPS";
|
||||||
|
|
||||||
g_hwFeatureNames[CPU_NEON] = "NEON";
|
g_hwFeatureNames[CPU_NEON] = "NEON";
|
||||||
|
g_hwFeatureNames[CPU_NEON_DOTPROD] = "NEON_DOTPROD";
|
||||||
|
|
||||||
g_hwFeatureNames[CPU_VSX] = "VSX";
|
g_hwFeatureNames[CPU_VSX] = "VSX";
|
||||||
g_hwFeatureNames[CPU_VSX3] = "VSX3";
|
g_hwFeatureNames[CPU_VSX3] = "VSX3";
|
||||||
@ -555,6 +556,24 @@ struct HWFeatures
|
|||||||
#ifdef __aarch64__
|
#ifdef __aarch64__
|
||||||
have[CV_CPU_NEON] = true;
|
have[CV_CPU_NEON] = true;
|
||||||
have[CV_CPU_FP16] = true;
|
have[CV_CPU_FP16] = true;
|
||||||
|
int cpufile = open("/proc/self/auxv", O_RDONLY);
|
||||||
|
|
||||||
|
if (cpufile >= 0)
|
||||||
|
{
|
||||||
|
Elf64_auxv_t auxv;
|
||||||
|
const size_t size_auxv_t = sizeof(auxv);
|
||||||
|
|
||||||
|
while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t)
|
||||||
|
{
|
||||||
|
if (auxv.a_type == AT_HWCAP)
|
||||||
|
{
|
||||||
|
have[CV_CPU_NEON_DOTPROD] = (auxv.a_un.a_val & (1 << 20)) != 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
close(cpufile);
|
||||||
|
}
|
||||||
#elif defined __arm__ && defined __ANDROID__
|
#elif defined __arm__ && defined __ANDROID__
|
||||||
#if defined HAVE_CPUFEATURES
|
#if defined HAVE_CPUFEATURES
|
||||||
CV_LOG_INFO(NULL, "calling android_getCpuFeatures() ...");
|
CV_LOG_INFO(NULL, "calling android_getCpuFeatures() ...");
|
||||||
|
Loading…
Reference in New Issue
Block a user