Mirror of https://github.com/opencv/opencv.git (synced 2025-06-10 11:03:03 +08:00)
Merge pull request #14110 from seiko2plus:core_vsx_fp16
This commit is contained in commit 4e60db9030.
@@ -294,11 +294,18 @@ endif()
 # workaround gcc bug for aligned ld/st
 # https://github.com/opencv/opencv/issues/13211
 if((PPC64LE AND NOT CMAKE_CROSSCOMPILING) OR OPENCV_FORCE_COMPILER_CHECK_VSX_ALIGNED)
-  ocv_check_runtime_flag("${CPU_BASELINE_FLAGS}" "OPENCV_CHECK_VSX_ALIGNED" "${OpenCV_SOURCE_DIR}/cmake/checks/runtime/cpu_vsx_aligned.cpp")
+  ocv_check_runtime_flag("${CPU_BASELINE_FLAGS}" OPENCV_CHECK_VSX_ALIGNED "${OpenCV_SOURCE_DIR}/cmake/checks/runtime/cpu_vsx_aligned.cpp")
   if(NOT OPENCV_CHECK_VSX_ALIGNED)
     add_extra_compiler_option_force(-DCV_COMPILER_VSX_BROKEN_ALIGNED)
   endif()
 endif()
+# validate inline asm with fixed register numbers and the constraints wa, wd, wf
+if(PPC64LE)
+  ocv_check_compiler_flag(CXX "${CPU_BASELINE_FLAGS}" OPENCV_CHECK_VSX_ASM "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx_asm.cpp")
+  if(NOT OPENCV_CHECK_VSX_ASM)
+    add_extra_compiler_option_force(-DCV_COMPILER_VSX_BROKEN_ASM)
+  endif()
+endif()
 
 # combine all "extra" options
 if(NOT OPENCV_SKIP_EXTRA_COMPILER_FLAGS)
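Both checks feed add_extra_compiler_option_force(): a failed probe simply defines CV_COMPILER_VSX_BROKEN_ALIGNED or CV_COMPILER_VSX_BROKEN_ASM so the intrinsics headers can pick a safer code path. The aligned-load/store probe itself (cpu_vsx_aligned.cpp) is not part of this diff; the snippet below is only a rough sketch of the kind of round-trip test such a runtime check could perform, not the actual file (buffer size and offsets are our own choices):

#include <altivec.h>
#include <cstring>

// Store a vector at a deliberately unaligned offset and load it back; a compiler
// that miscompiles VSX aligned/unaligned load-store (see issue 13211) would fail
// this round trip at run time.
int main()
{
    unsigned char buf[32] __attribute__((aligned(16)));
    memset(buf, 0, sizeof(buf));

    __vector unsigned char v = vec_splats((unsigned char)0x5A);
    vec_xst(v, 4, buf);                        // store at +4 bytes (not 16-byte aligned)
    __vector unsigned char r = vec_xl(4, buf); // load it back from the same offset

    unsigned char out[16];
    memcpy(out, &r, 16);
    for (int i = 0; i < 16; i++)
        if (buf[4 + i] != 0x5A || out[i] != 0x5A)
            return 1;                          // mismatch: the check fails
    return 0;                                  // success: OPENCV_CHECK_VSX_ALIGNED is set
}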
cmake/checks/cpu_vsx_asm.cpp (new file, 21 lines)
@@ -0,0 +1,21 @@
+#if defined(__VSX__)
+    #if defined(__PPC64__) && defined(__LITTLE_ENDIAN__)
+        #include <altivec.h>
+    #else
+        #error "OpenCV only supports little-endian mode"
+    #endif
+#else
+    #error "VSX is not supported"
+#endif
+
+/*
+ * xlc and wide versions of clang don't support %x<n> in the inline asm template, which fixes the register number
+ * when using any of the register constraints wa, wd, wf
+ */
+int main()
+{
+    __vector float vf;
+    __vector signed int vi;
+    __asm__ __volatile__ ("xvcvsxwsp %x0,%x1" : "=wf" (vf) : "wa" (vi));
+    return 0;
+}
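A note on what this probe exercises: the %x<n> operand modifier makes the compiler print the full VSX register number (0-63), which the wa/wd/wf constraints require; xlc and some clang releases reject it, and that is exactly what this compile test detects. When it fails, OPENCV_CHECK_VSX_ASM is false and -DCV_COMPILER_VSX_BROKEN_ASM is added, so consuming code is expected to branch roughly like the hedged sketch below (the function name is hypothetical; vec_ctf is the standard AltiVec builtin for this conversion):

#include <altivec.h>

// Same int32 -> float32 conversion as the probe, but guarded: use the
// fixed-register asm template only when the compiler accepts %x<n>.
__vector float cvt_s32_to_f32(__vector signed int vi)
{
#if !defined(CV_COMPILER_VSX_BROKEN_ASM)
    __vector float vf;
    __asm__ __volatile__ ("xvcvsxwsp %x0,%x1" : "=wf" (vf) : "wa" (vi));
    return vf;
#else
    return vec_ctf(vi, 0);   // builtin fallback, no operand modifier involved
#endif
}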
@@ -3,7 +3,7 @@ set(the_description "The Core Functionality")
 ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2)
 ocv_add_dispatched_file(stat SSE4_2 AVX2)
 ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3)
-ocv_add_dispatched_file(convert SSE2 AVX2)
+ocv_add_dispatched_file(convert SSE2 AVX2 VSX3)
 ocv_add_dispatched_file(convert_scale SSE2 AVX2)
 ocv_add_dispatched_file(count_non_zero SSE2 AVX2)
 ocv_add_dispatched_file(matmul SSE2 AVX2)
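Adding VSX3 here asks the build to compile an extra VSX3-specific variant of the convert kernels that can be selected at run time on POWER9. The generated dispatch machinery is OpenCV-specific and not shown in this diff; the sketch below is only a generic illustration of the runtime-dispatch idea, not OpenCV's actual CV_CPU_DISPATCH plumbing (all names are invented):

// Baseline and "VSX3" variants of the same kernel (illustrative only).
static void convert_baseline(const float* src, float* dst, int n) { for (int i = 0; i < n; i++) dst[i] = src[i]; }
static void convert_vsx3    (const float* src, float* dst, int n) { for (int i = 0; i < n; i++) dst[i] = src[i]; }

// Pretend CPU-feature query; OpenCV has its own (cv::checkHardwareSupport and friends).
static bool cpu_has_vsx3() { return false; }

void convert(const float* src, float* dst, int n)
{
    // Pick the best available implementation once, then call through it.
    static auto impl = cpu_has_vsx3() ? convert_vsx3 : convert_baseline;
    impl(src, dst, n);
}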
@@ -11,11 +11,6 @@
 #define CV_SIMD128 1
 #define CV_SIMD128_64F 1
 
-/**
-* todo: supporting half precision for power9
-* convert instractions xvcvhpsp, xvcvsphp
-**/
-
 namespace cv
 {
 
@@ -1203,20 +1198,62 @@ inline v_float32x4 v_pack_triplets(const v_float32x4& vec)
 
 /////// FP16 support ////////
 
-// [TODO] implement these 2 using VSX or universal intrinsics (copy from intrin_sse.cpp and adopt)
 inline v_float32x4 v_load_expand(const float16_t* ptr)
 {
-    return v_float32x4((float)ptr[0], (float)ptr[1], (float)ptr[2], (float)ptr[3]);
+    vec_ushort8 vf16 = vec_ld_l8((const ushort*)ptr);
+#if CV_VSX3 && defined(vec_extract_fp_from_shorth)
+    return v_float32x4(vec_extract_fp_from_shorth(vf16));
+#elif CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
+    vec_float4 vf32;
+    __asm__ __volatile__ ("xvcvhpsp %x0,%x1" : "=wf" (vf32) : "wa" (vec_mergeh(vf16, vf16)));
+    return v_float32x4(vf32);
+#else
+    const vec_int4 z = vec_int4_z, delta = vec_int4_sp(0x38000000);
+    const vec_int4 signmask = vec_int4_sp(0x80000000);
+    const vec_int4 maxexp = vec_int4_sp(0x7c000000);
+    const vec_float4 deltaf = vec_float4_c(vec_int4_sp(0x38800000));
+
+    vec_int4 bits = vec_int4_c(vec_mergeh(vec_short8_c(z), vec_short8_c(vf16)));
+    vec_int4 e = vec_and(bits, maxexp), sign = vec_and(bits, signmask);
+    vec_int4 t = vec_add(vec_sr(vec_xor(bits, sign), vec_uint4_sp(3)), delta); // ((h & 0x7fff) << 13) + delta
+    vec_int4 zt = vec_int4_c(vec_sub(vec_float4_c(vec_add(t, vec_int4_sp(1 << 23))), deltaf));
+
+    t = vec_add(t, vec_and(delta, vec_cmpeq(maxexp, e)));
+    vec_bint4 zmask = vec_cmpeq(e, z);
+    vec_int4 ft = vec_sel(t, zt, zmask);
+    return v_float32x4(vec_float4_c(vec_or(ft, sign)));
+#endif
 }
 
 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
 {
-    float CV_DECL_ALIGNED(32) f[4];
-    v_store_aligned(f, v);
-    ptr[0] = float16_t(f[0]);
-    ptr[1] = float16_t(f[1]);
-    ptr[2] = float16_t(f[2]);
-    ptr[3] = float16_t(f[3]);
+// fixme: Is there any builtin op or intrinsic that covers "xvcvsphp"?
+#if CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
+    vec_ushort8 vf16;
+    __asm__ __volatile__ ("xvcvsphp %x0,%x1" : "=wa" (vf16) : "wf" (v.val));
+    vec_st_l8(vec_mergesqe(vf16, vf16), ptr);
+#else
+    const vec_int4 signmask = vec_int4_sp(0x80000000);
+    const vec_int4 rval = vec_int4_sp(0x3f000000);
+
+    vec_int4 t = vec_int4_c(v.val);
+    vec_int4 sign = vec_sra(vec_and(t, signmask), vec_uint4_sp(16));
+    t = vec_and(vec_nor(signmask, signmask), t);
+
+    vec_bint4 finitemask = vec_cmpgt(vec_int4_sp(0x47800000), t);
+    vec_bint4 isnan = vec_cmpgt(t, vec_int4_sp(0x7f800000));
+    vec_int4 naninf = vec_sel(vec_int4_sp(0x7c00), vec_int4_sp(0x7e00), isnan);
+    vec_bint4 tinymask = vec_cmpgt(vec_int4_sp(0x38800000), t);
+    vec_int4 tt = vec_int4_c(vec_add(vec_float4_c(t), vec_float4_c(rval)));
+    tt = vec_sub(tt, rval);
+    vec_int4 odd = vec_and(vec_sr(t, vec_uint4_sp(13)), vec_int4_sp(1));
+    vec_int4 nt = vec_add(t, vec_int4_sp(0xc8000fff));
+    nt = vec_sr(vec_add(nt, odd), vec_uint4_sp(13));
+    t = vec_sel(nt, tt, tinymask);
+    t = vec_sel(naninf, t, finitemask);
+    t = vec_or(t, sign);
+    vec_st_l8(vec_packs(t, t), ptr);
+#endif
 }
 
 inline void v_cleanup() {}
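The two #else branches above implement the FP16 conversions with integer and float vector arithmetic for targets without VSX3, or with the broken-asm workaround active. The same bit manipulation is easier to follow one lane at a time; the sketch below is our scalar rendering of that logic, added here purely for illustration (the helper names are not OpenCV API), covering normals, subnormals, and Inf/NaN:

#include <cstdio>
#include <cstring>

// Decode one IEEE 754 half (raw bits) to float, mirroring the #else branch of
// v_load_expand: rebias the exponent by (127-15)<<23 = 0x38000000, renormalize
// zeros/subnormals with a float subtraction, and widen Inf/NaN.
static float half_bits_to_float(unsigned short h)
{
    unsigned sign = (unsigned)(h & 0x8000) << 16;
    unsigned em   = (unsigned)(h & 0x7fff) << 13;   // exponent+mantissa in float position
    unsigned e    = (unsigned)(h & 0x7c00) << 13;   // exponent field alone

    unsigned t = em + 0x38000000u;                   // rebias exponent 15 -> 127
    if (e == (0x7c00u << 13))                        // Inf/NaN: push exponent up to 255
        t += 0x38000000u;
    if (e == 0)                                      // zero/subnormal: renormalize via FP math
    {
        unsigned bits = t + (1u << 23);
        float f;
        memcpy(&f, &bits, 4);
        f -= 6.103515625e-05f;                       // subtract 2^-14 (bit pattern 0x38800000)
        memcpy(&t, &f, 4);
    }
    t |= sign;
    float out;
    memcpy(&out, &t, 4);
    return out;
}

// Encode one float to half bits, mirroring the #else branch of v_pack_store:
// saturate overflow/NaN, let an FP add round the subnormal mantissa, and shift
// with a round-to-nearest-even correction for the normal range.
static unsigned short float_to_half_bits(float x)
{
    unsigned t;
    memcpy(&t, &x, 4);
    unsigned sign = (t & 0x80000000u) >> 16;
    t &= 0x7fffffffu;                                // work on |x|

    unsigned h;
    if (t >= 0x47800000u)                            // >= 65536, Inf, or NaN
        h = (t > 0x7f800000u) ? 0x7e00 : 0x7c00;     // NaN : Inf
    else if (t < 0x38800000u)                        // < 2^-14: zero or half subnormal
    {
        float f;
        memcpy(&f, &t, 4);
        f += 0.5f;                                   // hardware rounds the mantissa for us
        unsigned bits;
        memcpy(&bits, &f, 4);
        h = bits - 0x3f000000u;                      // remove the 0.5f bit pattern
    }
    else                                             // normal range
    {
        unsigned odd = (t >> 13) & 1;                // round-to-nearest-even tie-break
        h = (t + 0xc8000fffu + odd) >> 13;           // rebias exponent and round
    }
    return (unsigned short)(h | sign);
}

int main()
{
    const float vals[] = { 0.0f, -0.0f, 1.0f, -2.5f, 65504.0f, 1e-8f, 3.1415926f };
    for (float v : vals)
    {
        unsigned short h = float_to_half_bits(v);
        printf("% .8g -> 0x%04x -> % .8g\n", v, h, half_bits_to_float(h));
    }
    return 0;
}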
@@ -291,6 +291,8 @@ VSX_IMPL_1RG(vec_udword2, wi, vec_float4, wf, xvcvspuxds, vec_ctulo)
 *
 * So we're not able to use inline asm and can only use the built-in functions that CLANG supports,
 * and use __builtin_convertvector if clang is missing any of the vector-conversion built-in functions
+*
+* todo: the clang asm template bug is fixed; we need to reconsider the current workarounds.
 */
 
 // convert vector helper
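__builtin_convertvector, mentioned above, is the clang (and gcc 9+) generic element-wise conversion: it takes a vector value and a destination vector type with the same number of lanes and converts lane by lane, letting the compiler choose the instructions. A minimal standalone illustration, not an OpenCV helper (the function name is ours):

#include <altivec.h>

// Convert two signed 64-bit lanes to two doubles without inline asm or a
// dedicated AltiVec builtin; the compiler selects the VSX instruction(s).
__vector double cvt_s64_to_f64(__vector signed long long v)
{
    return __builtin_convertvector(v, __vector double);
}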