mirror of
https://github.com/opencv/opencv.git
synced 2024-11-24 03:00:14 +08:00
Merge pull request #24271 from Kumataro:fix24163
Fix to convert float32 to int32/uint32 with rounding to nearest (ties to even). #24271 Fix https://github.com/opencv/opencv/issues/24163 ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake (carotene is BSD)
This commit is contained in:
parent
d9d402916a
commit
dba7186378
8
3rdparty/carotene/CMakeLists.txt
vendored
8
3rdparty/carotene/CMakeLists.txt
vendored
@ -42,6 +42,14 @@ endif()
|
|||||||
|
|
||||||
if(WITH_NEON)
|
if(WITH_NEON)
|
||||||
target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")
|
target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")
|
||||||
|
if(NOT DEFINED CAROTENE_NEON_ARCH )
|
||||||
|
elseif(CAROTENE_NEON_ARCH EQUAL 8)
|
||||||
|
target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=8")
|
||||||
|
elseif(CAROTENE_NEON_ARCH EQUAL 7)
|
||||||
|
target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=7")
|
||||||
|
else()
|
||||||
|
target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=0")
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# we add dummy file to fix XCode build
|
# we add dummy file to fix XCode build
|
||||||
|
13
3rdparty/carotene/src/add_weighted.cpp
vendored
13
3rdparty/carotene/src/add_weighted.cpp
vendored
@ -39,6 +39,7 @@
|
|||||||
|
|
||||||
#include "common.hpp"
|
#include "common.hpp"
|
||||||
#include "vtransform.hpp"
|
#include "vtransform.hpp"
|
||||||
|
#include "vround_helper.hpp"
|
||||||
|
|
||||||
namespace CAROTENE_NS {
|
namespace CAROTENE_NS {
|
||||||
|
|
||||||
@ -106,7 +107,7 @@ template <> struct wAdd<s32>
|
|||||||
{
|
{
|
||||||
valpha = vdupq_n_f32(_alpha);
|
valpha = vdupq_n_f32(_alpha);
|
||||||
vbeta = vdupq_n_f32(_beta);
|
vbeta = vdupq_n_f32(_beta);
|
||||||
vgamma = vdupq_n_f32(_gamma + 0.5);
|
vgamma = vdupq_n_f32(_gamma);
|
||||||
}
|
}
|
||||||
|
|
||||||
void operator() (const VecTraits<s32>::vec128 & v_src0,
|
void operator() (const VecTraits<s32>::vec128 & v_src0,
|
||||||
@ -118,7 +119,7 @@ template <> struct wAdd<s32>
|
|||||||
|
|
||||||
vs1 = vmlaq_f32(vgamma, vs1, valpha);
|
vs1 = vmlaq_f32(vgamma, vs1, valpha);
|
||||||
vs1 = vmlaq_f32(vs1, vs2, vbeta);
|
vs1 = vmlaq_f32(vs1, vs2, vbeta);
|
||||||
v_dst = vcvtq_s32_f32(vs1);
|
v_dst = vroundq_s32_f32(vs1);
|
||||||
}
|
}
|
||||||
|
|
||||||
void operator() (const VecTraits<s32>::vec64 & v_src0,
|
void operator() (const VecTraits<s32>::vec64 & v_src0,
|
||||||
@ -130,7 +131,7 @@ template <> struct wAdd<s32>
|
|||||||
|
|
||||||
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
|
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
|
||||||
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
|
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
|
||||||
v_dst = vcvt_s32_f32(vs1);
|
v_dst = vround_s32_f32(vs1);
|
||||||
}
|
}
|
||||||
|
|
||||||
void operator() (const s32 * src0, const s32 * src1, s32 * dst) const
|
void operator() (const s32 * src0, const s32 * src1, s32 * dst) const
|
||||||
@ -150,7 +151,7 @@ template <> struct wAdd<u32>
|
|||||||
{
|
{
|
||||||
valpha = vdupq_n_f32(_alpha);
|
valpha = vdupq_n_f32(_alpha);
|
||||||
vbeta = vdupq_n_f32(_beta);
|
vbeta = vdupq_n_f32(_beta);
|
||||||
vgamma = vdupq_n_f32(_gamma + 0.5);
|
vgamma = vdupq_n_f32(_gamma);
|
||||||
}
|
}
|
||||||
|
|
||||||
void operator() (const VecTraits<u32>::vec128 & v_src0,
|
void operator() (const VecTraits<u32>::vec128 & v_src0,
|
||||||
@ -162,7 +163,7 @@ template <> struct wAdd<u32>
|
|||||||
|
|
||||||
vs1 = vmlaq_f32(vgamma, vs1, valpha);
|
vs1 = vmlaq_f32(vgamma, vs1, valpha);
|
||||||
vs1 = vmlaq_f32(vs1, vs2, vbeta);
|
vs1 = vmlaq_f32(vs1, vs2, vbeta);
|
||||||
v_dst = vcvtq_u32_f32(vs1);
|
v_dst = vroundq_u32_f32(vs1);
|
||||||
}
|
}
|
||||||
|
|
||||||
void operator() (const VecTraits<u32>::vec64 & v_src0,
|
void operator() (const VecTraits<u32>::vec64 & v_src0,
|
||||||
@ -174,7 +175,7 @@ template <> struct wAdd<u32>
|
|||||||
|
|
||||||
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
|
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
|
||||||
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
|
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
|
||||||
v_dst = vcvt_u32_f32(vs1);
|
v_dst = vround_u32_f32(vs1);
|
||||||
}
|
}
|
||||||
|
|
||||||
void operator() (const u32 * src0, const u32 * src1, u32 * dst) const
|
void operator() (const u32 * src0, const u32 * src1, u32 * dst) const
|
||||||
|
15
3rdparty/carotene/src/blur.cpp
vendored
15
3rdparty/carotene/src/blur.cpp
vendored
@ -41,6 +41,7 @@
|
|||||||
|
|
||||||
#include "common.hpp"
|
#include "common.hpp"
|
||||||
#include "saturate_cast.hpp"
|
#include "saturate_cast.hpp"
|
||||||
|
#include "vround_helper.hpp"
|
||||||
|
|
||||||
namespace CAROTENE_NS {
|
namespace CAROTENE_NS {
|
||||||
|
|
||||||
@ -198,7 +199,6 @@ void blur3x3(const Size2D &size, s32 cn,
|
|||||||
//#define FLOAT_VARIANT_1_9
|
//#define FLOAT_VARIANT_1_9
|
||||||
#ifdef FLOAT_VARIANT_1_9
|
#ifdef FLOAT_VARIANT_1_9
|
||||||
float32x4_t v1_9 = vdupq_n_f32 (1.0/9.0);
|
float32x4_t v1_9 = vdupq_n_f32 (1.0/9.0);
|
||||||
float32x4_t v0_5 = vdupq_n_f32 (.5);
|
|
||||||
#else
|
#else
|
||||||
const int16x8_t vScale = vmovq_n_s16(3640);
|
const int16x8_t vScale = vmovq_n_s16(3640);
|
||||||
#endif
|
#endif
|
||||||
@ -283,8 +283,8 @@ void blur3x3(const Size2D &size, s32 cn,
|
|||||||
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
|
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
|
||||||
float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
|
float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
|
||||||
float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
|
float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
|
||||||
tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
|
tres1 = internal::vroundq_u32_f32(vf1);
|
||||||
tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
|
tres2 = internal::vroundq_u32_f32(vf2);
|
||||||
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
|
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
|
||||||
vst1_u8(drow + x - 8, vmovn_u16(t0));
|
vst1_u8(drow + x - 8, vmovn_u16(t0));
|
||||||
#else
|
#else
|
||||||
@ -445,8 +445,8 @@ void blur3x3(const Size2D &size, s32 cn,
|
|||||||
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
|
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
|
||||||
float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
|
float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
|
||||||
float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
|
float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
|
||||||
tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
|
tres1 = internal::vroundq_u32_f32(vf1);
|
||||||
tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
|
tres2 = internal::vroundq_u32_f32(vf2);
|
||||||
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
|
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
|
||||||
vst1_u8(drow + x - 8, vmovn_u16(t0));
|
vst1_u8(drow + x - 8, vmovn_u16(t0));
|
||||||
#else
|
#else
|
||||||
@ -508,7 +508,6 @@ void blur5x5(const Size2D &size, s32 cn,
|
|||||||
#define FLOAT_VARIANT_1_25
|
#define FLOAT_VARIANT_1_25
|
||||||
#ifdef FLOAT_VARIANT_1_25
|
#ifdef FLOAT_VARIANT_1_25
|
||||||
float32x4_t v1_25 = vdupq_n_f32 (1.0f/25.0f);
|
float32x4_t v1_25 = vdupq_n_f32 (1.0f/25.0f);
|
||||||
float32x4_t v0_5 = vdupq_n_f32 (.5f);
|
|
||||||
#else
|
#else
|
||||||
const int16x8_t vScale = vmovq_n_s16(1310);
|
const int16x8_t vScale = vmovq_n_s16(1310);
|
||||||
#endif
|
#endif
|
||||||
@ -752,8 +751,8 @@ void blur5x5(const Size2D &size, s32 cn,
|
|||||||
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
|
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
|
||||||
float32x4_t vf1 = vmulq_f32(v1_25, vcvtq_f32_u32(tres1));
|
float32x4_t vf1 = vmulq_f32(v1_25, vcvtq_f32_u32(tres1));
|
||||||
float32x4_t vf2 = vmulq_f32(v1_25, vcvtq_f32_u32(tres2));
|
float32x4_t vf2 = vmulq_f32(v1_25, vcvtq_f32_u32(tres2));
|
||||||
tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
|
tres1 = internal::vroundq_u32_f32(vf1);
|
||||||
tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
|
tres2 = internal::vroundq_u32_f32(vf2);
|
||||||
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
|
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
|
||||||
vst1_u8(drow + x - 8, vmovn_u16(t0));
|
vst1_u8(drow + x - 8, vmovn_u16(t0));
|
||||||
#else
|
#else
|
||||||
|
16
3rdparty/carotene/src/colorconvert.cpp
vendored
16
3rdparty/carotene/src/colorconvert.cpp
vendored
@ -40,6 +40,7 @@
|
|||||||
#include "common.hpp"
|
#include "common.hpp"
|
||||||
|
|
||||||
#include "saturate_cast.hpp"
|
#include "saturate_cast.hpp"
|
||||||
|
#include "vround_helper.hpp"
|
||||||
|
|
||||||
namespace CAROTENE_NS {
|
namespace CAROTENE_NS {
|
||||||
|
|
||||||
@ -1166,17 +1167,10 @@ inline uint8x8x3_t convertToHSV(const uint8x8_t vR, const uint8x8_t vG, const ui
|
|||||||
vSt3 = vmulq_f32(vHF1, vDivTab);
|
vSt3 = vmulq_f32(vHF1, vDivTab);
|
||||||
vSt4 = vmulq_f32(vHF2, vDivTab);
|
vSt4 = vmulq_f32(vHF2, vDivTab);
|
||||||
|
|
||||||
float32x4_t bias = vdupq_n_f32(0.5f);
|
uint32x4_t vRes1 = internal::vroundq_u32_f32(vSt1);
|
||||||
|
uint32x4_t vRes2 = internal::vroundq_u32_f32(vSt2);
|
||||||
vSt1 = vaddq_f32(vSt1, bias);
|
uint32x4_t vRes3 = internal::vroundq_u32_f32(vSt3);
|
||||||
vSt2 = vaddq_f32(vSt2, bias);
|
uint32x4_t vRes4 = internal::vroundq_u32_f32(vSt4);
|
||||||
vSt3 = vaddq_f32(vSt3, bias);
|
|
||||||
vSt4 = vaddq_f32(vSt4, bias);
|
|
||||||
|
|
||||||
uint32x4_t vRes1 = vcvtq_u32_f32(vSt1);
|
|
||||||
uint32x4_t vRes2 = vcvtq_u32_f32(vSt2);
|
|
||||||
uint32x4_t vRes3 = vcvtq_u32_f32(vSt3);
|
|
||||||
uint32x4_t vRes4 = vcvtq_u32_f32(vSt4);
|
|
||||||
|
|
||||||
int32x4_t vH_L = vmovl_s16(vget_low_s16(vDiff4));
|
int32x4_t vH_L = vmovl_s16(vget_low_s16(vDiff4));
|
||||||
int32x4_t vH_H = vmovl_s16(vget_high_s16(vDiff4));
|
int32x4_t vH_H = vmovl_s16(vget_high_s16(vDiff4));
|
||||||
|
11
3rdparty/carotene/src/common.hpp
vendored
11
3rdparty/carotene/src/common.hpp
vendored
@ -58,6 +58,17 @@
|
|||||||
|
|
||||||
namespace CAROTENE_NS { namespace internal {
|
namespace CAROTENE_NS { namespace internal {
|
||||||
|
|
||||||
|
#ifndef CAROTENE_NEON_ARCH
|
||||||
|
# if defined(__aarch64__) || defined(__aarch32__)
|
||||||
|
# define CAROTENE_NEON_ARCH 8
|
||||||
|
# else
|
||||||
|
# define CAROTENE_NEON_ARCH 7
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
#if ( !defined(__aarch64__) && !defined(__aarch32__) ) && (CAROTENE_NEON_ARCH == 8 )
|
||||||
|
# error("ARMv7 doen't support A32/A64 Neon instructions")
|
||||||
|
#endif
|
||||||
|
|
||||||
inline void prefetch(const void *ptr, size_t offset = 32*10)
|
inline void prefetch(const void *ptr, size_t offset = 32*10)
|
||||||
{
|
{
|
||||||
#if defined __GNUC__
|
#if defined __GNUC__
|
||||||
|
215
3rdparty/carotene/src/convert_scale.cpp
vendored
215
3rdparty/carotene/src/convert_scale.cpp
vendored
@ -38,6 +38,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include "common.hpp"
|
#include "common.hpp"
|
||||||
|
#include "vround_helper.hpp"
|
||||||
|
|
||||||
namespace CAROTENE_NS {
|
namespace CAROTENE_NS {
|
||||||
|
|
||||||
@ -185,7 +186,7 @@ CVTS_FUNC1(u8, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC1(u8, 16,
|
CVTS_FUNC1(u8, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 16)
|
for (size_t i = 0; i < w; i += 16)
|
||||||
{
|
{
|
||||||
@ -209,10 +210,10 @@ CVTS_FUNC1(u8, 16,
|
|||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
||||||
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
|
int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
|
||||||
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
|
int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
|
||||||
uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
|
uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
|
||||||
uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
|
uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
|
||||||
vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
|
vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
|
||||||
@ -270,7 +271,7 @@ CVTS_FUNC(u8, s8, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(u8, s8, 16,
|
CVTS_FUNC(u8, s8, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 16)
|
for (size_t i = 0; i < w; i += 16)
|
||||||
{
|
{
|
||||||
@ -294,10 +295,10 @@ CVTS_FUNC(u8, s8, 16,
|
|||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
||||||
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
|
int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
|
||||||
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
|
int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
|
||||||
int16x8_t vRes1_u16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
|
int16x8_t vRes1_u16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
|
||||||
int16x8_t vRes2_u16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
|
int16x8_t vRes2_u16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
|
||||||
vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_u16), vqmovn_s16(vRes2_u16)));
|
vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_u16), vqmovn_s16(vRes2_u16)));
|
||||||
@ -355,7 +356,7 @@ CVTS_FUNC(u8, u16, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(u8, u16, 16,
|
CVTS_FUNC(u8, u16, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 16)
|
for (size_t i = 0; i < w; i += 16)
|
||||||
{
|
{
|
||||||
@ -379,10 +380,10 @@ CVTS_FUNC(u8, u16, 16,
|
|||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
||||||
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
|
int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
|
||||||
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
|
int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
|
||||||
vst1q_u16(_dst + i + 0, vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)));
|
vst1q_u16(_dst + i + 0, vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)));
|
||||||
vst1q_u16(_dst + i + 8, vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)));
|
vst1q_u16(_dst + i + 8, vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)));
|
||||||
}
|
}
|
||||||
@ -439,7 +440,7 @@ CVTS_FUNC(u8, s16, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(u8, s16, 16,
|
CVTS_FUNC(u8, s16, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 16)
|
for (size_t i = 0; i < w; i += 16)
|
||||||
{
|
{
|
||||||
@ -463,10 +464,10 @@ CVTS_FUNC(u8, s16, 16,
|
|||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
||||||
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
|
int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
|
||||||
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
|
int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
|
||||||
vst1q_s16(_dst + i + 0, vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)));
|
vst1q_s16(_dst + i + 0, vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)));
|
||||||
vst1q_s16(_dst + i + 8, vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)));
|
vst1q_s16(_dst + i + 8, vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)));
|
||||||
}
|
}
|
||||||
@ -526,7 +527,7 @@ CVTS_FUNC(u8, s32, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(u8, s32, 16,
|
CVTS_FUNC(u8, s32, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 16)
|
for (size_t i = 0; i < w; i += 16)
|
||||||
{
|
{
|
||||||
@ -550,10 +551,10 @@ CVTS_FUNC(u8, s32, 16,
|
|||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
||||||
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
|
int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
|
||||||
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
|
int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
|
||||||
vst1q_s32(_dst + i + 0, vline1_s32);
|
vst1q_s32(_dst + i + 0, vline1_s32);
|
||||||
vst1q_s32(_dst + i + 4, vline2_s32);
|
vst1q_s32(_dst + i + 4, vline2_s32);
|
||||||
vst1q_s32(_dst + i + 8, vline3_s32);
|
vst1q_s32(_dst + i + 8, vline3_s32);
|
||||||
@ -693,7 +694,7 @@ CVTS_FUNC(s8, u8, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s8, u8, 16,
|
CVTS_FUNC(s8, u8, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 16)
|
for (size_t i = 0; i < w; i += 16)
|
||||||
{
|
{
|
||||||
@ -717,10 +718,10 @@ CVTS_FUNC(s8, u8, 16,
|
|||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
||||||
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
vline3_s32 = vcvtq_s32_f32(vline3_f32);
|
vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
|
||||||
vline4_s32 = vcvtq_s32_f32(vline4_f32);
|
vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
|
||||||
uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
|
uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
|
||||||
uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
|
uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
|
||||||
vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
|
vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
|
||||||
@ -778,7 +779,7 @@ CVTS_FUNC1(s8, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC1(s8, 16,
|
CVTS_FUNC1(s8, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 16)
|
for (size_t i = 0; i < w; i += 16)
|
||||||
{
|
{
|
||||||
@ -802,10 +803,10 @@ CVTS_FUNC1(s8, 16,
|
|||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
||||||
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
vline3_s32 = vcvtq_s32_f32(vline3_f32);
|
vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
|
||||||
vline4_s32 = vcvtq_s32_f32(vline4_f32);
|
vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
|
||||||
int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
|
int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
|
||||||
int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
|
int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
|
||||||
vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_s16), vqmovn_s16(vRes2_s16)));
|
vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_s16), vqmovn_s16(vRes2_s16)));
|
||||||
@ -863,7 +864,7 @@ CVTS_FUNC(s8, u16, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s8, u16, 16,
|
CVTS_FUNC(s8, u16, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 16)
|
for (size_t i = 0; i < w; i += 16)
|
||||||
{
|
{
|
||||||
@ -887,10 +888,10 @@ CVTS_FUNC(s8, u16, 16,
|
|||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
||||||
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
vline3_s32 = vcvtq_s32_f32(vline3_f32);
|
vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
|
||||||
vline4_s32 = vcvtq_s32_f32(vline4_f32);
|
vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
|
||||||
uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
|
uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
|
||||||
uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
|
uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
|
||||||
vst1q_u16(_dst + i + 0, vRes1_u16);
|
vst1q_u16(_dst + i + 0, vRes1_u16);
|
||||||
@ -949,7 +950,7 @@ CVTS_FUNC(s8, s16, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s8, s16, 16,
|
CVTS_FUNC(s8, s16, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 16)
|
for (size_t i = 0; i < w; i += 16)
|
||||||
{
|
{
|
||||||
@ -973,10 +974,10 @@ CVTS_FUNC(s8, s16, 16,
|
|||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
||||||
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
vline3_s32 = vcvtq_s32_f32(vline3_f32);
|
vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
|
||||||
vline4_s32 = vcvtq_s32_f32(vline4_f32);
|
vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
|
||||||
int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
|
int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
|
||||||
int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
|
int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
|
||||||
vst1q_s16(_dst + i + 0, vRes1_s16);
|
vst1q_s16(_dst + i + 0, vRes1_s16);
|
||||||
@ -1038,7 +1039,7 @@ CVTS_FUNC(s8, s32, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s8, s32, 16,
|
CVTS_FUNC(s8, s32, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 16)
|
for (size_t i = 0; i < w; i += 16)
|
||||||
{
|
{
|
||||||
@ -1062,10 +1063,10 @@ CVTS_FUNC(s8, s32, 16,
|
|||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
||||||
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
vline3_s32 = vcvtq_s32_f32(vline3_f32);
|
vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
|
||||||
vline4_s32 = vcvtq_s32_f32(vline4_f32);
|
vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
|
||||||
vst1q_s32(_dst + i + 0, vline1_s32);
|
vst1q_s32(_dst + i + 0, vline1_s32);
|
||||||
vst1q_s32(_dst + i + 4, vline2_s32);
|
vst1q_s32(_dst + i + 4, vline2_s32);
|
||||||
vst1q_s32(_dst + i + 8, vline3_s32);
|
vst1q_s32(_dst + i + 8, vline3_s32);
|
||||||
@ -1190,7 +1191,7 @@ CVTS_FUNC(u16, u8, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(u16, u8, 16,
|
CVTS_FUNC(u16, u8, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1204,8 +1205,8 @@ CVTS_FUNC(u16, u8, 16,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
||||||
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
||||||
uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
|
uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
|
||||||
@ -1249,7 +1250,7 @@ CVTS_FUNC(u16, s8, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(u16, s8, 16,
|
CVTS_FUNC(u16, s8, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1263,8 +1264,8 @@ CVTS_FUNC(u16, s8, 16,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
||||||
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
||||||
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
|
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
|
||||||
@ -1307,7 +1308,7 @@ CVTS_FUNC1(u16, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC1(u16, 16,
|
CVTS_FUNC1(u16, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1321,8 +1322,8 @@ CVTS_FUNC1(u16, 16,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
|
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
|
||||||
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
|
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
|
||||||
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
|
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
|
||||||
@ -1364,7 +1365,7 @@ CVTS_FUNC(u16, s16, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(u16, s16, 8,
|
CVTS_FUNC(u16, s16, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1378,8 +1379,8 @@ CVTS_FUNC(u16, s16, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
||||||
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
||||||
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
|
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
|
||||||
@ -1421,7 +1422,7 @@ CVTS_FUNC(u16, s32, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(u16, s32, 8,
|
CVTS_FUNC(u16, s32, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1435,8 +1436,8 @@ CVTS_FUNC(u16, s32, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
vst1q_s32(_dst + i + 0, vline1_s32);
|
vst1q_s32(_dst + i + 0, vline1_s32);
|
||||||
vst1q_s32(_dst + i + 4, vline2_s32);
|
vst1q_s32(_dst + i + 4, vline2_s32);
|
||||||
}
|
}
|
||||||
@ -1530,7 +1531,7 @@ CVTS_FUNC(s16, u8, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s16, u8, 16,
|
CVTS_FUNC(s16, u8, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1544,8 +1545,8 @@ CVTS_FUNC(s16, u8, 16,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
||||||
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
||||||
uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
|
uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
|
||||||
@ -1589,7 +1590,7 @@ CVTS_FUNC(s16, s8, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s16, s8, 16,
|
CVTS_FUNC(s16, s8, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1603,8 +1604,8 @@ CVTS_FUNC(s16, s8, 16,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
||||||
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
||||||
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
|
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
|
||||||
@ -1647,7 +1648,7 @@ CVTS_FUNC(s16, u16, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s16, u16, 8,
|
CVTS_FUNC(s16, u16, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1661,8 +1662,8 @@ CVTS_FUNC(s16, u16, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
|
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
|
||||||
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
|
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
|
||||||
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
|
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
|
||||||
@ -1704,7 +1705,7 @@ CVTS_FUNC1(s16, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC1(s16, 16,
|
CVTS_FUNC1(s16, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1718,8 +1719,8 @@ CVTS_FUNC1(s16, 16,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
||||||
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
||||||
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
|
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
|
||||||
@ -1761,7 +1762,7 @@ CVTS_FUNC(s16, s32, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s16, s32, 8,
|
CVTS_FUNC(s16, s32, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1775,8 +1776,8 @@ CVTS_FUNC(s16, s32, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
vst1q_s32(_dst + i + 0, vline1_s32);
|
vst1q_s32(_dst + i + 0, vline1_s32);
|
||||||
vst1q_s32(_dst + i + 4, vline2_s32);
|
vst1q_s32(_dst + i + 4, vline2_s32);
|
||||||
}
|
}
|
||||||
@ -1870,7 +1871,7 @@ CVTS_FUNC(s32, u8, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s32, u8, 8,
|
CVTS_FUNC(s32, u8, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1883,8 +1884,8 @@ CVTS_FUNC(s32, u8, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
|
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
|
||||||
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
|
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
|
||||||
uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2));
|
uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2));
|
||||||
@ -1928,7 +1929,7 @@ CVTS_FUNC(s32, s8, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s32, s8, 8,
|
CVTS_FUNC(s32, s8, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1941,8 +1942,8 @@ CVTS_FUNC(s32, s8, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
||||||
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
||||||
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
|
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
|
||||||
@ -1985,7 +1986,7 @@ CVTS_FUNC(s32, u16, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s32, u16, 8,
|
CVTS_FUNC(s32, u16, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1998,8 +1999,8 @@ CVTS_FUNC(s32, u16, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
|
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
|
||||||
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
|
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
|
||||||
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
|
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
|
||||||
@ -2041,7 +2042,7 @@ CVTS_FUNC(s32, s16, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s32, s16, 8,
|
CVTS_FUNC(s32, s16, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -2054,8 +2055,8 @@ CVTS_FUNC(s32, s16, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
||||||
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
||||||
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
|
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
|
||||||
@ -2097,7 +2098,7 @@ CVTS_FUNC1(s32, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC1(s32, 8,
|
CVTS_FUNC1(s32, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -2110,8 +2111,8 @@ CVTS_FUNC1(s32, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
vst1q_s32(_dst + i + 0, vline1_s32);
|
vst1q_s32(_dst + i + 0, vline1_s32);
|
||||||
vst1q_s32(_dst + i + 4, vline2_s32);
|
vst1q_s32(_dst + i + 4, vline2_s32);
|
||||||
}
|
}
|
||||||
@ -2272,7 +2273,7 @@ CVTS_FUNC(f32, s8, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(f32, s8, 8,
|
CVTS_FUNC(f32, s8, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -2283,8 +2284,8 @@ CVTS_FUNC(f32, s8, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
||||||
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
||||||
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
|
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
|
||||||
@ -2325,7 +2326,7 @@ CVTS_FUNC(f32, u16, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(f32, u16, 8,
|
CVTS_FUNC(f32, u16, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -2336,8 +2337,8 @@ CVTS_FUNC(f32, u16, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1_f32);
|
uint32x4_t vline1_u32 = internal::vroundq_u32_f32(vline1_f32);
|
||||||
uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2_f32);
|
uint32x4_t vline2_u32 = internal::vroundq_u32_f32(vline2_f32);
|
||||||
uint16x4_t vRes1 = vqmovn_u32(vline1_u32);
|
uint16x4_t vRes1 = vqmovn_u32(vline1_u32);
|
||||||
uint16x4_t vRes2 = vqmovn_u32(vline2_u32);
|
uint16x4_t vRes2 = vqmovn_u32(vline2_u32);
|
||||||
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
|
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
|
||||||
@ -2377,7 +2378,7 @@ CVTS_FUNC(f32, s16, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(f32, s16, 8,
|
CVTS_FUNC(f32, s16, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -2388,8 +2389,8 @@ CVTS_FUNC(f32, s16, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
||||||
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
||||||
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
|
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
|
||||||
@ -2429,7 +2430,7 @@ CVTS_FUNC(f32, s32, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(f32, s32, 8,
|
CVTS_FUNC(f32, s32, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -2440,8 +2441,8 @@ CVTS_FUNC(f32, s32, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
vst1q_s32(_dst + i + 0, vline1_s32);
|
vst1q_s32(_dst + i + 0, vline1_s32);
|
||||||
vst1q_s32(_dst + i + 4, vline2_s32);
|
vst1q_s32(_dst + i + 4, vline2_s32);
|
||||||
}
|
}
|
||||||
|
23
3rdparty/carotene/src/div.cpp
vendored
23
3rdparty/carotene/src/div.cpp
vendored
@ -39,6 +39,7 @@
|
|||||||
|
|
||||||
#include "common.hpp"
|
#include "common.hpp"
|
||||||
#include "vtransform.hpp"
|
#include "vtransform.hpp"
|
||||||
|
#include "vround_helper.hpp"
|
||||||
|
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <cfloat>
|
#include <cfloat>
|
||||||
@ -51,13 +52,6 @@ namespace {
|
|||||||
|
|
||||||
#ifdef CAROTENE_NEON
|
#ifdef CAROTENE_NEON
|
||||||
|
|
||||||
inline float32x4_t vroundq(const float32x4_t& v)
|
|
||||||
{
|
|
||||||
const int32x4_t signMask = vdupq_n_s32(1 << 31), half = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
|
|
||||||
float32x4_t v_addition = vreinterpretq_f32_s32(vorrq_s32(half, vandq_s32(signMask, vreinterpretq_s32_f32(v))));
|
|
||||||
return vaddq_f32(v, v_addition);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
inline T divSaturateQ(const T &v1, const T &v2, const float scale)
|
inline T divSaturateQ(const T &v1, const T &v2, const float scale)
|
||||||
{
|
{
|
||||||
@ -69,17 +63,10 @@ inline T divSaturateQ(const T &v1, const T &v2, const float scale)
|
|||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
inline int32x4_t divSaturateQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
|
inline int32x4_t divSaturateQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
|
||||||
{ return vcvtq_s32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2))))); }
|
{ return internal::vroundq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); }
|
||||||
template <>
|
template <>
|
||||||
inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
|
inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
|
||||||
{ return vcvtq_u32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2))))); }
|
{ return internal::vroundq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); }
|
||||||
|
|
||||||
inline float32x2_t vround(const float32x2_t& v)
|
|
||||||
{
|
|
||||||
const int32x2_t signMask = vdup_n_s32(1 << 31), half = vreinterpret_s32_f32(vdup_n_f32(0.5f));
|
|
||||||
float32x2_t v_addition = vreinterpret_f32_s32(vorr_s32(half, vand_s32(signMask, vreinterpret_s32_f32(v))));
|
|
||||||
return vadd_f32(v, v_addition);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
inline T divSaturate(const T &v1, const T &v2, const float scale)
|
inline T divSaturate(const T &v1, const T &v2, const float scale)
|
||||||
@ -88,10 +75,10 @@ inline T divSaturate(const T &v1, const T &v2, const float scale)
|
|||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
inline int32x2_t divSaturate<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
|
inline int32x2_t divSaturate<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
|
||||||
{ return vcvt_s32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2))))); }
|
{ return internal::vround_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); }
|
||||||
template <>
|
template <>
|
||||||
inline uint32x2_t divSaturate<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
|
inline uint32x2_t divSaturate<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
|
||||||
{ return vcvt_u32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2))))); }
|
{ return internal::vround_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); }
|
||||||
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
15
3rdparty/carotene/src/phase.cpp
vendored
15
3rdparty/carotene/src/phase.cpp
vendored
@ -41,6 +41,7 @@
|
|||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
||||||
#include "common.hpp"
|
#include "common.hpp"
|
||||||
|
#include "vround_helper.hpp"
|
||||||
|
|
||||||
namespace CAROTENE_NS {
|
namespace CAROTENE_NS {
|
||||||
|
|
||||||
@ -121,8 +122,6 @@ void phase(const Size2D &size,
|
|||||||
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
|
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
|
||||||
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||||||
|
|
||||||
float32x4_t v_05 = vdupq_n_f32(0.5f);
|
|
||||||
|
|
||||||
for (size_t i = 0; i < size.height; ++i)
|
for (size_t i = 0; i < size.height; ++i)
|
||||||
{
|
{
|
||||||
const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
|
const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
|
||||||
@ -149,8 +148,8 @@ void phase(const Size2D &size,
|
|||||||
float32x4_t v_dst32f1;
|
float32x4_t v_dst32f1;
|
||||||
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
|
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
|
||||||
|
|
||||||
uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
|
uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(internal::vroundq_u32_f32(v_dst32f0)),
|
||||||
vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
|
vmovn_u32(internal::vroundq_u32_f32(v_dst32f1)));
|
||||||
|
|
||||||
// 1
|
// 1
|
||||||
v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01)));
|
v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01)));
|
||||||
@ -161,8 +160,8 @@ void phase(const Size2D &size,
|
|||||||
v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11)));
|
v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11)));
|
||||||
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
|
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
|
||||||
|
|
||||||
uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
|
uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(internal::vroundq_u32_f32(v_dst32f0)),
|
||||||
vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
|
vmovn_u32(internal::vroundq_u32_f32(v_dst32f1)));
|
||||||
|
|
||||||
vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst16s0),
|
vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst16s0),
|
||||||
vmovn_u16(v_dst16s1)));
|
vmovn_u16(v_dst16s1)));
|
||||||
@ -182,8 +181,8 @@ void phase(const Size2D &size,
|
|||||||
float32x4_t v_dst32f1;
|
float32x4_t v_dst32f1;
|
||||||
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
|
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
|
||||||
|
|
||||||
uint16x8_t v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
|
uint16x8_t v_dst = vcombine_u16(vmovn_u32(internal::vroundq_u32_f32(v_dst32f0)),
|
||||||
vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
|
vmovn_u32(internal::vroundq_u32_f32(v_dst32f1)));
|
||||||
|
|
||||||
vst1_u8(dst + j, vmovn_u16(v_dst));
|
vst1_u8(dst + j, vmovn_u16(v_dst));
|
||||||
}
|
}
|
||||||
|
102
3rdparty/carotene/src/vround_helper.hpp
vendored
Normal file
102
3rdparty/carotene/src/vround_helper.hpp
vendored
Normal file
@ -0,0 +1,102 @@
|
|||||||
|
/*
|
||||||
|
* By downloading, copying, installing or using the software you agree to this license.
|
||||||
|
* If you do not agree to this license, do not download, install,
|
||||||
|
* copy or use the software.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* License Agreement
|
||||||
|
* For Open Source Computer Vision Library
|
||||||
|
* (3-clause BSD License)
|
||||||
|
*
|
||||||
|
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
|
||||||
|
* Third party copyrights are property of their respective owners.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without modification,
|
||||||
|
* are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* * Neither the names of the copyright holders nor the names of the contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* This software is provided by the copyright holders and contributors "as is" and
|
||||||
|
* any express or implied warranties, including, but not limited to, the implied
|
||||||
|
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||||
|
* In no event shall copyright holders or contributors be liable for any direct,
|
||||||
|
* indirect, incidental, special, exemplary, or consequential damages
|
||||||
|
* (including, but not limited to, procurement of substitute goods or services;
|
||||||
|
* loss of use, data, or profits; or business interruption) however caused
|
||||||
|
* and on any theory of liability, whether in contract, strict liability,
|
||||||
|
* or tort (including negligence or otherwise) arising in any way out of
|
||||||
|
* the use of this software, even if advised of the possibility of such damage.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef CAROTENE_SRC_VROUND_HELPER_HPP
|
||||||
|
#define CAROTENE_SRC_VROUND_HELPER_HPP
|
||||||
|
|
||||||
|
#include "common.hpp"
|
||||||
|
#include "vtransform.hpp"
|
||||||
|
|
||||||
|
#ifdef CAROTENE_NEON
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This helper header is for rounding from float32xN to uin32xN or int32xN to nearest, ties to even.
|
||||||
|
* See https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even
|
||||||
|
*/
|
||||||
|
|
||||||
|
// See https://github.com/opencv/opencv/pull/24271#issuecomment-1867318007
|
||||||
|
#define CAROTENE_ROUND_DELTA (12582912.0f)
|
||||||
|
|
||||||
|
namespace CAROTENE_NS { namespace internal {
|
||||||
|
|
||||||
|
// Round each lane of a float32x4 to the nearest unsigned integer, ties to even.
inline uint32x4_t vroundq_u32_f32(const float32x4_t val)
{
#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
    return vcvtnq_u32_f32(val);
#else
    // ARMv7 fallback: biasing by 1.5 * 2^23 makes the FPU round the mantissa
    // to an integer in the default FE_TONEAREST (ties-to-even) mode; the
    // subsequent truncating conversion of the now-integral float is exact.
    const float32x4_t bias = vdupq_n_f32(CAROTENE_ROUND_DELTA);
    const float32x4_t rounded = vsubq_f32(vaddq_f32(val, bias), bias);
    return vcvtq_u32_f32(rounded);
#endif
}
|
||||||
|
|
||||||
|
// Round each lane of a float32x2 to the nearest unsigned integer, ties to even.
inline uint32x2_t vround_u32_f32(const float32x2_t val)
{
#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
    return vcvtn_u32_f32(val);
#else
    // ARMv7 fallback: biasing by 1.5 * 2^23 makes the FPU round the mantissa
    // to an integer in the default FE_TONEAREST (ties-to-even) mode; the
    // subsequent truncating conversion of the now-integral float is exact.
    const float32x2_t bias = vdup_n_f32(CAROTENE_ROUND_DELTA);
    const float32x2_t rounded = vsub_f32(vadd_f32(val, bias), bias);
    return vcvt_u32_f32(rounded);
#endif
}
|
||||||
|
|
||||||
|
// Round each lane of a float32x4 to the nearest signed integer, ties to even.
inline int32x4_t vroundq_s32_f32(const float32x4_t val)
{
#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
    return vcvtnq_s32_f32(val);
#else
    // ARMv7 fallback: biasing by 1.5 * 2^23 makes the FPU round the mantissa
    // to an integer in the default FE_TONEAREST (ties-to-even) mode; the
    // subsequent truncating conversion of the now-integral float is exact.
    const float32x4_t bias = vdupq_n_f32(CAROTENE_ROUND_DELTA);
    const float32x4_t rounded = vsubq_f32(vaddq_f32(val, bias), bias);
    return vcvtq_s32_f32(rounded);
#endif
}
|
||||||
|
|
||||||
|
// Round each lane of a float32x2 to the nearest signed integer, ties to even.
inline int32x2_t vround_s32_f32(const float32x2_t val)
{
#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
    return vcvtn_s32_f32(val);
#else
    // ARMv7 fallback: biasing by 1.5 * 2^23 makes the FPU round the mantissa
    // to an integer in the default FE_TONEAREST (ties-to-even) mode; the
    // subsequent truncating conversion of the now-integral float is exact.
    const float32x2_t bias = vdup_n_f32(CAROTENE_ROUND_DELTA);
    const float32x2_t rounded = vsub_f32(vadd_f32(val, bias), bias);
    return vcvt_s32_f32(rounded);
#endif
}
|
||||||
|
|
||||||
|
} }
|
||||||
|
|
||||||
|
#endif // CAROTENE_NEON
|
||||||
|
|
||||||
|
#endif
|
@ -915,7 +915,15 @@ foreach(hal ${OpenCV_HAL})
|
|||||||
if(";${CPU_BASELINE_FINAL};" MATCHES ";NEON;")
  add_subdirectory(3rdparty/carotene/hal)
  ocv_hal_register(CAROTENE_HAL_LIBRARIES CAROTENE_HAL_HEADERS CAROTENE_HAL_INCLUDE_DIRS)
  # Report how the carotene NEON architecture was chosen alongside the HAL version.
  if(NOT DEFINED CAROTENE_NEON_ARCH)
    set(CAROTENE_NEON_MSG "Auto detected")
  elseif(CAROTENE_NEON_ARCH GREATER 7)
    set(CAROTENE_NEON_MSG "Force ARMv8+")
  else()
    set(CAROTENE_NEON_MSG "Force ARMv7")
  endif()
  list(APPEND OpenCV_USED_HAL "carotene (ver ${CAROTENE_HAL_VERSION}, ${CAROTENE_NEON_MSG})")
else()
  message(STATUS "Carotene: NEON is not available, disabling carotene...")
endif()
|
||||||
|
@ -585,6 +585,7 @@ Following options can be used to change installation layout for common scenarios
|
|||||||
| `BUILD_FAT_JAVA_LIB` | _ON_ (for static Android builds) | Build single _opencv_java_ dynamic library containing all library functionality bundled with Java bindings. |
|
| `BUILD_FAT_JAVA_LIB` | _ON_ (for static Android builds) | Build single _opencv_java_ dynamic library containing all library functionality bundled with Java bindings. |
|
||||||
| `BUILD_opencv_python2` | _ON_ | Build python2 bindings (deprecated). Python with development files and numpy must be installed. |
|
| `BUILD_opencv_python2` | _ON_ | Build python2 bindings (deprecated). Python with development files and numpy must be installed. |
|
||||||
| `BUILD_opencv_python3` | _ON_ | Build python3 bindings. Python with development files and numpy must be installed. |
|
| `BUILD_opencv_python3` | _ON_ | Build python3 bindings. Python with development files and numpy must be installed. |
|
||||||
|
| `CAROTENE_NEON_ARCH` | '(auto)' | Select the NEON architecture for Carotene. If unset, it is auto-detected. If set to 8, ARMv8 (and later) is used; otherwise ARMv7 is used. |
|
||||||
|
|
||||||
TODO: need separate tutorials covering bindings builds
|
TODO: need separate tutorials covering bindings builds
|
||||||
|
|
||||||
|
@ -1990,11 +1990,9 @@ inline v_int32x4 v_round(const v_float32x4& a)
|
|||||||
#else
|
#else
|
||||||
// Round to nearest, ties to even, on targets without ARMv8 vcvtnq_s32_f32.
inline v_int32x4 v_round(const v_float32x4& a)
{
    // Adding and subtracting 1.5 * 2^23 forces the FPU to round the value to
    // an integer in the default FE_TONEAREST (ties-to-even) mode.
    // See https://github.com/opencv/opencv/pull/24271#issuecomment-1867318007
    const float32x4_t delta = vdupq_n_f32(12582912.0f);
    const float32x4_t rounded = vsubq_f32(vaddq_f32(a.val, delta), delta);
    return v_int32x4(vcvtq_s32_f32(rounded));
}
|
||||||
#endif
|
#endif
|
||||||
inline v_int32x4 v_floor(const v_float32x4& a)
|
inline v_int32x4 v_floor(const v_float32x4& a)
|
||||||
|
@ -1570,4 +1570,54 @@ TEST(Core_Arithm, scalar_handling_19599) // https://github.com/opencv/opencv/is
|
|||||||
EXPECT_EQ(1, c.rows);
|
EXPECT_EQ(1, c.rows);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// https://github.com/opencv/opencv/issues/24163
|
||||||
|
typedef tuple<perf::MatDepth,int,int,int> Arith_Regression24163Param;
|
||||||
|
typedef testing::TestWithParam<Arith_Regression24163Param> Core_Arith_Regression24163;
|
||||||
|
|
||||||
|
#if defined __riscv
|
||||||
|
TEST_P(Core_Arith_Regression24163, DISABLED_test_for_ties_to_even)
|
||||||
|
#else
|
||||||
|
TEST_P(Core_Arith_Regression24163, test_for_ties_to_even)
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
const int matDepth = get<0>(GetParam());
|
||||||
|
const int matHeight= get<1>(GetParam());
|
||||||
|
const int matWidth = 3; // Fixed
|
||||||
|
const int alpha = get<2>(GetParam());
|
||||||
|
const int beta = get<3>(GetParam());
|
||||||
|
|
||||||
|
// If alpha and/or beta are negative, and matDepth is unsigned, test is passed.
|
||||||
|
if( ( (alpha < 0) || (beta < 0) )
|
||||||
|
&&
|
||||||
|
( (matDepth != CV_8S) && (matDepth != CV_16S) && (matDepth != CV_32S) ) )
|
||||||
|
{
|
||||||
|
throw SkipTestException( cv::format("Test is skipped(matDepth is not signed, alpha = %d, beta = %d)", alpha, beta) );
|
||||||
|
}
|
||||||
|
|
||||||
|
const int matType = CV_MAKE_TYPE(matDepth, 1);
|
||||||
|
const Size matSize(matWidth, matHeight);
|
||||||
|
const Mat src1(matSize, matType, Scalar(alpha,alpha,alpha,alpha));
|
||||||
|
const Mat src2(matSize, matType, Scalar(beta, beta, beta, beta));
|
||||||
|
const Mat result = ( src1 + src2 ) / 2;
|
||||||
|
|
||||||
|
// Expected that default is FE_TONEAREST(Ties to Even).
|
||||||
|
const int mean = lrint( static_cast<double>(alpha + beta) / 2.0 );
|
||||||
|
const Mat expected(matSize, matType, Scalar(mean,mean,mean,mean));
|
||||||
|
|
||||||
|
// Compare result and extected.
|
||||||
|
ASSERT_EQ(expected.size(), result.size());
|
||||||
|
EXPECT_EQ(0, cvtest::norm(expected, result, NORM_INF)) <<
|
||||||
|
"result=" << std::endl << result << std::endl <<
|
||||||
|
"expected=" << std::endl << expected;
|
||||||
|
}
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_CASE_P(/* */, Core_Arith_Regression24163,
|
||||||
|
testing::Combine(
|
||||||
|
testing::Values(perf::MatDepth(CV_8U), CV_8S, CV_16U, CV_16S, CV_32S), // MatType
|
||||||
|
testing::Values( 3, 4, 5, 6), // MatHeight
|
||||||
|
testing::Values(-2,-1, 0, 1, 2), // src1
|
||||||
|
testing::Values( -1, 0, 1 ) // src2
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
||||||
}} // namespace
|
}} // namespace
|
||||||
|
Loading…
Reference in New Issue
Block a user