mirror of
https://github.com/opencv/opencv.git
synced 2025-06-07 17:44:04 +08:00
core: follow IEEE 754 rules for floating-point division
This commit is contained in:
parent
09cb329d73
commit
fd832bb57d
58
3rdparty/carotene/src/div.cpp
vendored
58
3rdparty/carotene/src/div.cpp
vendored
@ -151,6 +151,10 @@ void div(const Size2D &size,
|
||||
typedef typename internal::VecTraits<T>::vec128 vec128;
|
||||
typedef typename internal::VecTraits<T>::vec64 vec64;
|
||||
|
||||
#if defined(__GNUC__) && (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L)
|
||||
static_assert(std::numeric_limits<T>::is_integer, "template implementation is for integer types only");
|
||||
#endif
|
||||
|
||||
if (scale == 0.0f ||
|
||||
(std::numeric_limits<T>::is_integer &&
|
||||
(scale * std::numeric_limits<T>::max()) < 1.0f &&
|
||||
@ -311,6 +315,10 @@ void recip(const Size2D &size,
|
||||
typedef typename internal::VecTraits<T>::vec128 vec128;
|
||||
typedef typename internal::VecTraits<T>::vec64 vec64;
|
||||
|
||||
#if defined(__GNUC__) && (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L)
|
||||
static_assert(std::numeric_limits<T>::is_integer, "template implementation is for integer types only");
|
||||
#endif
|
||||
|
||||
if (scale == 0.0f ||
|
||||
(std::numeric_limits<T>::is_integer &&
|
||||
scale < 1.0f &&
|
||||
@ -463,8 +471,6 @@ void div(const Size2D &size,
|
||||
return;
|
||||
}
|
||||
|
||||
float32x4_t v_zero = vdupq_n_f32(0.0f);
|
||||
|
||||
size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
|
||||
size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;
|
||||
|
||||
@ -485,9 +491,7 @@ void div(const Size2D &size,
|
||||
float32x4_t v_src0 = vld1q_f32(src0 + j);
|
||||
float32x4_t v_src1 = vld1q_f32(src1 + j);
|
||||
|
||||
uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
|
||||
vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
|
||||
vreinterpretq_u32_f32(vmulq_f32(v_src0, internal::vrecpq_f32(v_src1))), v_mask)));
|
||||
vst1q_f32(dst + j, vmulq_f32(v_src0, internal::vrecpq_f32(v_src1)));
|
||||
}
|
||||
|
||||
for (; j < roiw64; j += 2)
|
||||
@ -495,14 +499,12 @@ void div(const Size2D &size,
|
||||
float32x2_t v_src0 = vld1_f32(src0 + j);
|
||||
float32x2_t v_src1 = vld1_f32(src1 + j);
|
||||
|
||||
uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
|
||||
vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
|
||||
vreinterpret_u32_f32(vmul_f32(v_src0, internal::vrecp_f32(v_src1))), v_mask)));
|
||||
vst1_f32(dst + j, vmul_f32(v_src0, internal::vrecp_f32(v_src1)));
|
||||
}
|
||||
|
||||
for (; j < size.width; j++)
|
||||
{
|
||||
dst[j] = src1[j] ? src0[j] / src1[j] : 0.0f;
|
||||
dst[j] = src0[j] / src1[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -523,10 +525,8 @@ void div(const Size2D &size,
|
||||
float32x4_t v_src0 = vld1q_f32(src0 + j);
|
||||
float32x4_t v_src1 = vld1q_f32(src1 + j);
|
||||
|
||||
uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
|
||||
vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
|
||||
vreinterpretq_u32_f32(vmulq_f32(vmulq_n_f32(v_src0, scale),
|
||||
internal::vrecpq_f32(v_src1))), v_mask)));
|
||||
vst1q_f32(dst + j, vmulq_f32(vmulq_n_f32(v_src0, scale),
|
||||
internal::vrecpq_f32(v_src1)));
|
||||
}
|
||||
|
||||
for (; j < roiw64; j += 2)
|
||||
@ -534,15 +534,13 @@ void div(const Size2D &size,
|
||||
float32x2_t v_src0 = vld1_f32(src0 + j);
|
||||
float32x2_t v_src1 = vld1_f32(src1 + j);
|
||||
|
||||
uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
|
||||
vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
|
||||
vreinterpret_u32_f32(vmul_f32(vmul_n_f32(v_src0, scale),
|
||||
internal::vrecp_f32(v_src1))), v_mask)));
|
||||
vst1_f32(dst + j, vmul_f32(vmul_n_f32(v_src0, scale),
|
||||
internal::vrecp_f32(v_src1)));
|
||||
}
|
||||
|
||||
for (; j < size.width; j++)
|
||||
{
|
||||
dst[j] = src1[j] ? src0[j] * scale / src1[j] : 0.0f;
|
||||
dst[j] = src0[j] * scale / src1[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -620,8 +618,6 @@ void reciprocal(const Size2D &size,
|
||||
return;
|
||||
}
|
||||
|
||||
float32x4_t v_zero = vdupq_n_f32(0.0f);
|
||||
|
||||
size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
|
||||
size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;
|
||||
|
||||
@ -639,23 +635,19 @@ void reciprocal(const Size2D &size,
|
||||
|
||||
float32x4_t v_src1 = vld1q_f32(src1 + j);
|
||||
|
||||
uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
|
||||
vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
|
||||
vreinterpretq_u32_f32(internal::vrecpq_f32(v_src1)), v_mask)));
|
||||
vst1q_f32(dst + j, internal::vrecpq_f32(v_src1));
|
||||
}
|
||||
|
||||
for (; j < roiw64; j += 2)
|
||||
{
|
||||
float32x2_t v_src1 = vld1_f32(src1 + j);
|
||||
|
||||
uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
|
||||
vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
|
||||
vreinterpret_u32_f32(internal::vrecp_f32(v_src1)), v_mask)));
|
||||
vst1_f32(dst + j, internal::vrecp_f32(v_src1));
|
||||
}
|
||||
|
||||
for (; j < size.width; j++)
|
||||
{
|
||||
dst[j] = src1[j] ? 1.0f / src1[j] : 0;
|
||||
dst[j] = 1.0f / src1[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -673,25 +665,19 @@ void reciprocal(const Size2D &size,
|
||||
|
||||
float32x4_t v_src1 = vld1q_f32(src1 + j);
|
||||
|
||||
uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
|
||||
vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
|
||||
vreinterpretq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(v_src1),
|
||||
scale)),v_mask)));
|
||||
vst1q_f32(dst + j, vmulq_n_f32(internal::vrecpq_f32(v_src1), scale));
|
||||
}
|
||||
|
||||
for (; j < roiw64; j += 2)
|
||||
{
|
||||
float32x2_t v_src1 = vld1_f32(src1 + j);
|
||||
|
||||
uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
|
||||
vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
|
||||
vreinterpret_u32_f32(vmul_n_f32(internal::vrecp_f32(v_src1),
|
||||
scale)), v_mask)));
|
||||
vst1_f32(dst + j, vmul_n_f32(internal::vrecp_f32(v_src1), scale));
|
||||
}
|
||||
|
||||
for (; j < size.width; j++)
|
||||
{
|
||||
dst[j] = src1[j] ? scale / src1[j] : 0;
|
||||
dst[j] = scale / src1[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -415,8 +415,13 @@ The function cv::divide divides one array by another:
|
||||
or a scalar by an array when there is no src1 :
|
||||
\f[\texttt{dst(I) = saturate(scale/src2(I))}\f]
|
||||
|
||||
When src2(I) is zero, dst(I) will also be zero. Different channels of
|
||||
multi-channel arrays are processed independently.
|
||||
Different channels of multi-channel arrays are processed independently.
|
||||
|
||||
For integer types when src2(I) is zero, dst(I) will also be zero.
|
||||
|
||||
@note In case of floating point data there is no special defined behavior for zero src2(I) values.
|
||||
Regular floating-point division is used.
|
||||
Expect correct IEEE-754 behaviour for floating-point data (with NaN, Inf result values).
|
||||
|
||||
@note Saturation is not applied when the output array has the depth CV_32S. You may even get
|
||||
result of an incorrect sign in the case of overflow.
|
||||
|
@ -516,7 +516,10 @@ div_i( const T* src1, size_t step1, const T* src2, size_t step2,
|
||||
for( ; i < width; i++ )
|
||||
{
|
||||
T num = src1[i], denom = src2[i];
|
||||
dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
|
||||
T v = 0;
|
||||
if (denom != 0)
|
||||
v = saturate_cast<T>(num*scale_f/denom);
|
||||
dst[i] = v;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -538,7 +541,7 @@ div_f( const T* src1, size_t step1, const T* src2, size_t step2,
|
||||
for( ; i < width; i++ )
|
||||
{
|
||||
T num = src1[i], denom = src2[i];
|
||||
dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
|
||||
dst[i] = saturate_cast<T>(num*scale_f/denom);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -559,7 +562,10 @@ recip_i( const T* src2, size_t step2,
|
||||
for( ; i < width; i++ )
|
||||
{
|
||||
T denom = src2[i];
|
||||
dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
|
||||
T v = 0;
|
||||
if (denom != 0)
|
||||
v = saturate_cast<T>(scale_f/denom);
|
||||
dst[i] = v;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -580,7 +586,7 @@ recip_f( const T* src2, size_t step2,
|
||||
for( ; i < width; i++ )
|
||||
{
|
||||
T denom = src2[i];
|
||||
dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
|
||||
dst[i] = saturate_cast<T>(scale_f/denom);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1433,7 +1433,6 @@ struct Div_SIMD<float>
|
||||
return x;
|
||||
|
||||
v_float32x4 v_scale = v_setall_f32((float)scale);
|
||||
v_float32x4 v_zero = v_setzero_f32();
|
||||
|
||||
for ( ; x <= width - 8; x += 8)
|
||||
{
|
||||
@ -1445,9 +1444,6 @@ struct Div_SIMD<float>
|
||||
v_float32x4 res0 = f0 * v_scale / f2;
|
||||
v_float32x4 res1 = f1 * v_scale / f3;
|
||||
|
||||
res0 = v_select(f2 == v_zero, v_zero, res0);
|
||||
res1 = v_select(f3 == v_zero, v_zero, res1);
|
||||
|
||||
v_store(dst + x, res0);
|
||||
v_store(dst + x + 4, res1);
|
||||
}
|
||||
@ -1675,7 +1671,6 @@ struct Recip_SIMD<float>
|
||||
return x;
|
||||
|
||||
v_float32x4 v_scale = v_setall_f32((float)scale);
|
||||
v_float32x4 v_zero = v_setzero_f32();
|
||||
|
||||
for ( ; x <= width - 8; x += 8)
|
||||
{
|
||||
@ -1685,9 +1680,6 @@ struct Recip_SIMD<float>
|
||||
v_float32x4 res0 = v_scale / f0;
|
||||
v_float32x4 res1 = v_scale / f1;
|
||||
|
||||
res0 = v_select(f0 == v_zero, v_zero, res0);
|
||||
res1 = v_select(f1 == v_zero, v_zero, res1);
|
||||
|
||||
v_store(dst + x, res0);
|
||||
v_store(dst + x + 4, res1);
|
||||
}
|
||||
@ -1712,7 +1704,6 @@ struct Div_SIMD<double>
|
||||
return x;
|
||||
|
||||
v_float64x2 v_scale = v_setall_f64(scale);
|
||||
v_float64x2 v_zero = v_setzero_f64();
|
||||
|
||||
for ( ; x <= width - 4; x += 4)
|
||||
{
|
||||
@ -1724,9 +1715,6 @@ struct Div_SIMD<double>
|
||||
v_float64x2 res0 = f0 * v_scale / f2;
|
||||
v_float64x2 res1 = f1 * v_scale / f3;
|
||||
|
||||
res0 = v_select(f2 == v_zero, v_zero, res0);
|
||||
res1 = v_select(f3 == v_zero, v_zero, res1);
|
||||
|
||||
v_store(dst + x, res0);
|
||||
v_store(dst + x + 2, res1);
|
||||
}
|
||||
@ -1749,7 +1737,6 @@ struct Recip_SIMD<double>
|
||||
return x;
|
||||
|
||||
v_float64x2 v_scale = v_setall_f64(scale);
|
||||
v_float64x2 v_zero = v_setzero_f64();
|
||||
|
||||
for ( ; x <= width - 4; x += 4)
|
||||
{
|
||||
@ -1759,9 +1746,6 @@ struct Recip_SIMD<double>
|
||||
v_float64x2 res0 = v_scale / f0;
|
||||
v_float64x2 res1 = v_scale / f1;
|
||||
|
||||
res0 = v_select(f0 == v_zero, v_zero, res0);
|
||||
res1 = v_select(f1 == v_zero, v_zero, res1);
|
||||
|
||||
v_store(dst + x, res0);
|
||||
v_store(dst + x + 2, res1);
|
||||
}
|
||||
|
@ -227,9 +227,15 @@
|
||||
#define PROCESS_ELEM storedst(convertToDT(srcelem1 * scale * srcelem2))
|
||||
|
||||
#elif defined OP_DIV
|
||||
#ifdef CV_DST_TYPE_IS_INTEGER
|
||||
#define PROCESS_ELEM \
|
||||
workT e2 = srcelem2, zero = (workT)(0); \
|
||||
storedst(convertToDT(e2 != zero ? srcelem1 / e2 : zero))
|
||||
#else
|
||||
#define PROCESS_ELEM \
|
||||
workT e2 = srcelem2; \
|
||||
storedst(convertToDT(srcelem1 / e2))
|
||||
#endif
|
||||
|
||||
#elif defined OP_DIV_SCALE
|
||||
#undef EXTRA_PARAMS
|
||||
@ -240,9 +246,15 @@
|
||||
#else
|
||||
#define EXTRA_PARAMS , scaleT scale
|
||||
#endif
|
||||
#ifdef CV_DST_TYPE_IS_INTEGER
|
||||
#define PROCESS_ELEM \
|
||||
workT e2 = srcelem2, zero = (workT)(0); \
|
||||
storedst(convertToDT(e2 == zero ? zero : (srcelem1 * (workT)(scale) / e2)))
|
||||
#else
|
||||
#define PROCESS_ELEM \
|
||||
workT e2 = srcelem2; \
|
||||
storedst(convertToDT(srcelem1 * (workT)(scale) / e2))
|
||||
#endif
|
||||
|
||||
#elif defined OP_RDIV_SCALE
|
||||
#undef EXTRA_PARAMS
|
||||
@ -253,16 +265,28 @@
|
||||
#else
|
||||
#define EXTRA_PARAMS , scaleT scale
|
||||
#endif
|
||||
#ifdef CV_DST_TYPE_IS_INTEGER
|
||||
#define PROCESS_ELEM \
|
||||
workT e1 = srcelem1, zero = (workT)(0); \
|
||||
storedst(convertToDT(e1 == zero ? zero : (srcelem2 * (workT)(scale) / e1)))
|
||||
#else
|
||||
#define PROCESS_ELEM \
|
||||
workT e1 = srcelem1; \
|
||||
storedst(convertToDT(srcelem2 * (workT)(scale) / e1))
|
||||
#endif
|
||||
|
||||
#elif defined OP_RECIP_SCALE
|
||||
#undef EXTRA_PARAMS
|
||||
#define EXTRA_PARAMS , scaleT scale
|
||||
#ifdef CV_DST_TYPE_IS_INTEGER
|
||||
#define PROCESS_ELEM \
|
||||
workT e1 = srcelem1, zero = (workT)(0); \
|
||||
storedst(convertToDT(e1 != zero ? scale / e1 : zero))
|
||||
#else
|
||||
#define PROCESS_ELEM \
|
||||
workT e1 = srcelem1; \
|
||||
storedst(convertToDT(scale / e1))
|
||||
#endif
|
||||
|
||||
#elif defined OP_ADDW
|
||||
#undef EXTRA_PARAMS
|
||||
|
Loading…
Reference in New Issue
Block a user