mirror of
https://github.com/opencv/opencv.git
synced 2025-08-06 14:36:36 +08:00
Merge pull request #26321 from vpisarev:better_bfloat
2x more accurate float => bfloat conversion #26321 There is a magic trick to make float => bfloat conversion more accurate (_original reference needed, is it done this way in PyTorch?_). In simplified form it looks like: ``` uint16_t f2bf(float x) { union { unsigned u; float f; } u; u.f = x; // return (uint16_t)(u.u >> 16); <== the old method before this patch return (uint16_t)((u.u + 0x8000) >> 16); } ``` it works correctly for almost all valid floating-point values, positive, zero or negative, and even for some extreme cases, like `+/-inf`, `nan` etc. The addition of `0x8000` to integer representation of 32-bit float before retrieving the highest 16 bits reduces the rounding error by ~2x. The slight problem with this improved method is that the numbers very close to or equal to `+/-FLT_MAX` are mistakenly converted to `+/-inf`, respectively. This patch implements improved algorithm for `float => bfloat` conversion in scalar and vector form; it fixes the above-mentioned problem using some extra bit magic, i.e. 0x8000 is not added to very big (by absolute value) numbers: ``` // the actual implementation is more efficient, // without conditions or floating-point operations, see the source code return (uint16_t)(u.u + (fabsf(x) <= big_threshold ? 0x8000 : 0)) >> 16); ``` The corresponding test has been added as well and this is output from the test: ``` [----------] 1 test from Core_BFloat [ RUN ] Core_BFloat.convert maxerr0 = 0.00774842, mean0 = 0.00190643, stddev0 = 0.00186063 maxerr1 = 0.00389057, mean1 = 0.000952614, stddev1 = 0.000931268 [ OK ] Core_BFloat.convert (7 ms) ``` Here `maxerr0, mean0, stddev0` are for the original method and `maxerr1, mean1, stddev1` are for the new method. As you can see, there is a significant improvement in accuracy. **Note:** _Actually, on ~32,000,000 random FP32 numbers with uniformly distributed sign, exponent and mantissa the new method is always at least as accurate as the old one._ The test also checks all the corner cases, where we see no degradation either vs the original method. - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
parent
1696819abb
commit
2f35847960
@ -947,7 +947,7 @@ public:
|
||||
{
|
||||
Cv32suf in;
|
||||
in.f = x;
|
||||
w = (ushort)(in.u >> 16);
|
||||
w = (ushort)((in.u + (((in.u & 0x7fffffff) <= 0x7f7f7fff) << 15)) >> 16);
|
||||
}
|
||||
|
||||
operator float() const
|
||||
|
@ -766,10 +766,11 @@ namespace CV__SIMD_NAMESPACE {
|
||||
|
||||
inline void v_pack_store(const bfloat* ptr, const v_float32& v)
|
||||
{
|
||||
v_int32 iv = v_shr<16>(v_reinterpret_as_s32(v));
|
||||
cv::v_pack_store((short*)ptr, iv);
|
||||
v_int32 av = v_reinterpret_as_s32(v_abs(v));
|
||||
v_int32 iv = v_reinterpret_as_s32(v);
|
||||
iv = v_add(iv, v_and(v_le(av, vx_setall_s32(0x7f7f7fff)), vx_setall_s32(0x8000)));
|
||||
cv::v_pack_store((short*)ptr, v_shr<16>(iv));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/** @brief SIMD processing state cleanup call */
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include <math.h>
|
||||
#include "opencv2/core/softfloat.hpp"
|
||||
#include "opencv2/core/core_c.h"
|
||||
#include "opencv2/core/hal/intrin.hpp"
|
||||
|
||||
namespace opencv_test { namespace {
|
||||
|
||||
@ -3354,5 +3355,97 @@ TEST(Core_FastMath, InlineIsInf)
|
||||
EXPECT_EQ( cvIsInf(suf.f), 0);
|
||||
}
|
||||
|
||||
TEST(Core_BFloat, CornerCases)
|
||||
{
|
||||
float data[] = {0.f, -0.f, 1.f, -1.f, expf(1.f), FLT_MAX, -FLT_MAX,
|
||||
std::numeric_limits<float>::infinity(),
|
||||
-std::numeric_limits<float>::infinity(),
|
||||
std::numeric_limits<float>::quiet_NaN()
|
||||
};
|
||||
size_t n0 = sizeof(data)/sizeof(data[0]);
|
||||
for (size_t i = 0; i < n0; i++) {
|
||||
float x = data[i];
|
||||
Cv32suf suf0, suf1;
|
||||
suf0.f = x;
|
||||
uint16_t x0 = (uint16_t)(suf0.u >> 16);
|
||||
bfloat x1 = bfloat(x);
|
||||
suf0.u = x0 << 16;
|
||||
suf1.f = (float)x1;
|
||||
//printf("%zu. orig = %f, restored (old) = %f, restored (new) = %f\n", i, x, suf0.f, suf1.f);
|
||||
if (suf0.u != suf1.u) {
|
||||
EXPECT_LE(fabs(suf1.f - x), fabs(suf0.f - x));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(Core_BFloat, convert)
|
||||
{
|
||||
size_t N = 1 << 20;
|
||||
std::vector<float> bigdata(N);
|
||||
RNG& rng = theRNG();
|
||||
int m_max = 1 << 24;
|
||||
double m_scale = 1./m_max;
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
double m = (m_max + rng.uniform(0, m_max))*m_scale;
|
||||
double e = pow(2., rng.uniform(-127, 127));
|
||||
double s = rng.uniform(0, 2)*2 - 1;
|
||||
float x = (float)(s*m*e);
|
||||
bigdata[i] = x;
|
||||
}
|
||||
|
||||
double sum0 = 0, sqsum0 = 0, maxerr0 = 0, maxerr1 = 0, sum1 = 0, sqsum1 = 0;
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
float x = bigdata[i];
|
||||
Cv32suf suf0, suf1;
|
||||
suf0.f = suf1.f = x;
|
||||
uint16_t x0 = (uint16_t)(suf0.u >> 16);
|
||||
bfloat x1 = bfloat(suf1.f);
|
||||
suf0.u = x0 << 16;
|
||||
suf1.f = (float)x1;
|
||||
double err0 = fabs(x - suf0.f)/(fabs(x) + DBL_EPSILON);
|
||||
double err1 = fabs(x - suf1.f)/(fabs(x) + DBL_EPSILON);
|
||||
maxerr0 = std::max(maxerr0, err0);
|
||||
maxerr1 = std::max(maxerr1, err1);
|
||||
sum0 += err0;
|
||||
sqsum0 += err0*err0;
|
||||
sum1 += err1;
|
||||
sqsum1 += err1*err1;
|
||||
}
|
||||
double mean0 = sum0/N;
|
||||
double stddev0 = sqrt(std::max(sqsum0 - N*mean0*mean0, 0.)/(N-1));
|
||||
double mean1 = sum1/N;
|
||||
double stddev1 = sqrt(std::max(sqsum1 - N*mean1*mean1, 0.)/(N-1));
|
||||
|
||||
//if (maxerr1 > maxerr0 || mean1 > mean0 || stddev1 > stddev0)
|
||||
{
|
||||
printf("maxerr0 = %g, mean0 = %g, stddev0 = %g\nmaxerr1 = %g, mean1 = %g, stddev1 = %g\n",
|
||||
maxerr0, mean0, stddev0, maxerr1, mean1, stddev1);
|
||||
}
|
||||
|
||||
EXPECT_LE(maxerr1, maxerr0);
|
||||
EXPECT_LE(mean1, mean0);
|
||||
EXPECT_LE(stddev1, stddev0);
|
||||
|
||||
#if CV_SIMD || CV_SIMD_SCALABLE
|
||||
//printf("checking vector part ...\n");
|
||||
std::vector<bfloat> bfdata(N);
|
||||
int vlanes = VTraits<v_float32>::vlanes();
|
||||
for (size_t i = 0; i < N; i += vlanes)
|
||||
{
|
||||
v_float32 x = v_load(&bigdata[i]);
|
||||
v_pack_store(&bfdata[i], x);
|
||||
}
|
||||
|
||||
int vecerr = 0;
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
Cv32suf suf0, suf1;
|
||||
suf0.f = (float)bfloat(bigdata[i]);
|
||||
suf1.f = (float)bfdata[i];
|
||||
vecerr += suf0.u != suf1.u;
|
||||
}
|
||||
EXPECT_EQ(0, vecerr);
|
||||
#endif
|
||||
}
|
||||
|
||||
}} // namespace
|
||||
/* End of file. */
|
||||
|
Loading…
Reference in New Issue
Block a user