Merge pull request #26321 from vpisarev:better_bfloat

2x more accurate float => bfloat conversion #26321

There is a magic trick to make float => bfloat conversion more accurate (_original reference needed, is it done this way in PyTorch?_). In simplified form it looks like:

```
uint16_t f2bf(float x) {
    union {
        unsigned u;
        float f;
    } u;
    u.f = x;
    // return (uint16_t)(u.u >> 16); <== the old method before this patch
    return (uint16_t)((u.u + 0x8000) >> 16);
} 
```

it works correctly for almost all valid floating-point values, positive, zero or negative, and even for some extreme cases, like `+/-inf`, `nan` etc. The addition of `0x8000` to integer representation of 32-bit float before retrieving the highest 16 bits reduces the rounding error by ~2x.

The slight problem with this improved method is that the numbers very close to or equal to `+/-FLT_MAX` are mistakenly converted to `+/-inf`, respectively.

This patch implements improved algorithm for `float => bfloat` conversion in scalar and vector form; it fixes the above-mentioned problem using some extra bit magic, i.e. 0x8000 is not added to very big (by absolute value) numbers:

```
// the actual implementation is more efficient,
// without conditions or floating-point operations, see the source code
return (uint16_t)((u.u + (fabsf(x) <= big_threshold ? 0x8000 : 0)) >> 16);
```

The corresponding test has been added as well and this is output from the test:

```
[----------] 1 test from Core_BFloat
[ RUN      ] Core_BFloat.convert
maxerr0 = 0.00774842, mean0 = 0.00190643, stddev0 = 0.00186063
maxerr1 = 0.00389057, mean1 = 0.000952614, stddev1 = 0.000931268
[       OK ] Core_BFloat.convert (7 ms)
```

Here `maxerr0, mean0, stddev0` are for the original method and `maxerr1, mean1, stddev1` are for the new method. As you can see, there is a significant improvement in accuracy.

**Note:**

_Actually, on ~32,000,000 random FP32 numbers with uniformly distributed sign, exponent and mantissa the new method is always at least as accurate as the old one._

The test also checks all the corner cases; on those we see no degradation versus the original method either.

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
Vadim Pisarevsky 2024-10-18 14:46:40 +03:00 committed by GitHub
parent 1696819abb
commit 2f35847960
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 98 additions and 4 deletions

View File

@ -947,7 +947,7 @@ public:
{
Cv32suf in;
in.f = x;
w = (ushort)(in.u >> 16);
w = (ushort)((in.u + (((in.u & 0x7fffffff) <= 0x7f7f7fff) << 15)) >> 16);
}
operator float() const

View File

@ -766,10 +766,11 @@ namespace CV__SIMD_NAMESPACE {
inline void v_pack_store(const bfloat* ptr, const v_float32& v)
{
v_int32 iv = v_shr<16>(v_reinterpret_as_s32(v));
cv::v_pack_store((short*)ptr, iv);
v_int32 av = v_reinterpret_as_s32(v_abs(v));
v_int32 iv = v_reinterpret_as_s32(v);
iv = v_add(iv, v_and(v_le(av, vx_setall_s32(0x7f7f7fff)), vx_setall_s32(0x8000)));
cv::v_pack_store((short*)ptr, v_shr<16>(iv));
}
#endif
/** @brief SIMD processing state cleanup call */

View File

@ -11,6 +11,7 @@
#include <math.h>
#include "opencv2/core/softfloat.hpp"
#include "opencv2/core/core_c.h"
#include "opencv2/core/hal/intrin.hpp"
namespace opencv_test { namespace {
@ -3354,5 +3355,97 @@ TEST(Core_FastMath, InlineIsInf)
EXPECT_EQ( cvIsInf(suf.f), 0);
}
// Verifies that on special values (signed zeros, +/-1, e, +/-FLT_MAX, +/-inf, NaN)
// the rounded float->bfloat conversion (the bfloat ctor) is never less accurate
// than the old truncation-only method.
TEST(Core_BFloat, CornerCases)
{
float data[] = {0.f, -0.f, 1.f, -1.f, expf(1.f), FLT_MAX, -FLT_MAX,
std::numeric_limits<float>::infinity(),
-std::numeric_limits<float>::infinity(),
std::numeric_limits<float>::quiet_NaN()
};
size_t n0 = sizeof(data)/sizeof(data[0]);
for (size_t i = 0; i < n0; i++) {
float x = data[i];
Cv32suf suf0, suf1;
suf0.f = x;
// old method: keep the top 16 bits of the IEEE-754 float bit pattern
uint16_t x0 = (uint16_t)(suf0.u >> 16);
// new method: bfloat ctor (adds the 0x8000 rounding bias for non-huge values)
bfloat x1 = bfloat(x);
// expand both 16-bit results back to 32-bit floats for comparison
suf0.u = x0 << 16;
suf1.f = (float)x1;
//printf("%zu. orig = %f, restored (old) = %f, restored (new) = %f\n", i, x, suf0.f, suf1.f);
// where the two methods produce different bit patterns, the new one must be
// at least as close to the original value as the old one
if (suf0.u != suf1.u) {
EXPECT_LE(fabs(suf1.f - x), fabs(suf0.f - x));
}
}
}
// Statistical accuracy comparison of the old (truncation) and new (rounded)
// float->bfloat conversion on ~1M random floats with uniformly distributed
// sign, exponent and mantissa; also checks that the SIMD v_pack_store path
// matches the scalar bfloat ctor bit-for-bit.
TEST(Core_BFloat, convert)
{
size_t N = 1 << 20;
std::vector<float> bigdata(N);
RNG& rng = theRNG();
int m_max = 1 << 24;
double m_scale = 1./m_max;
for (size_t i = 0; i < N; i++) {
// mantissa in [1, 2), exponent 2^[-127, 127), random sign
double m = (m_max + rng.uniform(0, m_max))*m_scale;
double e = pow(2., rng.uniform(-127, 127));
double s = rng.uniform(0, 2)*2 - 1;
float x = (float)(s*m*e);
bigdata[i] = x;
}
// *0 = old truncation method, *1 = new rounded method
double sum0 = 0, sqsum0 = 0, maxerr0 = 0, maxerr1 = 0, sum1 = 0, sqsum1 = 0;
for (size_t i = 0; i < N; i++) {
float x = bigdata[i];
Cv32suf suf0, suf1;
suf0.f = suf1.f = x;
uint16_t x0 = (uint16_t)(suf0.u >> 16);
bfloat x1 = bfloat(suf1.f);
suf0.u = x0 << 16;
suf1.f = (float)x1;
// relative error; DBL_EPSILON in the denominator guards against x == 0
double err0 = fabs(x - suf0.f)/(fabs(x) + DBL_EPSILON);
double err1 = fabs(x - suf1.f)/(fabs(x) + DBL_EPSILON);
maxerr0 = std::max(maxerr0, err0);
maxerr1 = std::max(maxerr1, err1);
sum0 += err0;
sqsum0 += err0*err0;
sum1 += err1;
sqsum1 += err1*err1;
}
double mean0 = sum0/N;
double stddev0 = sqrt(std::max(sqsum0 - N*mean0*mean0, 0.)/(N-1));
double mean1 = sum1/N;
double stddev1 = sqrt(std::max(sqsum1 - N*mean1*mean1, 0.)/(N-1));
//if (maxerr1 > maxerr0 || mean1 > mean0 || stddev1 > stddev0)
{
printf("maxerr0 = %g, mean0 = %g, stddev0 = %g\nmaxerr1 = %g, mean1 = %g, stddev1 = %g\n",
maxerr0, mean0, stddev0, maxerr1, mean1, stddev1);
}
// the new method must not be worse in any of the three metrics
EXPECT_LE(maxerr1, maxerr0);
EXPECT_LE(mean1, mean0);
EXPECT_LE(stddev1, stddev0);
#if CV_SIMD || CV_SIMD_SCALABLE
//printf("checking vector part ...\n");
// convert the same data via the SIMD v_pack_store and compare with the
// scalar ctor results, expecting exact bit-level agreement
std::vector<bfloat> bfdata(N);
int vlanes = VTraits<v_float32>::vlanes();
for (size_t i = 0; i < N; i += vlanes)
{
v_float32 x = v_load(&bigdata[i]);
v_pack_store(&bfdata[i], x);
}
int vecerr = 0;
for (size_t i = 0; i < N; i++) {
Cv32suf suf0, suf1;
suf0.f = (float)bfloat(bigdata[i]);
suf1.f = (float)bfdata[i];
vecerr += suf0.u != suf1.u;
}
EXPECT_EQ(0, vecerr);
#endif
}
}} // namespace
/* End of file. */