Merge pull request #26321 from vpisarev:better_bfloat

2x more accurate float => bfloat conversion #26321

There is a magic trick to make float => bfloat conversion more accurate (_original reference needed, is it done this way in PyTorch?_). In simplified form it looks like:

```
uint16_t f2bf(float x) {
    union {
        unsigned u;
        float f;
    } u;
    u.f = x;
    // return (uint16_t)(u.u >> 16); <== the old method before this patch
    return (uint16_t)((u.u + 0x8000) >> 16);
} 
```

it works correctly for almost all valid floating-point values, positive, zero or negative, and even for some extreme cases, like `+/-inf`, `nan` etc. The addition of `0x8000` to integer representation of 32-bit float before retrieving the highest 16 bits reduces the rounding error by ~2x.

The slight problem with this improved method is that the numbers very close to or equal to `+/-FLT_MAX` are mistakenly converted to `+/-inf`, respectively.

This patch implements improved algorithm for `float => bfloat` conversion in scalar and vector form; it fixes the above-mentioned problem using some extra bit magic, i.e. 0x8000 is not added to very big (by absolute value) numbers:

```
// the actual implementation is more efficient,
// without conditions or floating-point operations, see the source code
return (uint16_t)((u.u + (fabsf(x) <= big_threshold ? 0x8000 : 0)) >> 16);
```

The corresponding test has been added as well and this is output from the test:

```
[----------] 1 test from Core_BFloat
[ RUN      ] Core_BFloat.convert
maxerr0 = 0.00774842, mean0 = 0.00190643, stddev0 = 0.00186063
maxerr1 = 0.00389057, mean1 = 0.000952614, stddev1 = 0.000931268
[       OK ] Core_BFloat.convert (7 ms)
```

Here `maxerr0, mean0, stddev0` are for the original method and `maxerr1, mean1, stddev1` are for the new method. As you can see, there is a significant improvement in accuracy.

**Note:**

_Actually, on ~32,000,000 random FP32 numbers with uniformly distributed sign, exponent and mantissa the new method is always at least as accurate as the old one._

The test also checks all the corner cases; on those we see no degradation versus the original method either.

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
Vadim Pisarevsky 2024-10-18 14:46:40 +03:00 committed by GitHub
parent 1696819abb
commit 2f35847960
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 98 additions and 4 deletions

View File

@ -947,7 +947,7 @@ public:
{
Cv32suf in;
in.f = x;
w = (ushort)(in.u >> 16);
w = (ushort)((in.u + (((in.u & 0x7fffffff) <= 0x7f7f7fff) << 15)) >> 16);
}
operator float() const

View File

@ -766,10 +766,11 @@ namespace CV__SIMD_NAMESPACE {
inline void v_pack_store(const bfloat* ptr, const v_float32& v)
{
v_int32 iv = v_shr<16>(v_reinterpret_as_s32(v));
cv::v_pack_store((short*)ptr, iv);
v_int32 av = v_reinterpret_as_s32(v_abs(v));
v_int32 iv = v_reinterpret_as_s32(v);
iv = v_add(iv, v_and(v_le(av, vx_setall_s32(0x7f7f7fff)), vx_setall_s32(0x8000)));
cv::v_pack_store((short*)ptr, v_shr<16>(iv));
}
#endif
/** @brief SIMD processing state cleanup call */

View File

@ -11,6 +11,7 @@
#include <math.h>
#include "opencv2/core/softfloat.hpp"
#include "opencv2/core/core_c.h"
#include "opencv2/core/hal/intrin.hpp"
namespace opencv_test { namespace {
@ -3354,5 +3355,97 @@ TEST(Core_FastMath, InlineIsInf)
EXPECT_EQ( cvIsInf(suf.f), 0);
}
// Verifies that on special values (signed zeros, +/-1, e, +/-FLT_MAX, +/-inf, NaN)
// the rounded float->bfloat conversion (the bfloat ctor) is never less accurate
// than the old truncation-only method.
TEST(Core_BFloat, CornerCases)
{
float data[] = {0.f, -0.f, 1.f, -1.f, expf(1.f), FLT_MAX, -FLT_MAX,
std::numeric_limits<float>::infinity(),
-std::numeric_limits<float>::infinity(),
std::numeric_limits<float>::quiet_NaN()
};
size_t n0 = sizeof(data)/sizeof(data[0]);
for (size_t i = 0; i < n0; i++) {
float x = data[i];
Cv32suf suf0, suf1;
suf0.f = x;
// old method: keep the top 16 bits of the IEEE-754 float bit pattern
uint16_t x0 = (uint16_t)(suf0.u >> 16);
// new method: bfloat ctor (adds the 0x8000 rounding bias for non-huge values)
bfloat x1 = bfloat(x);
// expand both 16-bit results back to 32-bit floats for comparison
suf0.u = x0 << 16;
suf1.f = (float)x1;
//printf("%zu. orig = %f, restored (old) = %f, restored (new) = %f\n", i, x, suf0.f, suf1.f);
// where the two methods produce different bit patterns, the new one must be
// at least as close to the original value as the old one
if (suf0.u != suf1.u) {
EXPECT_LE(fabs(suf1.f - x), fabs(suf0.f - x));
}
}
}
// Statistical accuracy comparison of the old (truncation) and new (rounded)
// float->bfloat conversion on ~1M random floats with uniformly distributed
// sign, exponent and mantissa; also checks that the SIMD v_pack_store path
// matches the scalar bfloat ctor bit-for-bit.
TEST(Core_BFloat, convert)
{
size_t N = 1 << 20;
std::vector<float> bigdata(N);
RNG& rng = theRNG();
int m_max = 1 << 24;
double m_scale = 1./m_max;
for (size_t i = 0; i < N; i++) {
// mantissa in [1, 2), exponent 2^[-127, 127), random sign
double m = (m_max + rng.uniform(0, m_max))*m_scale;
double e = pow(2., rng.uniform(-127, 127));
double s = rng.uniform(0, 2)*2 - 1;
float x = (float)(s*m*e);
bigdata[i] = x;
}
// *0 = old truncation method, *1 = new rounded method
double sum0 = 0, sqsum0 = 0, maxerr0 = 0, maxerr1 = 0, sum1 = 0, sqsum1 = 0;
for (size_t i = 0; i < N; i++) {
float x = bigdata[i];
Cv32suf suf0, suf1;
suf0.f = suf1.f = x;
uint16_t x0 = (uint16_t)(suf0.u >> 16);
bfloat x1 = bfloat(suf1.f);
suf0.u = x0 << 16;
suf1.f = (float)x1;
// relative error; DBL_EPSILON in the denominator guards against x == 0
double err0 = fabs(x - suf0.f)/(fabs(x) + DBL_EPSILON);
double err1 = fabs(x - suf1.f)/(fabs(x) + DBL_EPSILON);
maxerr0 = std::max(maxerr0, err0);
maxerr1 = std::max(maxerr1, err1);
sum0 += err0;
sqsum0 += err0*err0;
sum1 += err1;
sqsum1 += err1*err1;
}
double mean0 = sum0/N;
double stddev0 = sqrt(std::max(sqsum0 - N*mean0*mean0, 0.)/(N-1));
double mean1 = sum1/N;
double stddev1 = sqrt(std::max(sqsum1 - N*mean1*mean1, 0.)/(N-1));
//if (maxerr1 > maxerr0 || mean1 > mean0 || stddev1 > stddev0)
{
printf("maxerr0 = %g, mean0 = %g, stddev0 = %g\nmaxerr1 = %g, mean1 = %g, stddev1 = %g\n",
maxerr0, mean0, stddev0, maxerr1, mean1, stddev1);
}
// the new method must not be worse in any of the three metrics
EXPECT_LE(maxerr1, maxerr0);
EXPECT_LE(mean1, mean0);
EXPECT_LE(stddev1, stddev0);
#if CV_SIMD || CV_SIMD_SCALABLE
//printf("checking vector part ...\n");
// convert the same data via the SIMD v_pack_store and compare with the
// scalar ctor results, expecting exact bit-level agreement
std::vector<bfloat> bfdata(N);
int vlanes = VTraits<v_float32>::vlanes();
for (size_t i = 0; i < N; i += vlanes)
{
v_float32 x = v_load(&bigdata[i]);
v_pack_store(&bfdata[i], x);
}
int vecerr = 0;
for (size_t i = 0; i < N; i++) {
Cv32suf suf0, suf1;
suf0.f = (float)bfloat(bigdata[i]);
suf1.f = (float)bfdata[i];
vecerr += suf0.u != suf1.u;
}
EXPECT_EQ(0, vecerr);
#endif
}
}} // namespace
/* End of file. */