weird memory read

2025-06-07 09:25:45 +08:00 · 2025-05-30 22:13:41 +08:00 · 2025-05-30 22:13:41 +08:00 · 8d116fa7f1
commit 8d116fa7f1
parent 2147287cf5
1 changed files with 124 additions and 31 deletions
--- a/hal/riscv-rvv/src/features2d/fast.cpp
+++ b/hal/riscv-rvv/src/features2d/fast.cpp
@ -14,7 +14,7 @@
 #include <cfloat>


-// #define CV_HAL_RVV_FAST_DEBUG
+#define CV_HAL_RVV_FAST_DEBUG
 #ifdef CV_HAL_RVV_FAST_DEBUG
 #include <iostream>
 #include <cstdio>
@ -52,6 +52,17 @@ void printVectorUint16(T vec, int vl, std::string name) {
    free(data);
 }
 template <typename T>
+void printVectorInt16(T vec, int vl, std::string name) {
+    int16_t* data = (int16_t*)malloc(vl * sizeof(int16_t));
+    __riscv_vse16(data, vec, vl);
+    std::cout << name << ": ";
+    for (int i = 0; i < vl; i++) {
+        std::cout << data[i] << " ";
+    }
+    std::cout << std::endl;
+    free(data);
+}
+template <typename T>
 void printVectorUint32(T vec, int vl, std::string name) {
    uint32_t* data = (uint32_t*)malloc(vl * sizeof(uint32_t));
    __riscv_vse32(data, vec, vl);
@ -108,6 +119,14 @@ void printVector(T vec, int vl, std::string name) {
        printVectorUint16(vec, vl, name);
    } else if constexpr (std::is_same<T, vuint16m8_t>::value) {
        printVectorUint16(vec, vl, name);
+    } else if constexpr (std::is_same<T, vint16m1_t>::value) {
+        printVectorInt16(vec, vl, name);
+    } else if constexpr (std::is_same<T, vint16m2_t>::value) {
+        printVectorInt16(vec, vl, name);
+    } else if constexpr (std::is_same<T, vint16m4_t>::value) {
+        printVectorInt16(vec, vl, name);
+    } else if constexpr (std::is_same<T, vint16m8_t>::value) {
+        printVectorInt16(vec, vl, name);
    } else if constexpr (std::is_same<T, vuint32m1_t>::value) {
        printVectorUint32(vec, vl, name);
    } else if constexpr (std::is_same<T, vuint32m2_t>::value) {
@ -129,24 +148,44 @@ using RVV_VECTOR_TYPE = vuint8m4_t;
 // Since uint16_t range is 0 to 65535, row stride should be less than 65535/6 = 10922
 inline void makeOffsets(int16_t pixel[], vuint16m2_t& v_offset, int64_t row_stride, int patternSize)
 {
-    // set min element (pixel[8] = 0 + row_stride * -3) as the base addr
    uint16_t pixel_u[25];
-    pixel_u[0] = row_stride * 6;
-    pixel_u[1] = 1 + row_stride * 6;
-    pixel_u[2] = 2 + row_stride * 5;
-    pixel_u[3] = 3 + row_stride * 4;
-    pixel_u[4] = 3 + row_stride * 3;
-    pixel_u[5] = 3 + row_stride * 2;
-    pixel_u[6] = 2 + row_stride * 1;
-    pixel_u[7] = 1 + row_stride * 0;
-    pixel_u[8] = 0 + row_stride * 0;
-    pixel_u[9] = -1 + row_stride * 0;
-    pixel_u[10] = -2 + row_stride * 1;
-    pixel_u[11] = -3 + row_stride * 2;
-    pixel_u[12] = -3 + row_stride * 3;
-    pixel_u[13] = -3 + row_stride * 4;
-    pixel_u[14] = -2 + row_stride * 5;
-    pixel_u[15] = -1 + row_stride * 6;
+
+    // set min element (pixel[8] = 0 + row_stride * -3) as the base addr
+    // pixel_u[0] = row_stride * 6;
+    // pixel_u[1] = 1 + row_stride * 6;
+    // pixel_u[2] = 2 + row_stride * 5;
+    // pixel_u[3] = 3 + row_stride * 4;
+    // pixel_u[4] = 3 + row_stride * 3;
+    // pixel_u[5] = 3 + row_stride * 2;
+    // pixel_u[6] = 2 + row_stride * 1;
+    // pixel_u[7] = 1 + row_stride * 0;
+    // pixel_u[8] = 0 + row_stride * 0;
+    // pixel_u[9] = -1 + row_stride * 0;
+    // pixel_u[10] = -2 + row_stride * 1;
+    // pixel_u[11] = -3 + row_stride * 2;
+    // pixel_u[12] = -3 + row_stride * 3;
+    // pixel_u[13] = -3 + row_stride * 4;
+    // pixel_u[14] = -2 + row_stride * 5;
+    // pixel_u[15] = -1 + row_stride * 6;
+
+    // set min element (pixel[9] = -1 + row_stride * -3) as the base addr
+    pixel_u[0] = 1 + row_stride * 6;
+    pixel_u[1] = 2 + row_stride * 6;
+    pixel_u[2] = 3 + row_stride * 5;
+    pixel_u[3] = 4 + row_stride * 4;
+    pixel_u[4] = 4 + row_stride * 3;
+    pixel_u[5] = 4 + row_stride * 2;
+    pixel_u[6] = 3 + row_stride * 1;
+    pixel_u[7] = 2 + row_stride * 0;
+    pixel_u[8] = 1 + row_stride * 0;
+    pixel_u[9] = 0 + row_stride * 0;
+    pixel_u[10] = -1 + row_stride * 1;
+    pixel_u[11] = -2 + row_stride * 2;
+    pixel_u[12] = -2 + row_stride * 3;
+    pixel_u[13] = -2 + row_stride * 4;
+    pixel_u[14] = -1 + row_stride * 5;
+    pixel_u[15] = 0 + row_stride * 6;
+
    for (int i = 16; i < 25; i++)
    {
        pixel_u[i] = pixel_u[i - 16];
@ -154,7 +193,7 @@ inline void makeOffsets(int16_t pixel[], vuint16m2_t& v_offset, int64_t row_stri
    v_offset = __riscv_vle16_v_u16m2(pixel_u, 25);
    for (int i = 0; i < 25; i++)
    {
-        pixel[i] = pixel_u[i] - 3 * row_stride;
+        pixel[i] = pixel_u[i] - 3 * row_stride - 1;
    }
 }

@ -163,22 +202,52 @@ template<typename T> inline T* alignPtr(T* ptr, size_t n=sizeof(T))
    return (T*)(((size_t)ptr + n-1) & -n);
 }

-inline uint8_t cornerScore(const uint8_t* ptr, const vuint16m2_t& v_offset, int64_t row_stride) 
+inline uint8_t cornerScore(const uint8_t* ptr, const vuint16m2_t& v_offset, int64_t row_stride, bool debug = false) 
 {
    const uint32_t K = 8, N = 16 + K + 1;
    uint32_t k, v = ptr[0];
    
    int vl = __riscv_vsetvl_e16m2(N);
-
+    std::string msg;
+    if (debug)
+    {
+        msg = cv::format("riscv fast_16: vl=%d, N=%d", vl, N);
+        CV_LOG_INFO(NULL, msg);
+        std::cout<<"vanilla offset loading" << std::endl;
+        // 3073 3074 2563 2052 1540 1028 515 2 1 0 511 1022 1534 2046 2559 3072 3073 3074 2563 2052 1540 1028 515 2 1
+        uint16_t pixel[25] = {
+            3073, 3074, 2563, 2052, 1540, 1028, 515, 2,
+            1, 0, 511, 1022, 1534, 2046, 2559, 3072,
+            3073, 3074, 2563, 2052, 1540, 1028, 515, 2,
+            1
+        };
+        uint8_t* shift_ptr;
+        shift_ptr = ((uint8_t*)ptr) - 3 * row_stride - 1;
+        for (int i = 0; i < 25; i++)
+        {   
+            std::cout << (int)(shift_ptr[pixel[i]]) << " ";
+        }
+        std::cout << std::endl;
+    }
    // use vloxei16_v to indexed ordered load
    vint16m2_t v_c_pixel = __riscv_vmv_v_x_i16m2((int16_t)v, vl);
    // vloxei only support positive offset
-    vuint8m1_t v_d_u8 = __riscv_vloxei16(ptr - 3 * row_stride, v_offset, vl);
+    vuint8m1_t v_d_u8 = __riscv_vloxei16(ptr - 3 * row_stride - 1, v_offset, vl);
    vuint16m2_t v_d_u16 = __riscv_vzext_vf2(v_d_u8, vl);
    vint16m2_t d = __riscv_vreinterpret_i16m2(v_d_u16);
    // for( k = 0; k < N; k++ )
    //     d[k] = (uint16_t)(v - ptr[pixel[k]]);
+    if (debug)
+    {   
+        printVector(v_offset, vl, "v_offset");
+        printVector(d, vl, "d before sub");
+    }
+    
    d = __riscv_vsub_vv_i16m2(v_c_pixel, d, vl);
+    if (debug) {
+        std::cout << "row_stride: " << row_stride << std::endl;
+        printVector(d, vl, "d");
+    }

    vint16m2_t d_slide = __riscv_vmv_v(d, vl);
    
@ -191,25 +260,42 @@ inline uint8_t cornerScore(const uint8_t* ptr, const vuint16m2_t& v_offset, int6

    for (int i = 0; i < 8; i++)
    {
-        d_slide = __riscv_vslide1down(d, (int16_t)0, vl);
-        ak0 = __riscv_vmin(ak0, d, vl);
-        bk0 = __riscv_vmax(bk0, d, vl);
+        d_slide = __riscv_vslide1down(d_slide, (int16_t)0, vl);
+        ak0 = __riscv_vmin(ak0, d_slide, vl);
+        bk0 = __riscv_vmax(bk0, d_slide, vl);
+    }
+    if(debug) {
+        printVector(ak0, vl, "ak0");
+        printVector(bk0, vl, "bk0");
    }

    q0 = __riscv_vmax(q0, __riscv_vmin(ak0, d, vl), vl);
    q1 = __riscv_vmin(q1, __riscv_vmax(bk0, d, vl), vl);

-    d_slide = __riscv_vslide1down(d, (int16_t)0, vl);
+    if (debug) {
+        printVector(q0, vl, "q0");
+        printVector(q1, vl, "q1");
+    }
+
+    d_slide = __riscv_vslide1down(d_slide, (int16_t)0, vl);
    q0 = __riscv_vmax(q0, __riscv_vmin(ak0, d_slide, vl), vl);
    q1 = __riscv_vmin(q1, __riscv_vmax(bk0, d_slide, vl), vl);

+    if (debug) {
+        printVector(q0, vl, "q0 after slide");
+        printVector(q1, vl, "q1 after slide");
+    }
+
    q1 = __riscv_vrsub(q1, (int16_t)0, vl);
    q0 = __riscv_vmax(q0, q1, vl);

    vint16m1_t res = __riscv_vredmax(q0, __riscv_vmv_s_x_i16m1((int16_t)0, vl), vl);

-    uint8_t result = (uint8_t)__riscv_vmv_x(res);
+    if (debug) {
+        printVector(res, vl, "res");
+    }

+    uint8_t result = (uint8_t)__riscv_vmv_x(res);
    return result;
 }

@ -342,8 +428,15 @@ inline int fast_16(const uchar* src_data, size_t src_step, int width, int height
                        if( (m[k / 8] >> (k % 8)) & 1 )
                        {
                            cornerpos[ncorners++] = j + k;
-                            if(nonmax_suppression)
-                                curr[j + k] = (uchar)cornerScore(ptr + k, v_offset, (int64_t)src_step);
+                            if(nonmax_suppression) {
+                                bool debug = false;
+                                int debug_x = 15;
+                                int debug_y = 357;
+                                debug = (debug_x == i && debug_y == j + k);
+                                curr[j + k] = (uchar)cornerScore(ptr + k, v_offset, (int64_t)src_step, debug);
+                                // msg = cv::format("keypoint = (%d, %d, %f, %f, %d), debug = %d", j + k, i, 7.f, -1.f, curr[j + k], debug);
+                                // CV_LOG_INFO(NULL, msg);
+                            }
                        }
                    }
                }
@ -370,8 +463,8 @@ inline int fast_16(const uchar* src_data, size_t src_step, int width, int height
                score > curr[j-1] && score > curr[j] && score > curr[j+1]) )
            {
                KeyPoint kp((float)j, (float)(i-1), 7.f, -1, (float)score);
-                msg = cv::format("keypoint = (%f, %f, %f, %f, %f)", kp.pt.x, kp.pt.y, kp.size, kp.angle, kp.response);
-                CV_LOG_INFO(NULL, msg);
+                // msg = cv::format("keypoint = (%f, %f, %f, %f, %f)", kp.pt.x, kp.pt.y, kp.size, kp.angle, kp.response);
+                // CV_LOG_INFO(NULL, msg);
                keypoints.push_back(kp);
            }
        }