Merge pull request #27238 from nklskyoy:mha-rope

Rotary positional embeddings #27238

Rotary positional embeddings (RoPE) let an attention block encode the relative position between tokens. They are widely adopted in LLMs such as LLaMA and are equally applicable to Vision Transformers. The ONNX Runtime `com.microsoft.Attention` operator already exposes them via its `do_rotary` flag (see the operator spec below); this PR adds support for that flag to OpenCV's Attention layer.
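
The rotation applied by the kernels in this patch follows the RoFormer formulation: at token position $t$, every even/odd channel pair of a query or key head (head size $d$) is rotated by a position-dependent angle:

$$
\begin{pmatrix} x'_{2i} \\ x'_{2i+1} \end{pmatrix} =
\begin{pmatrix} \cos(t\,\theta_i) & -\sin(t\,\theta_i) \\ \sin(t\,\theta_i) & \cos(t\,\theta_i) \end{pmatrix}
\begin{pmatrix} x_{2i} \\ x_{2i+1} \end{pmatrix},
\qquad \theta_i = 10000^{-2i/d}, \quad i = 0, \dots, d/2 - 1.
$$

Since Q and K are rotated the same way, the attention score between positions $t_1$ and $t_2$ depends only on the offset $t_1 - t_2$, which is what makes the encoding relative.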

_Operator spec: https://github.com/microsoft/onnxruntime/blob/v1.16.1/docs/ContribOperators.md#com.microsoft.Attention_

Materials:
- RoFormer paper: https://arxiv.org/abs/2104.09864
- RoPE for Vision Transformers: https://github.com/naver-ai/rope-vit
- LLaMA reference implementation: https://github.com/meta-llama/llama/blob/main/llama/model.py#L80


Merge with https://github.com/opencv/opencv_extra/pull/1250
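
For reviewers, here is a minimal sketch of how the flag is exercised through the layer API. It mirrors the accuracy test added in this PR; the function name and the placeholder `Mat`s are illustrative only, and real weights and inputs come from the `.npy` blobs in the opencv_extra patch:

```cpp
#include <opencv2/dnn.hpp>
#include <opencv2/dnn/all_layers.hpp>

// qkv_weights / qkv_bias are placeholders; the test loads them from .npy blobs.
void buildRotaryAttentionLayer(const cv::Mat &qkv_weights, const cv::Mat &qkv_bias)
{
    cv::dnn::LayerParams params;
    params.blobs.resize(2);
    params.blobs[0] = qkv_weights;   // packed QKV weights
    params.blobs[1] = qkv_bias;      // packed QKV bias
    params.set("num_heads", 4);
    std::vector<int> qkv_hidden_sizes = {256, 256, 256};
    params.set("qkv_hidden_sizes",
               cv::dnn::DictValue::arrayInt(&qkv_hidden_sizes[0], (int)qkv_hidden_sizes.size()));
    params.set("do_rotary", true);   // the flag added by this PR

    cv::Ptr<cv::dnn::AttentionLayer> layer = cv::dnn::AttentionLayer::create(params);
    // The accuracy test then runs the layer on mha_rope.input.npy and compares the
    // result against mha_rope.output.npy (both added in the opencv_extra patch).
    CV_UNUSED(layer);
}
```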

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
nklskyoy 2025-04-28 18:47:23 +02:00 committed by GitHub
parent 3bfc408995
commit 1df06488b1
2 changed files with 166 additions and 0 deletions


@@ -24,6 +24,105 @@ static void packWeight(size_t num_heads, size_t head_size, size_t input_hidden_s
}
}
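// Applies the rotary embedding in place to a [seq_len, d] slice: for each row t,
// every (data[2*i], data[2*i+1]) pair is rotated by the (sin, cos) values stored
// at the same offset in rotation_table (see precompRotationTable below).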
static void rotationKernel(
float* data, const float* rotation_table,
size_t seq_len, size_t d
)
{
CV_Assert(d % 2 == 0);
const size_t d_half = d / 2;
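// granularity hint for parallel_for_: aim for roughly 1024 pair rotations per stripe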
double nstripes = double(seq_len) * d_half * (1.0/1024.0);
auto fn = [&](const cv::Range& range)
{
for (int t = range.start; t < range.end; ++t)
{
float* out_ptr = data + size_t(t) * d;
const float* table_ptr = rotation_table + size_t(t) * d;
size_t i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const size_t w = VTraits<v_float32>::vlanes();
for (; i + w <= d_half; i += w)
{
v_float32 sin_v, cos_v, x_even, x_odd;
v_load_deinterleave(table_ptr + 2*i, sin_v, cos_v);
v_load_deinterleave(out_ptr + 2*i, x_even, x_odd);
v_float32 out_even = v_sub(v_mul(cos_v, x_even), v_mul(sin_v, x_odd));
v_float32 out_odd = v_add(v_mul(sin_v, x_even), v_mul(cos_v, x_odd));
v_store_interleave(out_ptr + 2*i, out_even, out_odd);
}
#endif
// scalar tail
for (; i < d_half; ++i)
{
float s = table_ptr[2*i ];
float c = table_ptr[2*i+1];
float xe = out_ptr[2*i];
float xo = out_ptr[2*i+1];
out_ptr[2*i] = xe * c - xo * s;
out_ptr[2*i+1] = xo * c + xe * s;
}
}
};
// This will spin up threads and run fn over [0, seq_len)
parallel_for_(cv::Range(0, int(seq_len)), fn, nstripes);
}
static void precompRotationTable(float *data,
size_t seq_len,
size_t d) {
// RoPE (rotary positional embedding) precomputation.
// RoPE encodes token positions with sine and cosine functions and was
// introduced for NLP in https://arxiv.org/pdf/2104.09864
// data is assumed to have shape [seq_len, d]; row pos stores interleaved
// (sin, cos) pairs: data[pos][2*i] = sin(pos * theta_i),
// data[pos][2*i+1] = cos(pos * theta_i), with theta_i = 10000^(-2*i/d).
const float logBase = std::log(10000.0f);
const float inv_d = 1.0f / float(d);
const size_t d_half = d / 2;
for (size_t pos = 0; pos < seq_len; ++pos) {
size_t i = 0;
float* data_ptr = data + pos * d;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const size_t w = VTraits<v_float32>::vlanes();
const v_float32 v_logBase = vx_setall_f32(logBase);
const v_float32 v_inv_d = vx_setall_f32(inv_d);
const v_float32 v_neg2 = vx_setall_f32(-2.0f);
for (; i + w <= d_half; i+=w) {
int idx_buf[VTraits<v_float32>::max_nlanes];
for (int k = 0; k < int(w); ++k)
idx_buf[k] = int(i + k);
// [i, i+1, …, i+w-1]
v_float32 v_idx = v_cvt_f32(vx_load(idx_buf));
// [10_000^(-2i/d), 10_000^(-2(i+1)/d), …, 10_000^(-2(i+w-1)/d)]
v_float32 v_theta = v_exp(v_mul(v_mul(v_neg2, v_mul(v_idx, v_inv_d)), v_logBase));
v_theta = v_mul(vx_setall_f32(float(pos)), v_theta);
v_float32 sin_v, cos_v;
v_sincos(v_theta, sin_v, cos_v);
// store back with interleave
v_store_interleave(data_ptr + 2*i, sin_v, cos_v);
}
#endif
// scalar tail
for (; i < d_half; i+=1)
{
float theta = pos * std::exp(-2.f * i * inv_d * logBase);
data_ptr[2*i ] = std::sin(theta);
data_ptr[2*i + 1] = std::cos(theta);
}
}
}
// Operator spec: https://github.com/microsoft/onnxruntime/blob/v1.16.1/docs/ContribOperators.md#com.microsoft.Attention
class AttentionLayerImpl CV_FINAL : public AttentionLayer {
public:
@@ -52,6 +151,8 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
output_ndims = params.get<int>("output_ndims", 3);
do_rotary = params.get<bool>("do_rotary", false);
is_prepacked = false;
}
@@ -97,6 +198,17 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
internals.push_back(attention_prob_shape);
internals.push_back(output_buffer_shape);
if (do_rotary)
{
CV_Assert(qkv_head_sizes[0] == qkv_head_sizes[1]);
const int d = qkv_head_sizes[0];
CV_Assert(d % 2 == 0);
// q and k share the same head size (asserted above), so one table serves both
MatShape rotation_table_shape{seq_len_, d};
internals.push_back(rotation_table_shape);
}
return false;
}
@@ -154,6 +266,17 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
float *packed_weights[3] = {packed_weight_q.data(), packed_weight_k.data(), packed_weight_v.data()};
size_t packed_weights_size[3] = {packed_weight_q.size() / num_heads, packed_weight_k.size() / num_heads, packed_weight_v.size() / num_heads};
CV_Assert(internals.size() == 3 + (do_rotary ? 1 : 0));
if (do_rotary)
{
// precompute sin/cos table
auto &rope_table = internals.back();
auto *rope_table_data = rope_table.ptr<float>();
// currently, rotary embeddings are supported only when the q and k head sizes are equal
CV_Assert(qkv_head_sizes[0] == qkv_head_sizes[1]);
precompRotationTable(rope_table_data, seq_len, qkv_head_sizes[0]);
}
// Compute Q/K/V
auto &gemm_buffer = internals[0];
@@ -167,6 +290,10 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
const auto *input_data = input.ptr<const float>();
const auto *bias_data = bias.ptr<const float>();
// If do_rotary is false, this refers to internals[2] (the output buffer),
// which is harmless because the table is only read when do_rotary is true.
const auto &rope_table = internals.back();
opt.multi_thread = false;
auto fn = [&](const Range &r) {
for (int i = r.start; i < r.end; i++) {
@@ -194,6 +321,17 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
fastGemm(false, seq_len, head_size, input_hidden_size,
1.f, input_data + input_offset, input_hidden_size,
packed_weight, 1.f, dst + dst_offset, head_size, opt);
if(qkv_index < 2 && do_rotary) {
// apply the rotary embedding in place to the Q or K slice just computed
const auto *rope_table_data = rope_table.ptr<const float>();
rotationKernel(
dst + dst_offset,
rope_table_data,
seq_len,
qkv_head_sizes[qkv_index]
);
}
}
};
@@ -281,6 +419,7 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
size_t input_hidden_size;
size_t hidden_size;
bool do_rotary;
bool is_prepacked;
std::vector<float> packed_weight_q;
std::vector<float> packed_weight_k;
@@ -293,4 +432,5 @@ Ptr<AttentionLayer> AttentionLayer::create(const LayerParams &params) {
return makePtr<AttentionLayerImpl>(params);
}
}} // cv::dnn


@@ -743,6 +743,32 @@ TEST_F(Layer_RNN_Test, get_set_test)
EXPECT_EQ(shape(outputs[1]), shape(nT, nS, nH));
}
TEST(Layer_MHARoPe_Test_Accuracy_with_, Pytorch)
{
Mat QKV = blobFromNPY(_tf("mha_rope.QKV.npy"));
Mat QKV_bias = blobFromNPY(_tf("mha_rope.QKV_bias.npy"));
std::vector<int> qkv_hidden_sizes = { 256, 256, 256 };
LayerParams mhaParams;
mhaParams.blobs.resize(2);
mhaParams.blobs[0] = QKV;
mhaParams.blobs[1] = QKV_bias;
mhaParams.set("num_heads", 4);
mhaParams.set(
"qkv_hidden_sizes",
DictValue::arrayInt(&qkv_hidden_sizes[0], qkv_hidden_sizes.size())
);
mhaParams.set("do_rotary", true);
Ptr<AttentionLayer> layer = AttentionLayer::create(mhaParams);
Mat inp = blobFromNPY(_tf("mha_rope.input.npy"));
std::vector<Mat> inputs(1, inp), outputs;
runLayer(layer, inputs, outputs);
Mat h_t_reference = blobFromNPY(_tf("mha_rope.output.npy"));
normAssert(h_t_reference, outputs[0]);
}
TEST_P(Test_Caffe_layers, Accum)
{
#ifdef OPENCV_DNN_EXTERNAL_PROTOBUF