Merge pull request #27238 from nklskyoy:mha-rope

Rotary positional embeddings #27238

Rotary positional embeddings (RoPE) let an attention block encode the relative position between tokens. They are widely adopted in LLMs such as LLaMA and are equally applicable to Vision Transformers. The ONNX Runtime `com.microsoft.Attention` operator already exposes them via its `do_rotary` flag (see the operator spec below); this PR adds support for that flag to OpenCV's Attention layer.
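
The rotation applied by the kernels in this patch follows the RoFormer formulation: at token position $t$, every even/odd channel pair of a query or key head (head size $d$) is rotated by a position-dependent angle:

$$
\begin{pmatrix} x'_{2i} \\ x'_{2i+1} \end{pmatrix} =
\begin{pmatrix} \cos(t\,\theta_i) & -\sin(t\,\theta_i) \\ \sin(t\,\theta_i) & \cos(t\,\theta_i) \end{pmatrix}
\begin{pmatrix} x_{2i} \\ x_{2i+1} \end{pmatrix},
\qquad \theta_i = 10000^{-2i/d}, \quad i = 0, \dots, d/2 - 1.
$$

Since Q and K are rotated the same way, the attention score between positions $t_1$ and $t_2$ depends only on the offset $t_1 - t_2$, which is what makes the encoding relative.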

_Operator spec: https://github.com/microsoft/onnxruntime/blob/v1.16.1/docs/ContribOperators.md#com.microsoft.Attention_

Materials:
- RoFormer paper: https://arxiv.org/abs/2104.09864
- RoPE for Vision Transformers: https://github.com/naver-ai/rope-vit
- LLaMA reference implementation: https://github.com/meta-llama/llama/blob/main/llama/model.py#L80


Merge with https://github.com/opencv/opencv_extra/pull/1250
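
For reviewers, here is a minimal sketch of how the flag is exercised through the layer API. It mirrors the accuracy test added in this PR; the function name and the placeholder `Mat`s are illustrative only, and real weights and inputs come from the `.npy` blobs in the opencv_extra patch:

```cpp
#include <opencv2/dnn.hpp>
#include <opencv2/dnn/all_layers.hpp>

// qkv_weights / qkv_bias are placeholders; the test loads them from .npy blobs.
void buildRotaryAttentionLayer(const cv::Mat &qkv_weights, const cv::Mat &qkv_bias)
{
    cv::dnn::LayerParams params;
    params.blobs.resize(2);
    params.blobs[0] = qkv_weights;   // packed QKV weights
    params.blobs[1] = qkv_bias;      // packed QKV bias
    params.set("num_heads", 4);
    std::vector<int> qkv_hidden_sizes = {256, 256, 256};
    params.set("qkv_hidden_sizes",
               cv::dnn::DictValue::arrayInt(&qkv_hidden_sizes[0], (int)qkv_hidden_sizes.size()));
    params.set("do_rotary", true);   // the flag added by this PR

    cv::Ptr<cv::dnn::AttentionLayer> layer = cv::dnn::AttentionLayer::create(params);
    // The accuracy test then runs the layer on mha_rope.input.npy and compares the
    // result against mha_rope.output.npy (both added in the opencv_extra patch).
    CV_UNUSED(layer);
}
```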

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
nklskyoy 2025-04-28 18:47:23 +02:00 committed by GitHub
parent 3bfc408995
commit 1df06488b1
2 changed files with 166 additions and 0 deletions


@@ -24,6 +24,105 @@ static void packWeight(size_t num_heads, size_t head_size, size_t input_hidden_s
}
}
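// Applies the rotary embedding in place to a [seq_len, d] slice: for each row t,
// every (data[2*i], data[2*i+1]) pair is rotated by the (sin, cos) values stored
// at the same offset in rotation_table (see precompRotationTable below).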
static void rotationKernel(
float* data, const float* rotation_table,
size_t seq_len, size_t d
)
{
CV_Assert(d % 2 == 0);
const size_t d_half = d / 2;
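// granularity hint for parallel_for_: aim for roughly 1024 pair rotations per stripe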
double nstripes = double(seq_len) * d_half * (1.0/1024.0);
auto fn = [&](const cv::Range& range)
{
for (int t = range.start; t < range.end; ++t)
{
float* out_ptr = data + size_t(t) * d;
const float* table_ptr = rotation_table + size_t(t) * d;
size_t i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const size_t w = VTraits<v_float32>::vlanes();
for (; i + w <= d_half; i += w)
{
v_float32 sin_v, cos_v, x_even, x_odd;
v_load_deinterleave(table_ptr + 2*i, sin_v, cos_v);
v_load_deinterleave(out_ptr + 2*i, x_even, x_odd);
v_float32 out_even = v_sub(v_mul(cos_v, x_even), v_mul(sin_v, x_odd));
v_float32 out_odd = v_add(v_mul(sin_v, x_even), v_mul(cos_v, x_odd));
v_store_interleave(out_ptr + 2*i, out_even, out_odd);
}
#endif
// scalar tail
for (; i < d_half; ++i)
{
float s = table_ptr[2*i ];
float c = table_ptr[2*i+1];
float xe = out_ptr[2*i];
float xo = out_ptr[2*i+1];
out_ptr[2*i] = xe * c - xo * s;
out_ptr[2*i+1] = xo * c + xe * s;
}
}
};
// This will spin up threads and run fn over [0, seq_len)
parallel_for_(cv::Range(0, int(seq_len)), fn, nstripes);
}
static void precompRotationTable(float *data,
size_t seq_len,
size_t d) {
// RoPE (rotary positional embedding) precomputation.
// RoPE encodes token positions with sine and cosine functions and was
// introduced for NLP in https://arxiv.org/pdf/2104.09864
// data is assumed to have shape [seq_len, d]; row pos stores interleaved
// (sin, cos) pairs: data[pos][2*i] = sin(pos * theta_i),
// data[pos][2*i+1] = cos(pos * theta_i), with theta_i = 10000^(-2*i/d).
const float logBase = std::log(10000.0f);
const float inv_d = 1.0f / float(d);
const size_t d_half = d / 2;
for (size_t pos = 0; pos < seq_len; ++pos) {
size_t i = 0;
float* data_ptr = data + pos * d;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const size_t w = VTraits<v_float32>::vlanes();
const v_float32 v_logBase = vx_setall_f32(logBase);
const v_float32 v_inv_d = vx_setall_f32(inv_d);
const v_float32 v_neg2 = vx_setall_f32(-2.0f);
for (; i + w <= d_half; i+=w) {
int idx_buf[VTraits<v_float32>::max_nlanes];
for (int k = 0; k < int(w); ++k)
idx_buf[k] = int(i + k);
// [i, i+1, …, i+w-1]
v_float32 v_idx = v_cvt_f32(vx_load(idx_buf));
// [10_000^(-2i/d), 10_000^(-2(i+1)/d), …, 10_000^(-2(i+w-1)/d)]
v_float32 v_theta = v_exp(v_mul(v_mul(v_neg2, v_mul(v_idx, v_inv_d)), v_logBase));
v_theta = v_mul(vx_setall_f32(float(pos)), v_theta);
v_float32 sin_v, cos_v;
v_sincos(v_theta, sin_v, cos_v);
// store back with interleave
v_store_interleave(data_ptr + 2*i, sin_v, cos_v);
}
#endif
// scalar tail
for (; i < d_half; i+=1)
{
float theta = pos * std::exp(-2.f * i * inv_d * logBase);
data_ptr[2*i ] = std::sin(theta);
data_ptr[2*i + 1] = std::cos(theta);
}
}
}
// Operator spec: https://github.com/microsoft/onnxruntime/blob/v1.16.1/docs/ContribOperators.md#com.microsoft.Attention
class AttentionLayerImpl CV_FINAL : public AttentionLayer {
public:
@@ -52,6 +151,8 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
output_ndims = params.get<int>("output_ndims", 3);
do_rotary = params.get<bool>("do_rotary", false);
is_prepacked = false;
}
@@ -97,6 +198,17 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
internals.push_back(attention_prob_shape);
internals.push_back(output_buffer_shape);
if (do_rotary)
{
CV_Assert(qkv_head_sizes[0] == qkv_head_sizes[1]);
const int d = qkv_head_sizes[0];
CV_Assert(d % 2 == 0);
// q and k share the same head size (asserted above), so one table serves both
MatShape rotation_table_shape{seq_len_, d};
internals.push_back(rotation_table_shape);
}
return false;
}
@@ -154,6 +266,17 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
float *packed_weights[3] = {packed_weight_q.data(), packed_weight_k.data(), packed_weight_v.data()};
size_t packed_weights_size[3] = {packed_weight_q.size() / num_heads, packed_weight_k.size() / num_heads, packed_weight_v.size() / num_heads};
CV_Assert(internals.size() == 3 + (do_rotary ? 1 : 0));
if (do_rotary)
{
// precompute sin/cos table
auto &rope_table = internals.back();
auto *rope_table_data = rope_table.ptr<float>();
// currently, rotary embeddings are supported only when the q and k head sizes are equal
CV_Assert(qkv_head_sizes[0] == qkv_head_sizes[1]);
precompRotationTable(rope_table_data, seq_len, qkv_head_sizes[0]);
}
// Compute Q/K/V
auto &gemm_buffer = internals[0];
@@ -167,6 +290,10 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
const auto *input_data = input.ptr<const float>();
const auto *bias_data = bias.ptr<const float>();
// If do_rotary is false, this refers to internals[2] (the output buffer),
// which is harmless because the table is only read when do_rotary is true.
const auto &rope_table = internals.back();
opt.multi_thread = false;
auto fn = [&](const Range &r) {
for (int i = r.start; i < r.end; i++) {
@@ -194,6 +321,17 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
fastGemm(false, seq_len, head_size, input_hidden_size,
1.f, input_data + input_offset, input_hidden_size,
packed_weight, 1.f, dst + dst_offset, head_size, opt);
if(qkv_index < 2 && do_rotary) {
// apply the rotary embedding in place to the Q or K slice just computed
const auto *rope_table_data = rope_table.ptr<const float>();
rotationKernel(
dst + dst_offset,
rope_table_data,
seq_len,
qkv_head_sizes[qkv_index]
);
}
}
};
@@ -281,6 +419,7 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
size_t input_hidden_size;
size_t hidden_size;
bool do_rotary;
bool is_prepacked;
std::vector<float> packed_weight_q;
std::vector<float> packed_weight_k;
@@ -293,4 +432,5 @@ Ptr<AttentionLayer> AttentionLayer::create(const LayerParams &params) {
return makePtr<AttentionLayerImpl>(params);
}
}} // cv::dnn


@@ -743,6 +743,32 @@ TEST_F(Layer_RNN_Test, get_set_test)
EXPECT_EQ(shape(outputs[1]), shape(nT, nS, nH));
}
TEST(Layer_MHARoPe_Test_Accuracy_with_, Pytorch)
{
Mat QKV = blobFromNPY(_tf("mha_rope.QKV.npy"));
Mat QKV_bias = blobFromNPY(_tf("mha_rope.QKV_bias.npy"));
std::vector<int> qkv_hidden_sizes = { 256, 256, 256 };
LayerParams mhaParams;
mhaParams.blobs.resize(2);
mhaParams.blobs[0] = QKV;
mhaParams.blobs[1] = QKV_bias;
mhaParams.set("num_heads", 4);
mhaParams.set(
"qkv_hidden_sizes",
DictValue::arrayInt(&qkv_hidden_sizes[0], qkv_hidden_sizes.size())
);
mhaParams.set("do_rotary", true);
Ptr<AttentionLayer> layer = AttentionLayer::create(mhaParams);
Mat inp = blobFromNPY(_tf("mha_rope.input.npy"));
std::vector<Mat> inputs(1, inp), outputs;
runLayer(layer, inputs, outputs);
Mat h_t_reference = blobFromNPY(_tf("mha_rope.output.npy"));
normAssert(h_t_reference, outputs[0]);
}
TEST_P(Test_Caffe_layers, Accum)
{
#ifdef OPENCV_DNN_EXTERNAL_PROTOBUF