diff --git a/modules/dnn/src/layers/attention_layer.cpp b/modules/dnn/src/layers/attention_layer.cpp
index 2bda1f3b18..16b343dc8d 100644
--- a/modules/dnn/src/layers/attention_layer.cpp
+++ b/modules/dnn/src/layers/attention_layer.cpp
@@ -24,6 +24,105 @@ static void packWeight(size_t num_heads, size_t head_size, size_t input_hidden_s
     }
 }
 
+
+static void rotationKernel(
+    float* data, const float* rotation_table,
+    size_t seq_len, size_t d
+)
+{
+    CV_Assert(d % 2 == 0);
+    const size_t d_half = d / 2;
+
+    double nstripes = double(seq_len) * d_half * (1.0/1024.0);
+
+    auto fn = [&](const cv::Range& range)
+    {
+        for (int t = range.start; t < range.end; ++t)
+        {
+            float* out_ptr = data + size_t(t) * d;
+            const float* table_ptr = rotation_table + size_t(t) * d;
+            size_t i = 0;
+
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            const size_t w = VTraits<v_float32>::vlanes();
+            for (; i + w <= d_half; i += w)
+            {
+                v_float32 sin_v, cos_v, x_even, x_odd;
+                v_load_deinterleave(table_ptr + 2*i, sin_v, cos_v);
+                v_load_deinterleave(out_ptr + 2*i, x_even, x_odd);
+
+                v_float32 out_even = v_sub(v_mul(cos_v, x_even), v_mul(sin_v, x_odd));
+                v_float32 out_odd  = v_add(v_mul(sin_v, x_even), v_mul(cos_v, x_odd));
+
+                v_store_interleave(out_ptr + 2*i, out_even, out_odd);
+            }
+#endif
+            // scalar tail
+            for (; i < d_half; ++i)
+            {
+                float s  = table_ptr[2*i];
+                float c  = table_ptr[2*i + 1];
+                float xe = out_ptr[2*i];
+                float xo = out_ptr[2*i + 1];
+                out_ptr[2*i]     = xe * c - xo * s;
+                out_ptr[2*i + 1] = xo * c + xe * s;
+            }
+        }
+    };
+
+    // This will spin up threads and run fn over [0, seq_len)
+    parallel_for_(cv::Range(0, int(seq_len)), fn, nstripes);
+}
+
+static void precompRotationTable(float *data,
+                                 size_t seq_len,
+                                 size_t d) {
+    // RoPE precomputation
+    // RoPE is a positional encoding method used in transformer models.
+    // It uses sine and cosine functions to encode the position of tokens in a sequence;
+    // it was initially introduced for NLP in https://arxiv.org/pdf/2104.09864
+
+    // assume data is of shape [seq_len, d]
+    const float logBase = std::log(10000.0f);
+    const float inv_d = 1.0f / float(d);
+    const size_t d_half = d / 2;
+    for (size_t pos = 0; pos < seq_len; ++pos) {
+
+        size_t i = 0;
+        float* data_ptr = data + pos * d;
+
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const size_t w = VTraits<v_float32>::vlanes();
+        const v_float32 v_logBase = vx_setall_f32(logBase);
+        const v_float32 v_inv_d = vx_setall_f32(inv_d);
+        const v_float32 v_neg2 = vx_setall_f32(-2.0f);
+
+        for (; i + w <= d_half; i += w) {
+            int idx_buf[VTraits<v_int32>::max_nlanes];
+            for (int k = 0; k < int(w); ++k)
+                idx_buf[k] = int(i + k);
+            // [i, i+1, …, i+w-1]
+            v_float32 v_idx = v_cvt_f32(vx_load(idx_buf));
+            // [10000^(-2i/d), 10000^(-2(i+1)/d), …, 10000^(-2(i+w-1)/d)]
+            v_float32 v_theta = v_exp(v_mul(v_mul(v_neg2, v_mul(v_idx, v_inv_d)), v_logBase));
+            v_theta = v_mul(vx_setall_f32(float(pos)), v_theta);
+            v_float32 sin_v, cos_v;
+            v_sincos(v_theta, sin_v, cos_v);
+            // store back with interleave
+            v_store_interleave(data_ptr + 2*i, sin_v, cos_v);
+        }
+#endif
+        // scalar tail
+        for (; i < d_half; ++i)
+        {
+            float theta = pos * std::exp(-2.f * i * inv_d * logBase);
+            data_ptr[2*i]     = std::sin(theta);
+            data_ptr[2*i + 1] = std::cos(theta);
+        }
+    }
+}
+
+
 // Operator spec: https://github.com/microsoft/onnxruntime/blob/v1.16.1/docs/ContribOperators.md#com.microsoft.Attention
 class AttentionLayerImpl CV_FINAL : public AttentionLayer {
  public:
@@ -52,6 +151,8 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
 
         output_ndims = params.get("output_ndims", 3);
 
+        do_rotary = params.get("do_rotary", false);
+
         is_prepacked = false;
     }
 
@@ -97,6 +198,17 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
         internals.push_back(attention_prob_shape);
         internals.push_back(output_buffer_shape);
 
+        if (do_rotary)
+        {
+            CV_Assert(qkv_head_sizes[0] == qkv_head_sizes[1]);
+            const int d = qkv_head_sizes[0];
+            CV_Assert(d % 2 == 0);
+            // q and k share the same head size (checked above), so use it for the table width
+
+            MatShape rotation_table_shape{seq_len_, d};
+            internals.push_back(rotation_table_shape);
+        }
+
         return false;
     }
 
@@ -154,6 +266,17 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
         float *packed_weights[3] = {packed_weight_q.data(), packed_weight_k.data(), packed_weight_v.data()};
         size_t packed_weights_size[3] = {packed_weight_q.size() / num_heads, packed_weight_k.size() / num_heads, packed_weight_v.size() / num_heads};
 
+        CV_Assert(internals.size() == 3 + (do_rotary ? 1 : 0));
+
+        if (do_rotary)
+        {
+            // precompute sin/cos table
+            auto &rope_table = internals.back();
+            auto *rope_table_data = rope_table.ptr<float>();
+            // currently, rotary embeddings are supported only if the q and k head sizes are equal
+            CV_Assert(qkv_head_sizes[0] == qkv_head_sizes[1]);
+            precompRotationTable(rope_table_data, seq_len, qkv_head_sizes[0]);
+        }
 
         // Compute Q/K/V
         auto &gemm_buffer = internals[0];
@@ -167,6 +290,10 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
             const auto *input_data = input.ptr<const float>();
             const auto *bias_data = bias.ptr<const float>();
 
+            // If do_rotary is false, this refers to internals[2], i.e. the output_buffer;
+            // that is harmless because the table is only read when do_rotary is true.
+            const auto &rope_table = internals.back();
+
             opt.multi_thread = false;
             auto fn = [&](const Range &r) {
                 for (int i = r.start; i < r.end; i++) {
@@ -194,6 +321,17 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
                     fastGemm(false, seq_len, head_size, input_hidden_size,
                              1.f, input_data + input_offset, input_hidden_size,
                              packed_weight, 1.f, dst + dst_offset, head_size, opt);
+
+                    if (qkv_index < 2 && do_rotary) {
+                        // rotate q/k on the fly
+                        const auto *rope_table_data = rope_table.ptr<const float>();
+                        rotationKernel(
+                            dst + dst_offset,
+                            rope_table_data,
+                            seq_len,
+                            qkv_head_sizes[qkv_index]
+                        );
+                    }
                 }
             };
 
@@ -281,6 +419,7 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
     size_t input_hidden_size;
     size_t hidden_size;
 
+    bool do_rotary;
     bool is_prepacked;
     std::vector<float> packed_weight_q;
     std::vector<float> packed_weight_k;
@@ -293,4 +432,5 @@ Ptr<AttentionLayer> AttentionLayer::create(const LayerParams &params) {
     return makePtr<AttentionLayerImpl>(params);
 }
 
+
 }} // cv::dnn
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index d4d27e1eb0..17afa43b79 100644
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -743,6 +743,32 @@ TEST_F(Layer_RNN_Test, get_set_test)
     EXPECT_EQ(shape(outputs[1]), shape(nT, nS, nH));
 }
 
+TEST(Layer_MHARoPe_Test_Accuracy_with_, Pytorch)
+{
+    Mat QKV = blobFromNPY(_tf("mha_rope.QKV.npy"));
+    Mat QKV_bias = blobFromNPY(_tf("mha_rope.QKV_bias.npy"));
+    std::vector<int> qkv_hidden_sizes = { 256, 256, 256 };
+    LayerParams mhaParams;
+    mhaParams.blobs.resize(2);
+    mhaParams.blobs[0] = QKV;
+    mhaParams.blobs[1] = QKV_bias;
+    mhaParams.set("num_heads", 4);
+    mhaParams.set(
+        "qkv_hidden_sizes",
+        DictValue::arrayInt(&qkv_hidden_sizes[0], qkv_hidden_sizes.size())
+    );
+    mhaParams.set("do_rotary", true);
+
+    Ptr<AttentionLayer> layer = AttentionLayer::create(mhaParams);
+    Mat inp = blobFromNPY(_tf("mha_rope.input.npy"));
+    std::vector<Mat> inputs(1, inp), outputs;
+    runLayer(layer, inputs, outputs);
+    Mat h_t_reference = blobFromNPY(_tf("mha_rope.output.npy"));
+    normAssert(h_t_reference, outputs[0]);
+}
+
+
+
 TEST_P(Test_Caffe_layers, Accum)
 {
 #ifdef OPENCV_DNN_EXTERNAL_PROTOBUF
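
For reference, the combined effect of precompRotationTable and rotationKernel on a single q/k slice reduces to the scalar routine below. This is only a sketch for cross-checking the SIMD paths, not part of the patch; ropeReference is a hypothetical helper name, and the snippet assumes the same row-major [seq_len, d] layout with (even, odd) feature pairs and the 10000^(-2i/d) frequency schedule used above.

#include <cmath>
#include <cstddef>
#include <vector>

// Scalar reference: applies RoPE in place to one [seq_len, d] q or k slice (d must be even).
void ropeReference(std::vector<float>& data, size_t seq_len, size_t d)
{
    const float logBase = std::log(10000.0f);
    for (size_t pos = 0; pos < seq_len; ++pos) {
        float* row = data.data() + pos * d;
        for (size_t i = 0; i < d / 2; ++i) {
            // theta = pos * 10000^(-2i/d), the same angle the precomputed table stores as (sin, cos)
            const float theta = float(pos) * std::exp(-2.f * float(i) / float(d) * logBase);
            const float s = std::sin(theta), c = std::cos(theta);
            const float xe = row[2*i], xo = row[2*i + 1];
            row[2*i]     = xe * c - xo * s;  // rotated even component
            row[2*i + 1] = xo * c + xe * s;  // rotated odd component
        }
    }
}

Splitting the table precomputation from the rotation, as the patch does, keeps the per-position sin/cos values computed once per forward pass instead of once per head and per Q/K projection.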