diff --git a/modules/dnn/src/layers/attention_layer.cpp b/modules/dnn/src/layers/attention_layer.cpp
index 2bda1f3b18..16b343dc8d 100644
--- a/modules/dnn/src/layers/attention_layer.cpp
+++ b/modules/dnn/src/layers/attention_layer.cpp
@@ -24,6 +24,105 @@ static void packWeight(size_t num_heads, size_t head_size, size_t input_hidden_s
     }
 }
 
+
+static void rotationKernel(
+    float* data, const float* rotation_table,
+    size_t seq_len, size_t d
+)
+{
+    CV_Assert(d % 2 == 0);
+    const size_t d_half = d / 2;
+
+    double nstripes = double(seq_len) * d_half * (1.0/1024.0);
+
+    auto fn = [&](const cv::Range& range)
+    {
+        for (int t = range.start; t < range.end; ++t)
+        {
+            float* out_ptr = data + size_t(t) * d;
+            const float* table_ptr = rotation_table + size_t(t) * d;
+            size_t i = 0;
+
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            const size_t w = VTraits<v_float32>::vlanes();
+            for (; i + w <= d_half; i += w)
+            {
+                v_float32 sin_v, cos_v, x_even, x_odd;
+                v_load_deinterleave(table_ptr + 2*i, sin_v, cos_v);
+                v_load_deinterleave(out_ptr + 2*i, x_even, x_odd);
+
+                v_float32 out_even = v_sub(v_mul(cos_v, x_even), v_mul(sin_v, x_odd));
+                v_float32 out_odd  = v_add(v_mul(sin_v, x_even), v_mul(cos_v, x_odd));
+
+                v_store_interleave(out_ptr + 2*i, out_even, out_odd);
+            }
+#endif
+            // scalar tail
+            for (; i < d_half; ++i)
+            {
+                float s  = table_ptr[2*i];
+                float c  = table_ptr[2*i + 1];
+                float xe = out_ptr[2*i];
+                float xo = out_ptr[2*i + 1];
+                out_ptr[2*i]     = xe * c - xo * s;
+                out_ptr[2*i + 1] = xo * c + xe * s;
+            }
+        }
+    };
+
+    // This will spin up threads and run fn over [0, seq_len)
+    parallel_for_(cv::Range(0, int(seq_len)), fn, nstripes);
+}
+
+static void precompRotationTable(float *data,
+                                 size_t seq_len,
+                                 size_t d) {
+    // RoPE precomputation
+    // RoPE is a positional encoding method used in transformer models.
+    // It uses sine and cosine functions to encode the position of tokens in a sequence;
+    // it was initially introduced for NLP in https://arxiv.org/pdf/2104.09864
+
+    // assume data is of shape [seq_len, d]
+    const float logBase = std::log(10000.0f);
+    const float inv_d = 1.0f / float(d);
+    const size_t d_half = d / 2;
+    for (size_t pos = 0; pos < seq_len; ++pos) {
+
+        size_t i = 0;
+        float* data_ptr = data + pos * d;
+
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const size_t w = VTraits<v_float32>::vlanes();
+        const v_float32 v_logBase = vx_setall_f32(logBase);
+        const v_float32 v_inv_d = vx_setall_f32(inv_d);
+        const v_float32 v_neg2 = vx_setall_f32(-2.0f);
+
+        for (; i + w <= d_half; i += w) {
+            int idx_buf[VTraits<v_int32>::max_nlanes];
+            for (int k = 0; k < int(w); ++k)
+                idx_buf[k] = int(i + k);
+            // [i, i+1, …, i+w-1]
+            v_float32 v_idx = v_cvt_f32(vx_load(idx_buf));
+            // [10000^(-2i/d), 10000^(-2(i+1)/d), …, 10000^(-2(i+w-1)/d)]
+            v_float32 v_theta = v_exp(v_mul(v_mul(v_neg2, v_mul(v_idx, v_inv_d)), v_logBase));
+            v_theta = v_mul(vx_setall_f32(float(pos)), v_theta);
+            v_float32 sin_v, cos_v;
+            v_sincos(v_theta, sin_v, cos_v);
+            // store back with interleave
+            v_store_interleave(data_ptr + 2*i, sin_v, cos_v);
+        }
+#endif
+        // scalar tail
+        for (; i < d_half; ++i)
+        {
+            float theta = pos * std::exp(-2.f * i * inv_d * logBase);
+            data_ptr[2*i]     = std::sin(theta);
+            data_ptr[2*i + 1] = std::cos(theta);
+        }
+    }
+}
+
+
 // Operator spec: https://github.com/microsoft/onnxruntime/blob/v1.16.1/docs/ContribOperators.md#com.microsoft.Attention
 class AttentionLayerImpl CV_FINAL : public AttentionLayer {
  public:
@@ -52,6 +151,8 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
 
         output_ndims = params.get("output_ndims", 3);
 
+        do_rotary = params.get("do_rotary", false);
+
         is_prepacked = false;
     }
 
@@ -97,6 +198,17 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
         internals.push_back(attention_prob_shape);
         internals.push_back(output_buffer_shape);
 
+        if (do_rotary)
+        {
+            CV_Assert(qkv_head_sizes[0] == qkv_head_sizes[1]);
+            const int d = qkv_head_sizes[0];
+            CV_Assert(d % 2 == 0);
+            // q and k share the same head size (checked above), so use it for the table width
+
+            MatShape rotation_table_shape{seq_len_, d};
+            internals.push_back(rotation_table_shape);
+        }
+
         return false;
     }
 
@@ -154,6 +266,17 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
         float *packed_weights[3] = {packed_weight_q.data(), packed_weight_k.data(), packed_weight_v.data()};
         size_t packed_weights_size[3] = {packed_weight_q.size() / num_heads, packed_weight_k.size() / num_heads, packed_weight_v.size() / num_heads};
 
+        CV_Assert(internals.size() == 3 + (do_rotary ? 1 : 0));
+
+        if (do_rotary)
+        {
+            // precompute sin/cos table
+            auto &rope_table = internals.back();
+            auto *rope_table_data = rope_table.ptr<float>();
+            // currently, rotary embeddings are supported only if the q and k head sizes are equal
+            CV_Assert(qkv_head_sizes[0] == qkv_head_sizes[1]);
+            precompRotationTable(rope_table_data, seq_len, qkv_head_sizes[0]);
+        }
 
         // Compute Q/K/V
         auto &gemm_buffer = internals[0];
@@ -167,6 +290,10 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
             const auto *input_data = input.ptr<const float>();
             const auto *bias_data = bias.ptr<const float>();
 
+            // If do_rotary is false, this refers to internals[2], i.e. the output_buffer;
+            // that is harmless because the table is only read when do_rotary is true.
+            const auto &rope_table = internals.back();
+
             opt.multi_thread = false;
             auto fn = [&](const Range &r) {
                 for (int i = r.start; i < r.end; i++) {
@@ -194,6 +321,17 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
                     fastGemm(false, seq_len, head_size, input_hidden_size,
                              1.f, input_data + input_offset, input_hidden_size,
                              packed_weight, 1.f, dst + dst_offset, head_size, opt);
+
+                    if (qkv_index < 2 && do_rotary) {
+                        // rotate q/k on the fly
+                        const auto *rope_table_data = rope_table.ptr<const float>();
+                        rotationKernel(
+                            dst + dst_offset,
+                            rope_table_data,
+                            seq_len,
+                            qkv_head_sizes[qkv_index]
+                        );
+                    }
                 }
             };
 
@@ -281,6 +419,7 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
     size_t input_hidden_size;
     size_t hidden_size;
 
+    bool do_rotary;
     bool is_prepacked;
     std::vector<float> packed_weight_q;
     std::vector<float> packed_weight_k;
@@ -293,4 +432,5 @@ Ptr<AttentionLayer> AttentionLayer::create(const LayerParams &params) {
     return makePtr<AttentionLayerImpl>(params);
 }
 
+
 }} // cv::dnn
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index d4d27e1eb0..17afa43b79 100644
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -743,6 +743,32 @@ TEST_F(Layer_RNN_Test, get_set_test)
     EXPECT_EQ(shape(outputs[1]), shape(nT, nS, nH));
 }
 
+TEST(Layer_MHARoPe_Test_Accuracy_with_, Pytorch)
+{
+    Mat QKV = blobFromNPY(_tf("mha_rope.QKV.npy"));
+    Mat QKV_bias = blobFromNPY(_tf("mha_rope.QKV_bias.npy"));
+    std::vector<int> qkv_hidden_sizes = { 256, 256, 256 };
+    LayerParams mhaParams;
+    mhaParams.blobs.resize(2);
+    mhaParams.blobs[0] = QKV;
+    mhaParams.blobs[1] = QKV_bias;
+    mhaParams.set("num_heads", 4);
+    mhaParams.set(
+        "qkv_hidden_sizes",
+        DictValue::arrayInt(&qkv_hidden_sizes[0], qkv_hidden_sizes.size())
+    );
+    mhaParams.set("do_rotary", true);
+
+    Ptr<AttentionLayer> layer = AttentionLayer::create(mhaParams);
+    Mat inp = blobFromNPY(_tf("mha_rope.input.npy"));
+    std::vector<Mat> inputs(1, inp), outputs;
+    runLayer(layer, inputs, outputs);
+    Mat h_t_reference = blobFromNPY(_tf("mha_rope.output.npy"));
+    normAssert(h_t_reference, outputs[0]);
+}
+
+
+
 TEST_P(Test_Caffe_layers, Accum)
 {
 #ifdef OPENCV_DNN_EXTERNAL_PROTOBUF
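
For reference, the combined effect of precompRotationTable and rotationKernel on a single q/k slice reduces to the scalar routine below. This is only a sketch for cross-checking the SIMD paths, not part of the patch; ropeReference is a hypothetical helper name, and the snippet assumes the same row-major [seq_len, d] layout with (even, odd) feature pairs and the 10000^(-2i/d) frequency schedule used above.

#include <cmath>
#include <cstddef>
#include <vector>

// Scalar reference: applies RoPE in place to one [seq_len, d] q or k slice (d must be even).
void ropeReference(std::vector<float>& data, size_t seq_len, size_t d)
{
    const float logBase = std::log(10000.0f);
    for (size_t pos = 0; pos < seq_len; ++pos) {
        float* row = data.data() + pos * d;
        for (size_t i = 0; i < d / 2; ++i) {
            // theta = pos * 10000^(-2i/d), the same angle the precomputed table stores as (sin, cos)
            const float theta = float(pos) * std::exp(-2.f * float(i) / float(d) * logBase);
            const float s = std::sin(theta), c = std::cos(theta);
            const float xe = row[2*i], xo = row[2*i + 1];
            row[2*i]     = xe * c - xo * s;  // rotated even component
            row[2*i + 1] = xo * c + xe * s;  // rotated odd component
        }
    }
}

Splitting the table precomputation from the rotation, as the patch does, keeps the per-position sin/cos values computed once per forward pass instead of once per head and per Q/K projection.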