diff --git a/modules/dnn/src/layers/attention_layer.cpp b/modules/dnn/src/layers/attention_layer.cpp
index 085ec734da..559480d599 100644
--- a/modules/dnn/src/layers/attention_layer.cpp
+++ b/modules/dnn/src/layers/attention_layer.cpp
@@ -83,6 +83,19 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
         } else {
             CV_Error(Error::StsBadArg, format("DNN/Attention: invalid output dimension %zu, valid value is 2 or 3", output_ndims));
         }
+
+        const int batch_size_ = input_shape[0], seq_len_ = input_shape[1],
+                  hidden_size_ = weight_shape.back(),
+                  num_heads_ = static_cast<int>(num_heads),
+                  v_head_size_ = static_cast<int>((hidden_size_ - qkv_hidden_sizes[0] - qkv_hidden_sizes[1]) / num_heads);
+
+        MatShape gemm_buffer_shape{batch_size_, seq_len_, hidden_size_},
+                 attention_prob_shape{batch_size_ * num_heads_, seq_len_, seq_len_},
+                 output_buffer_shape{batch_size_ * num_heads_, seq_len_, v_head_size_};
+        internals.assign(1, gemm_buffer_shape);
+        internals.push_back(attention_prob_shape);
+        internals.push_back(output_buffer_shape);
+
         return false;
     }
 
@@ -112,9 +125,10 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
             return;
         }
 
-        std::vector<Mat> inputs, outputs;
+        std::vector<Mat> inputs, outputs, internals;
         inputs_arr.getMatVector(inputs);
         outputs_arr.getMatVector(outputs);
+        internals_arr.getMatVector(internals);
 
         // prepack weights
         if (!is_prepacked) {
@@ -131,7 +145,8 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
         float *packed_weights[3] = {packed_weight_q.data(), packed_weight_k.data(), packed_weight_v.data()};
         size_t packed_weights_size[3] = {packed_weight_q.size() / num_heads, packed_weight_k.size() / num_heads, packed_weight_v.size() / num_heads};
 
-        Mat gemm_buffer = Mat::zeros(1, int(batch_size * seq_len * hidden_size), CV_32F);
+        // Compute Q/K/V
+        auto &gemm_buffer = internals[0];
         auto *Q = gemm_buffer.ptr<float>();
         auto *K = Q + batch_size * seq_len * qkv_hidden_sizes[0];
         auto *V = K + batch_size * seq_len * qkv_hidden_sizes[1];
@@ -177,9 +192,8 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
             parallel_for_(Range(0, loops), fn, nstripes);
         }
 
-        // Compute softmax(scale * matmul(Q, K))
-        std::vector<int> attention_prob_shape{int(batch_size * num_heads), int(seq_len), int(seq_len)};
-        Mat attention_prob = Mat::zeros(attention_prob_shape.size(), attention_prob_shape.data(), CV_32F);
+        // Compute Softmax(scale * MatMul(Q, K))
+        auto &attention_prob = internals[1];
         {
             auto *output = attention_prob.ptr<float>();
 
@@ -202,12 +216,12 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
                 }
             }, loops * seq_len * qk_head_size * seq_len * (1 / 1024.0));
 
-            // Compute softmax
-            softmax(attention_prob, attention_prob, attention_prob_shape.size() - 1);
+            // Compute softmax on the last dimension
+            softmax(attention_prob, attention_prob, shape(attention_prob).size() - 1);
         }
 
-        // Compute np.matmul(attention_prob, V)
-        Mat output_buffer = Mat::zeros(1, int(batch_size * num_heads * seq_len * qkv_head_sizes[2]), CV_32F);
+        // Compute MatMul(attention_prob, V)
+        auto &output_buffer = internals[2];
         {
             auto *output = outputs[0].ptr<float>();
             auto *output_buff = output_buffer.ptr<float>();
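
The patch replaces the per-call Mat::zeros allocations with internal buffers whose shapes are declared in getMemoryShapes, so the DNN engine pre-allocates them once and hands them to forward through internals_arr. The standalone sketch below (not the OpenCV implementation; the buffer layout, the dummy input values, and the 1/sqrt(qk_head_size) scale are illustrative assumptions) walks through the same three buffers with plain loops over flat float arrays: gemm_buffer holding the packed Q/K/V, attention_prob holding Softmax(scale * MatMul(Q, K^T)), and output_buffer holding MatMul(attention_prob, V).

// Standalone sketch (not the OpenCV implementation): scaled dot-product
// attention over flat float buffers, mirroring the three internal buffers
// the patch declares in getMemoryShapes:
//   gemm_buffer    -> packed Q, K, V                   [batch, seq_len, hidden]
//   attention_prob -> Softmax(scale * MatMul(Q, K^T))  [batch * num_heads, seq_len, seq_len]
//   output_buffer  -> MatMul(attention_prob, V)        [batch * num_heads, seq_len, v_head_size]
// Buffer layout, sizes, and the 1/sqrt(qk_head_size) scale are assumptions for illustration.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int batch = 1, seq_len = 4, num_heads = 2,
              qk_head_size = 3, v_head_size = 3;
    const int hidden = num_heads * (2 * qk_head_size + v_head_size);

    // gemm_buffer: Q, K and V blocks laid out back to back (dummy values here);
    // the real layer fills this block with the packed-GEMM results.
    std::vector<float> gemm_buffer(batch * seq_len * hidden);
    for (size_t i = 0; i < gemm_buffer.size(); ++i)
        gemm_buffer[i] = 0.01f * float(i % 17);
    const float *Q = gemm_buffer.data();
    const float *K = Q + batch * seq_len * num_heads * qk_head_size;
    const float *V = K + batch * seq_len * num_heads * qk_head_size;

    std::vector<float> attention_prob(batch * num_heads * seq_len * seq_len);
    std::vector<float> output_buffer(batch * num_heads * seq_len * v_head_size);
    const float scale = 1.f / std::sqrt(float(qk_head_size));

    for (int bh = 0; bh < batch * num_heads; ++bh) {  // one (batch, head) slice at a time
        const float *q = Q + bh * seq_len * qk_head_size;
        const float *k = K + bh * seq_len * qk_head_size;
        const float *v = V + bh * seq_len * v_head_size;
        float *prob = attention_prob.data() + bh * seq_len * seq_len;
        float *out  = output_buffer.data()  + bh * seq_len * v_head_size;

        // attention_prob = Softmax(scale * MatMul(Q, K^T)), softmax over the last dimension
        for (int i = 0; i < seq_len; ++i) {
            float row_max = -1e30f, row_sum = 0.f;
            for (int j = 0; j < seq_len; ++j) {
                float s = 0.f;
                for (int d = 0; d < qk_head_size; ++d)
                    s += q[i * qk_head_size + d] * k[j * qk_head_size + d];
                prob[i * seq_len + j] = scale * s;
                row_max = std::max(row_max, prob[i * seq_len + j]);
            }
            for (int j = 0; j < seq_len; ++j) {
                prob[i * seq_len + j] = std::exp(prob[i * seq_len + j] - row_max);
                row_sum += prob[i * seq_len + j];
            }
            for (int j = 0; j < seq_len; ++j)
                prob[i * seq_len + j] /= row_sum;
        }

        // output_buffer = MatMul(attention_prob, V)
        for (int i = 0; i < seq_len; ++i)
            for (int d = 0; d < v_head_size; ++d) {
                float s = 0.f;
                for (int j = 0; j < seq_len; ++j)
                    s += prob[i * seq_len + j] * v[j * v_head_size + d];
                out[i * v_head_size + d] = s;
            }
    }

    std::printf("output_buffer[0] = %f\n", output_buffer[0]);
    return 0;
}

Because all three shapes depend only on the input and weight shapes, declaring them in getMemoryShapes lets the engine allocate the scratch Mats up front and reuse them across forward calls, instead of zero-initializing fresh buffers on every invocation as the removed Mat::zeros lines did.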