Mirror of https://github.com/opencv/opencv.git
Merge pull request #25076 from fengyuentau:improve_attention
dnn: try improving performance of Attention layer #25076

Checklist:

- [x] Use `Mat` over `Mat::zeros` for the temporary buffer in forward
- [x] Use layer internal buffers over temporary `Mat` buffers
- [x] Try a single fastGemmBatch for the Q/K/V calculation

Performance:

The performance test case is `Layer_Attention.VisionTransformer/0`, which has an input of shape {1, 197, 768}, a weight of shape {768, 2304} and a bias of shape {2304}. Data is in milliseconds.

|                  | macOS 14.2.1, Apple M1 | Ubuntu 22.04.2, Intel i7 12700K |
| ---------------- | ---------------------- | ------------------------------- |
| Current          | 10.96                  | 1.58                            |
| w/ Mat           | 6.27                   | 1.41                            |
| w/ Internals     | 5.87                   | 1.38                            |
| w/ fastGemmBatch | 6.12                   | 2.14                            |

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is an accuracy test, performance test and test data in the opencv_extra repository, if applicable. The patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
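The first two checklist items trade zero-initialized, per-call allocations for reusable, uninitialized memory. A minimal standalone sketch of the distinction (not the layer's code; shapes borrowed from the test case above):

```cpp
#include <opencv2/core.hpp>

int main() {
    const int batch_size = 1, seq_len = 197, hidden_size = 2304;

    // Mat::zeros allocates AND memsets the whole buffer on every call:
    cv::Mat zeroed = cv::Mat::zeros(1, batch_size * seq_len * hidden_size, CV_32F);

    // A plain Mat only allocates; its contents are uninitialized, which is
    // safe for a scratch buffer that is fully overwritten before being read:
    cv::Mat scratch(1, batch_size * seq_len * hidden_size, CV_32F);

    return 0;
}
```

The third variant goes further: by declaring the buffers as layer internals (see the diff below), the allocation itself happens once in the engine rather than on every `forward()` call, which is where the remaining gap between "w/ Mat" and "w/ Internals" in the table comes from.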
Parent: 8f83540018
Commit: 5aa5c39210
```diff
@@ -83,6 +83,19 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
         } else {
             CV_Error(Error::StsBadArg, format("DNN/Attention: invalid output dimension %zu, valid value is 2 or 3", output_ndims));
         }
+
+        const int batch_size_ = input_shape[0], seq_len_ = input_shape[1],
+                  hidden_size_ = weight_shape.back(),
+                  num_heads_ = static_cast<int>(num_heads),
+                  v_head_size_ = static_cast<int>((hidden_size_ - qkv_hidden_sizes[0] - qkv_hidden_sizes[1]) / num_heads);
+
+        MatShape gemm_buffer_shape{batch_size_, seq_len_, hidden_size_},
+                 attention_prob_shape{batch_size_ * num_heads_, seq_len_, seq_len_},
+                 output_buffer_shape{batch_size_ * num_heads_, seq_len_, v_head_size_};
+        internals.assign(1, gemm_buffer_shape);
+        internals.push_back(attention_prob_shape);
+        internals.push_back(output_buffer_shape);
+
         return false;
     }
 
```
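For context, `getMemoryShapes` is the hook OpenCV's DNN engine uses to preallocate blobs: shapes pushed into `internals` come back as ready-made `Mat`s in `forward`. A minimal sketch of the pattern with a hypothetical layer (names are illustrative, not OpenCV source):

```cpp
#include <opencv2/core.hpp>
#include <opencv2/dnn.hpp>

// Hypothetical layer, for illustration only: registering an internal buffer
// in getMemoryShapes() lets the engine allocate it once and reuse it across
// forward() calls, instead of reallocating on every inference.
class MyLayerImpl CV_FINAL : public cv::dnn::Layer {
public:
    bool getMemoryShapes(const std::vector<cv::dnn::MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<cv::dnn::MatShape> &outputs,
                         std::vector<cv::dnn::MatShape> &internals) const CV_OVERRIDE {
        CV_UNUSED(requiredOutputs);
        outputs.assign(1, inputs[0]);    // output has the input's shape
        internals.assign(1, inputs[0]);  // one reusable scratch buffer
        return false;                    // matching the Attention layer above
    }

    void forward(cv::InputArrayOfArrays inputs_arr,
                 cv::OutputArrayOfArrays outputs_arr,
                 cv::OutputArrayOfArrays internals_arr) CV_OVERRIDE {
        std::vector<cv::Mat> inputs, outputs, internals;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
        internals_arr.getMatVector(internals);  // preallocated by the engine
        cv::Mat &scratch = internals[0];        // use instead of Mat::zeros(...)
        // ... compute into scratch, then write outputs[0] ...
        CV_UNUSED(scratch);
    }
};
```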
```diff
@@ -112,9 +125,10 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
             return;
         }
 
-        std::vector<Mat> inputs, outputs;
+        std::vector<Mat> inputs, outputs, internals;
         inputs_arr.getMatVector(inputs);
         outputs_arr.getMatVector(outputs);
+        internals_arr.getMatVector(internals);
 
         // prepack weights
         if (!is_prepacked) {
```
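The `internals` vector arrives in the same order the shapes were registered in `getMemoryShapes` above: `internals[0]` is the Q/K/V gemm buffer, `internals[1]` the attention probabilities, `internals[2]` the per-head output buffer. An illustrative sanity check (not part of the patch):

```cpp
// Illustrative only: the engine hands back one preallocated Mat per shape
// registered in getMemoryShapes(), in declaration order.
CV_Assert(internals.size() == 3);
CV_Assert(internals[0].total() == size_t(batch_size) * seq_len * hidden_size);
CV_Assert(internals[1].total() == size_t(batch_size) * num_heads * seq_len * seq_len);
```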
```diff
@@ -131,7 +145,8 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
         float *packed_weights[3] = {packed_weight_q.data(), packed_weight_k.data(), packed_weight_v.data()};
         size_t packed_weights_size[3] = {packed_weight_q.size() / num_heads, packed_weight_k.size() / num_heads, packed_weight_v.size() / num_heads};
 
-        Mat gemm_buffer = Mat::zeros(1, int(batch_size * seq_len * hidden_size), CV_32F);
+        // Compute Q/K/V
+        auto &gemm_buffer = internals[0];
         auto *Q = gemm_buffer.ptr<float>();
         auto *K = Q + batch_size * seq_len * qkv_hidden_sizes[0];
         auto *V = K + batch_size * seq_len * qkv_hidden_sizes[1];
```
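With the gemm buffer now borrowed from `internals[0]`, Q, K and V are simply three back-to-back regions of one allocation. Working the offsets through with the perf-test shapes, assuming the equal 768/768/768 Q/K/V split of the ViT case (the split is not spelled out in the patch):

```cpp
// Offset arithmetic for input {1, 197, 768} and weight {768, 2304},
// assuming qkv_hidden_sizes = {768, 768, 768} (equal split, ViT-style):
const size_t batch_size = 1, seq_len = 197;
const size_t qkv_hidden_sizes[3] = {768, 768, 768};

const size_t q_offset = 0;                                                      // Q starts the buffer
const size_t k_offset = batch_size * seq_len * qkv_hidden_sizes[0];             // 151296 floats in
const size_t v_offset = k_offset + batch_size * seq_len * qkv_hidden_sizes[1];  // 302592 floats in
// Total: batch_size * seq_len * (768 + 768 + 768) = 453888 floats, one allocation.
```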
```diff
@@ -177,9 +192,8 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
             parallel_for_(Range(0, loops), fn, nstripes);
         }
 
-        // Compute softmax(scale * matmul(Q, K))
-        std::vector<int> attention_prob_shape{int(batch_size * num_heads), int(seq_len), int(seq_len)};
-        Mat attention_prob = Mat::zeros(attention_prob_shape.size(), attention_prob_shape.data(), CV_32F);
+        // Compute Softmax(scale * MatMul(Q, K))
+        auto &attention_prob = internals[1];
         {
             auto *output = attention_prob.ptr<float>();
 
```
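`attention_prob` is a 3D blob of shape [batch * num_heads, seq_len, seq_len] (for a 768-wide ViT with 12 heads that would be [12, 197, 197], though the head count is not stated in the patch), so the softmax axis is its last dimension. A self-contained sketch of a last-axis softmax; the layer itself delegates to the shared `softmax` helper, this only shows the operation:

```cpp
#include <opencv2/core.hpp>
#include <algorithm>
#include <cmath>

// Softmax over the last axis of a dense, continuous CV_32F blob: each row
// of scores is exponentiated (shifted by its max for stability) and
// normalized to sum to 1.
static void softmaxLastAxis(cv::Mat &blob) {
    CV_Assert(blob.isContinuous() && blob.type() == CV_32F);
    const int cols = blob.size[blob.dims - 1];
    const int rows = int(blob.total()) / cols;
    float *p = blob.ptr<float>();
    for (int r = 0; r < rows; ++r, p += cols) {
        const float maxv = *std::max_element(p, p + cols);
        float sum = 0.f;
        for (int c = 0; c < cols; ++c) { p[c] = std::exp(p[c] - maxv); sum += p[c]; }
        for (int c = 0; c < cols; ++c) p[c] /= sum;
    }
}
```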
```diff
@@ -202,12 +216,12 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
                 }
             }, loops * seq_len * qk_head_size * seq_len * (1 / 1024.0));
 
-            // Compute softmax
-            softmax(attention_prob, attention_prob, attention_prob_shape.size() - 1);
+            // Compute softmax on the last dimension
+            softmax(attention_prob, attention_prob, shape(attention_prob).size() - 1);
         }
 
-        // Compute np.matmul(attention_prob, V)
-        Mat output_buffer = Mat::zeros(1, int(batch_size * num_heads * seq_len * qkv_head_sizes[2]), CV_32F);
+        // Compute MatMul(attention_prob, V)
+        auto &output_buffer = internals[2];
         {
             auto *output = outputs[0].ptr<float>();
             auto *output_buff = output_buffer.ptr<float>();
```
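Taken together, the patched `forward` routes all three scratch tensors through engine-managed internals end to end. A paraphrased outline of the data flow (condensed from the hunks above, not verbatim source):

```cpp
// Paraphrased data flow of the patched forward():
//
//   Mat &gemm_buffer    = internals[0];  // [batch, seq, hidden]       holds Q|K|V
//   Mat &attention_prob = internals[1];  // [batch*heads, seq, seq]
//   Mat &output_buffer  = internals[2];  // [batch*heads, seq, v_head_size]
//
//   1. fastGemm: input x prepacked W_q/W_k/W_v (+ bias) -> gemm_buffer
//   2. scale * MatMul(Q, K^T) -> attention_prob, then softmax over the last axis
//   3. MatMul(attention_prob, V) -> output_buffer, then merge heads -> outputs[0]
```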