Merge pull request #25238 from fengyuentau:optimized_const

dnn: avoid const layer forwarding in layer norm layer and attention layer #25238

While profiling ViTs with dnn, I found that `ConstLayer` can take a noticeable share of the inference time, which is unexpected. The cost comes from the data copy performed on every inference of `ConstLayer`. We could try to make that copy more efficient, but the easiest and most reliable fix is to avoid `ConstLayer` altogether. This PR changes how constants are handled in the layer normalization and attention layers: they are now stored in the layer blobs instead of being wrapped in dedicated constant layers.
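
To make the idea concrete, below is a minimal, self-contained sketch of the dispatch the changed layers now use. The `Tensor` and `ToyLayerNorm` types are toy stand-ins, not the real `cv::dnn` classes; only the `blobs.empty() ? inputs[i] : blobs[...]` pattern mirrors the actual change.

```cpp
// Toy model of the two paths. Old path: weight/bias arrive as runtime
// inputs produced by ConstLayer, which copies them on every forward.
// New path: they are captured once in the layer's own `blobs`.
#include <cassert>
#include <vector>

struct Tensor { std::vector<float> data; };

struct ToyLayerNorm {
    std::vector<Tensor> blobs;  // constants captured at import time (new path)

    const Tensor& weight(const std::vector<Tensor>& inputs) const {
        return blobs.empty() ? inputs[1] : blobs.front();
    }
    const Tensor& bias(const std::vector<Tensor>& inputs) const {
        return blobs.empty() ? inputs[2] : blobs.back();
    }
};

int main() {
    Tensor x{{1.f, 2.f, 3.f}}, w{{1.f, 1.f, 1.f}}, b{{0.f, 0.f, 0.f}};

    ToyLayerNorm with_blobs;            // new path: no per-inference copy
    with_blobs.blobs = {w, b};
    assert(&with_blobs.weight({x}) == &with_blobs.blobs.front());

    ToyLayerNorm with_inputs;           // old path: w/b re-materialized each run
    std::vector<Tensor> ins{x, w, b};
    assert(&with_inputs.weight(ins) == &ins[1]);
    return 0;
}
```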

Checklists:

- [x] Backend compatibility in layer normalization layer.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There are accuracy tests, performance tests and test data in the opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
Yuantao Feng 2024-03-26 20:09:51 +08:00 committed by GitHub
parent 6e9dcb87c1
commit accf200408
4 changed files with 150 additions and 77 deletions

View File

@@ -28,11 +28,20 @@ namespace cv { namespace dnn { namespace cuda4dnn {
public:
using wrapper_type = GetCUDABackendWrapperType<T>;
LayerNormOp(csl::Stream stream_, int normalized_axis, float epsilon_, size_t loops)
LayerNormOp(csl::Stream stream_, const Mat &scale, const Mat &bias, int normalized_axis, float epsilon_, size_t loops)
: stream(std::move(stream_)), epsilon(epsilon_) {
CV_CheckGE(normalized_axis, 0, "LayerNorm/CUDA: axis needs to be normalized");
axis = static_cast<size_t>(normalized_axis);
if (!scale.empty()) {
input_scale_tensor = csl::makeTensorHeader<T>(scale);
csl::copyMatToTensor<T>(scale, input_scale_tensor, stream);
}
if (!bias.empty()) {
input_bias_tensor = csl::makeTensorHeader<T>(bias);
csl::copyMatToTensor<T>(bias, input_bias_tensor, stream);
}
csl::WorkspaceBuilder builder;
builder.require<float>(loops);
builder.require<float>(loops);
@@ -43,10 +52,25 @@ namespace cv { namespace dnn { namespace cuda4dnn {
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
csl::Workspace& workspace) override {
auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
auto scale_wrapper = inputs[1].dynamicCast<wrapper_type>();
auto input = input_wrapper->getView();
auto scale = scale_wrapper->getView();
csl::TensorView<T> scale;
if (input_scale_tensor.empty()) {
auto scale_wrapper = inputs[1].dynamicCast<wrapper_type>();
scale = scale_wrapper->getView();
} else {
scale = csl::TensorView<T>(input_scale_tensor);
}
csl::TensorView<T> bias;
if (input_bias_tensor.empty()) {
if (inputs.size() >= 3) {
auto bias_wrapper = inputs[2].dynamicCast<wrapper_type>();
bias = bias_wrapper->getView();
}
} else {
bias = csl::TensorView<T>(input_bias_tensor);
}
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
auto output = output_wrapper->getSpan();
@@ -67,9 +91,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
kernels::reduce_mean_sqr_sum<T>(stream, mean, inv_stddev, input, norm_size);
kernels::compute_normalization_scale(stream, inv_stddev, mean, inv_stddev, norm_size, epsilon);
if (inputs.size() == 3) {
auto bias_wrapper = inputs[2].dynamicCast<wrapper_type>();
auto bias = bias_wrapper->getView();
if (!bias.empty()) {
kernels::normalize_mean_variance_layernorm<T>(stream, output, input, scale, bias, mean, inv_stddev, norm_size);
} else {
kernels::normalize_mean_variance_layernorm<T>(stream, output, input, scale, mean, inv_stddev, norm_size);
@@ -81,6 +103,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {
private:
csl::Stream stream;
csl::Tensor<T> input_scale_tensor;
csl::Tensor<T> input_bias_tensor;
float epsilon;
size_t axis;
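
For readers skimming the hunks above, here is the pattern condensed into a toy sketch: plain `std::vector` stands in for `csl::Tensor`/`csl::TensorView`, and the real host-to-device copy (`csl::copyMatToTensor`) is reduced to an assignment. Constant scale/bias are uploaded once at construction, so `forward()` no longer fetches them from input wrappers:

```cpp
// Minimal sketch of the CUDA-side change with stand-in types.
#include <vector>

using HostMat      = std::vector<float>;
using DeviceTensor = std::vector<float>;  // stand-in for csl::Tensor<T>

struct ToyLayerNormOp {
    DeviceTensor input_scale_tensor, input_bias_tensor;  // empty => runtime inputs

    ToyLayerNormOp(const HostMat& scale, const HostMat& bias) {
        if (!scale.empty()) input_scale_tensor = scale;  // one upload at build time
        if (!bias.empty())  input_bias_tensor  = bias;
    }

    void forward(const std::vector<DeviceTensor>& inputs) const {
        DeviceTensor scale, bias;  // stand-ins for csl::TensorView<T>
        if (input_scale_tensor.empty())
            scale = inputs[1];              // variable scale from the graph
        else
            scale = input_scale_tensor;     // constant, uploaded once
        if (!input_bias_tensor.empty())
            bias = input_bias_tensor;
        else if (inputs.size() >= 3)
            bias = inputs[2];
        // ... launch reduce/normalize kernels with scale and bias ...
    }
};
```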

View File

@@ -63,10 +63,11 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE {
CV_CheckEQ(inputs.size(), static_cast<size_t>(3), "DNN/Attention: three inputs are required");
int num_inputs = inputs.size() + blobs.size();
CV_CheckEQ(num_inputs, 3, "DNN/Attention: three inputs are required");
const auto &input_shape = inputs[0];
const auto &weight_shape = inputs[1];
const auto &bias_shape = inputs[2];
const auto &weight_shape = blobs.empty() ? inputs[1] : shape(blobs.front());
const auto &bias_shape = blobs.empty() ? inputs[2] : shape(blobs.back());
CV_CheckEQ(input_shape.size(), static_cast<size_t>(3), "DNN/Attention: invalid input dimension");
CV_CheckEQ(weight_shape.size(), static_cast<size_t>(2), "DNN/Attention: invalid weight dimension");
@@ -109,10 +110,20 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
seq_len = static_cast<size_t>(input_shape[1]);
input_hidden_size = static_cast<size_t>(input_shape[2]);
const auto weight_shape = shape(inputs[1]);
const auto &weight = blobs.empty() ? inputs[1] : blobs.front();
const auto weight_shape = shape(weight);
hidden_size = weight_shape[1];
qkv_hidden_sizes[2] = hidden_size - qkv_hidden_sizes[0] - qkv_hidden_sizes[1];
qkv_head_sizes[2] = static_cast<size_t>(qkv_hidden_sizes[2] / num_heads);
if (!blobs.empty()) {
const auto *weight_data = weight.ptr<const float>();
packWeight(num_heads, qkv_head_sizes[0], input_hidden_size, weight_data, hidden_size, packed_weight_q, opt);
packWeight(num_heads, qkv_head_sizes[1], input_hidden_size, weight_data + qkv_hidden_sizes[0], hidden_size, packed_weight_k, opt);
packWeight(num_heads, qkv_head_sizes[2], input_hidden_size, weight_data + qkv_hidden_sizes[0] + qkv_hidden_sizes[1], hidden_size, packed_weight_v, opt);
is_prepacked = true;
}
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE {
@@ -132,8 +143,7 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
// prepack weights
if (!is_prepacked) {
// prepack
const auto &weight = inputs[1];
const auto &weight = blobs.empty() ? inputs[1] : blobs.front();
const auto *weight_data = weight.ptr<const float>();
packWeight(num_heads, qkv_head_sizes[0], input_hidden_size, weight_data, hidden_size, packed_weight_q, opt);
packWeight(num_heads, qkv_head_sizes[1], input_hidden_size, weight_data + qkv_hidden_sizes[0], hidden_size, packed_weight_k, opt);
@@ -153,7 +163,7 @@ class AttentionLayerImpl CV_FINAL : public AttentionLayer {
float *QKV[3] = {Q, K, V}; // Q, K, V: [B, N, S, H]
{
const auto &input = inputs[0];
const auto &bias = inputs[2];
const auto &bias = blobs.empty() ? inputs[2] : blobs.back();
const auto *input_data = input.ptr<const float>();
const auto *bias_data = bias.ptr<const float>();

View File

@@ -31,6 +31,10 @@ namespace cv { namespace dnn {
// https://github.com/onnx/onnx/blob/main/docs/Operators.md#LayerNormalization
class LayerNormLayerImpl CV_FINAL : public LayerNormLayer
{
#ifdef HAVE_OPENCL
UMat weight_umat, bias_umat;
#endif
public:
LayerNormLayerImpl(const LayerParams& params)
{
@@ -58,22 +62,24 @@ public:
std::vector<MatShape> &internals) const CV_OVERRIDE
{
// check shapes of weight and bias if existed
// inputs >= 2 (X and Weight are requested, bias is optional)
CV_Check(inputs.size(), inputs.size() >= 2 && inputs.size() <= 3, "LayerNorm: require two (x, weight) or three (x, weight, bias) inputs");
// inputs >= 2 (X and Weight are required, bias is optional)
int num_inputs = inputs.size() + blobs.size();
CV_Check(num_inputs, num_inputs >= 2 && num_inputs <= 3, "LayerNorm: require two (x, weight) or three (x, weight, bias) inputs");
auto x_shape = inputs[0];
int x_ndims = static_cast<int>(x_shape.size());
auto w_shape = inputs[1];
// Weight and bias are either constants or variable
auto w_shape = blobs.empty() ? inputs[1] : shape(blobs.front());
// if axis == last_dim, scale and b are both 1d tensor (represented as 2d mat nx1)
int w_ndims = static_cast<int>(w_shape.size());
w_ndims = (axis == x_ndims - 1 && w_ndims == 2) ? w_ndims - 1 : w_ndims;
CV_CheckEQ(x_ndims - axis, w_ndims, "LayerNorm: shape of weight does not match with given axis and shape of input");
for (int i = 0; i < w_ndims; ++i)
CV_CheckEQ(x_shape[axis+i], w_shape[i], "LayerNorm: weight dimensions does not match with input dimensions");
if (inputs.size() == static_cast<int>(3))
if (num_inputs >= 3)
{
auto b_shape = inputs[2];
auto b_shape = blobs.empty() ? inputs[2] : shape(blobs.back());
CV_CheckEQ(w_shape.size(), b_shape.size(), "LayerNorm: shape of weight does not match with shape of bias");
for (size_t i = 0; i < w_shape.size(); ++i)
CV_CheckEQ(w_shape[i], b_shape[i], "LayerNorm: bias dimensions does not match with weight dimensions");
@@ -89,6 +95,11 @@ public:
const auto input_shape = shape(inputs[0]);
axis = normalize_axis(axis, static_cast<int>(input_shape.size()));
#ifdef HAVE_OPENCL
weight_umat.release();
bias_umat.release();
#endif
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
@@ -110,11 +121,11 @@ public:
outputs_arr.getMatVector(outputs);
const auto &input = inputs[0];
const auto &scale = inputs[1];
const auto &scale = blobs.empty() ? inputs[1] : blobs.front();
auto &output = outputs[0];
if (inputs.size() == 3) {
const auto &bias = inputs[2];
if ((inputs.size() + blobs.size()) >= 3) {
const auto &bias = blobs.empty() ? inputs[2] : blobs.back();
fastNorm(input, scale, bias, output, epsilon, static_cast<size_t>(axis));
} else {
fastNorm(input, scale, output, epsilon, static_cast<size_t>(axis));
@@ -129,7 +140,13 @@ public:
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
const auto &input = inputs[0], &scale = inputs[1]; // &bias = inputs[2]; // bias is optional
const auto &input = inputs[0];
// no fp16 support
if (input.depth() == CV_16F) {
return false;
}
auto &output = outputs[0];
const auto input_shape = shape(input);
@@ -137,11 +154,23 @@ public:
norm_size = static_cast<size_t>(total(input_shape, axis));
float inv_norm_size = 1.f / norm_size;
const auto &bias = inputs.size() == 3 ? inputs[2] : UMat::zeros(norm_size, 1, CV_32F);
// no fp16 support
if (input.depth() == CV_16F) {
return false;
if (weight_umat.empty()) {
if (blobs.empty()) {
weight_umat = inputs[1];
} else {
blobs.front().copyTo(weight_umat);
}
}
if (bias_umat.empty()) {
if ((inputs.size() + blobs.size()) == 3) {
if (blobs.empty()) {
bias_umat = inputs[2];
} else {
blobs.back().copyTo(bias_umat);
}
} else {
bias_umat = UMat::zeros(norm_size, 1, CV_32F);
}
}
String base_opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4");
@@ -179,7 +208,7 @@ public:
if (!ret) {
return false;
}
// Calculate instance norm: output = scale * (x - mean) / sqrt(var + eps) + bias
// Calculate instance norm: output = weight * (x - mean) / sqrt(var + eps) + bias
String mvn_kernel_name = format("mvn%d", num_vector);
build_opt += " -DNORM_VARIANCE -DLAYER_NORM -DKERNEL_MVN";
ocl::Kernel mvn_kernel(mvn_kernel_name.c_str(), ocl::dnn::mvn_oclsrc, build_opt);
@@ -192,8 +221,8 @@ public:
mvn_kernel.set(3, (float)epsilon);
mvn_kernel.set(4, ocl::KernelArg::PtrReadOnly(mean));
mvn_kernel.set(5, ocl::KernelArg::PtrReadOnly(mean_square));
mvn_kernel.set(6, ocl::KernelArg::PtrReadOnly(scale));
mvn_kernel.set(7, ocl::KernelArg::PtrReadOnly(bias));
mvn_kernel.set(6, ocl::KernelArg::PtrReadOnly(weight_umat));
mvn_kernel.set(7, ocl::KernelArg::PtrReadOnly(bias_umat));
mvn_kernel.set(8, (int)1);
mvn_kernel.set(9, (float)0.f);
mvn_kernel.set(10, ocl::KernelArg::PtrWriteOnly(output));
@@ -218,15 +247,7 @@ public:
CV_CheckNE(axis, static_cast<int>(input_tensor_desc->GetShape().GetDimNum() - 1), "LayerNorm: CANN does not support axis set as last axis due to 1D mat compatibility issue");
auto scale_tensor_wrapper = inputs[1].dynamicCast<CannBackendWrapper>();
auto scale_tensor_desc = scale_tensor_wrapper->getTensorDesc();
auto bias_tensor_wrapper = inputs[2].dynamicCast<CannBackendWrapper>();
auto bias_tensor_desc = bias_tensor_wrapper->getTensorDesc();
auto last_node = nodes[0].dynamicCast<CannBackendNode>()->getOp();
auto scale_node = nodes[1].dynamicCast<CannBackendNode>()->getOp();
auto bias_node = nodes[2].dynamicCast<CannBackendNode>()->getOp();
auto op = std::make_shared<ge::op::LayerNorm>(name);
@@ -239,12 +260,34 @@ public:
// set inputs : x
op->set_input_x_by_name(*last_node, input_tensor_wrapper->name.c_str());
op->update_input_desc_x(*input_tensor_desc);
// set inputs : gamma
// set inputs : gamma & beta
if (blobs.empty()) {
auto scale_tensor_wrapper = inputs[1].dynamicCast<CannBackendWrapper>();
auto scale_tensor_desc = scale_tensor_wrapper->getTensorDesc();
auto scale_node = nodes[1].dynamicCast<CannBackendNode>()->getOp();
op->set_input_gamma_by_name(*scale_node, scale_tensor_wrapper->name.c_str());
op->update_input_desc_gamma(*scale_tensor_desc);
// set inputs : beta
if (inputs.size() == 3) {
auto bias_tensor_wrapper = inputs[2].dynamicCast<CannBackendWrapper>();
auto bias_tensor_desc = bias_tensor_wrapper->getTensorDesc();
auto bias_node = nodes[2].dynamicCast<CannBackendNode>()->getOp();
op->set_input_beta_by_name(*bias_node, bias_tensor_wrapper->name.c_str());
op->update_input_desc_beta(*bias_tensor_desc);
}
} else {
const auto &scale_mat = blobs.front();
const auto op_const_scale = std::make_shared<CannConstOp>(scale_mat.data, scale_mat.type(), shape(scale_mat), cv::format("%s_w", name.c_str()));
op->set_input_gamma(*(op_const_scale->getOp()));
op->update_input_desc_gamma(*(op_const_scale->getTensorDesc()));
if ((inputs.size() + blobs.size()) >= 3) {
const auto &bias_mat = blobs.back();
const auto op_const_bias = std::make_shared<CannConstOp>(bias_mat.data, bias_mat.type(), shape(bias_mat), cv::format("%s_b", name.c_str()));
op->set_input_beta(*(op_const_bias->getOp()));
op->update_input_desc_beta(*(op_const_bias->getTensorDesc()));
}
}
// set outputs
auto output_desc_y = std::make_shared<ge::TensorDesc>(ge::Shape(), ge::FORMAT_NCHW, ge::DT_FLOAT);
@@ -264,6 +307,7 @@ public:
auto ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
const auto &input_shape = ieInpNode.get_shape();
std::shared_ptr<ov::Node> mvn, result;
ov::Output<ov::Node> scale, bias;
// mvn
// https://docs.openvino.ai/2023.1/openvino_docs_ops_normalization_MVN_6.html
@@ -274,23 +318,33 @@ public:
mvn = std::make_shared<ov::op::v6::MVN>(ieInpNode, axes, normalize_variance, epsilon, ov::op::MVNEpsMode::INSIDE_SQRT);
// layer norm = scale * mvn + bias
auto scale = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
ov::Output<ov::Node> bias;
if (blobs.empty()) {
scale = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
if (nodes.size() == 3) {
bias = nodes[2].dynamicCast<InfEngineNgraphNode>()->node;
}
} else {
auto scale_mat = blobs.front();
const auto scale_shape = shape(scale_mat);
scale = std::make_shared<ov::op::v0::Constant>(ov::element::f32, std::vector<size_t>(scale_shape.begin(), scale_shape.end()), scale_mat.data);
if ((nodes.size() + blobs.size()) == 3) {
auto bias_mat = blobs.back();
const auto bias_shape = shape(bias_mat);
bias = std::make_shared<ov::op::v0::Constant>(ov::element::f32, std::vector<size_t>(bias_shape.begin(), bias_shape.end()), bias_mat.data);
}
}
if (axis == -1 || axis == input_shape.size() - 1) { // special case for 1D tensor (2D mat)
std::vector<int64_t> shared_shape_v(input_shape.size(), 1);
shared_shape_v.back() = -1;
auto shared_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{shared_shape_v.size()}, shared_shape_v.data());
scale = std::make_shared<ov::op::v1::Reshape>(scale, shared_shape, true);
if (nodes.size() == 3) {
if ((nodes.size() + blobs.size()) == 3) {
bias = std::make_shared<ov::op::v1::Reshape>(bias, shared_shape, true);
}
}
result = std::make_shared<ov::op::v1::Multiply>(mvn, scale);
if (nodes.size() == 3) {
if ((nodes.size() + blobs.size()) == 3) {
result = std::make_shared<ov::op::v1::Add>(result, bias);
}
@@ -308,10 +362,12 @@ public:
auto input_shape = input_wrapper->getShape();
size_t loops = static_cast<size_t>(total(input_shape, 0, axis));
return make_cuda_node<cuda4dnn::LayerNormOp>(preferableTarget, std::move(context->stream), axis, epsilon, loops);
const auto scale = blobs.empty() ? Mat() : blobs.front(),
bias = blobs.empty() ? Mat() : blobs.back();
return make_cuda_node<cuda4dnn::LayerNormOp>(preferableTarget, std::move(context->stream), scale, bias, axis, epsilon, loops);
}
#endif // HAVE_CUDA
};
Ptr<LayerNormLayer> LayerNormLayer::create(const LayerParams& params)
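
The OpenCL path above caches constant weight/bias in `UMat`s across forwards and releases the cache in `finalize()`. A toy sketch of that caching, with plain vectors standing in for `cv::Mat`/`cv::UMat` (the real code does a host-to-device upload via `copyTo`):

```cpp
// Minimal sketch of the OpenCL-side caching with stand-in types.
#include <vector>

using Mat  = std::vector<float>;
using UMat = std::vector<float>;  // stand-in for cv::UMat

struct ToyLayerNormOCL {
    std::vector<Mat> blobs;      // [weight, bias] when constants were folded in
    UMat weight_umat, bias_umat; // device-side cache

    void finalize() { weight_umat.clear(); bias_umat.clear(); }  // reshape => re-upload

    void forward_ocl(const std::vector<UMat>& inputs, size_t norm_size) {
        if (weight_umat.empty())
            weight_umat = blobs.empty() ? inputs[1] : UMat(blobs.front());
        if (bias_umat.empty()) {
            if (inputs.size() + blobs.size() == 3)
                bias_umat = blobs.empty() ? inputs[2] : UMat(blobs.back());
            else
                bias_umat = UMat(norm_size, 0.f);  // implicit zero bias
        }
        // ... bind weight_umat / bias_umat as MVN kernel args and run ...
    }
};
```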

View File

@@ -3160,18 +3160,9 @@ void ONNXImporter::parseLayerNorm(LayerParams& layerParams, const opencv_onnx::N
// constants as constant inputs
for (size_t i = 1; i < node_proto.input_size(); i++)
{
if (layer_id.find(node_proto.input(i)) == layer_id.end())
{
if (constBlobs.find(node_proto.input(i)) != constBlobs.end()) {
Mat blob = getBlob(node_proto, i);
LayerParams constParams;
constParams.name = node_proto.input(i);
constParams.type = "Const";
constParams.blobs.push_back(blob);
opencv_onnx::NodeProto proto;
proto.add_output(constParams.name);
addLayer(constParams, proto);
layerParams.blobs.push_back(blob);
}
}
@@ -3935,17 +3926,9 @@ void ONNXImporter::parseAttention(LayerParams& params, const opencv_onnx::NodePr
CV_CheckEQ(param_qkv_hidden_sizes.size(), 3, "ONNXImporter/parseAttention: qkv_hidden_sizes is must and only have three elements");
for (size_t i = 1; i < node_proto.input_size(); i++) {
if (layer_id.find(node_proto.input(i)) == layer_id.end()) {
Mat tensor = getBlob(node_proto, i);
LayerParams const_params;
const_params.name = node_proto.input(i);
const_params.type = "Const";
const_params.blobs.push_back(tensor);
opencv_onnx::NodeProto proto;
proto.add_output(const_params.name);
addLayer(const_params, proto);
if (constBlobs.find(node_proto.input(i)) != constBlobs.end()) {
Mat blob = getBlob(node_proto, i);
params.blobs.push_back(blob);
}
}
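
Finally, the importer-side change in one self-contained sketch (`LayerParams` and the maps are simplified stand-ins for the real `ONNXImporter` members): constant initializers are appended to the consuming layer's own `blobs` instead of being wrapped in standalone `Const` layers:

```cpp
// Toy sketch of the importer change with stand-in types.
#include <map>
#include <string>
#include <vector>

using Mat = std::vector<float>;

struct LayerParams {
    std::string name, type;
    std::vector<Mat> blobs;
};

void foldConstantInputs(LayerParams& layerParams,
                        const std::vector<std::string>& inputNames,
                        const std::map<std::string, Mat>& constBlobs) {
    for (size_t i = 1; i < inputNames.size(); ++i) {
        auto it = constBlobs.find(inputNames[i]);
        if (it != constBlobs.end())
            layerParams.blobs.push_back(it->second);  // was: addLayer("Const", ...)
    }
}
```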