mirror of
https://github.com/opencv/opencv.git
synced 2025-06-07 17:44:04 +08:00

DNN: avoid CV_16S usage for FP16 #24892 **Merge after**: #24918 TODO: - [x] measure performance changes - [x] optimize convertTo for OpenCL: #24918 12700K iGPU: |Name of Test|0|1|1 vs 0 (x-factor)| |---|:-:|:-:|:-:| |AlexNet::DNNTestNetwork::OCV/OCL_FP16|7.441|7.480|0.99| |CRNN::DNNTestNetwork::OCV/OCL_FP16|10.776|10.736|1.00| |DenseNet_121::DNNTestNetwork::OCV/OCL_FP16|52.762|52.833|1.00| |EAST_text_detection::DNNTestNetwork::OCV/OCL_FP16|60.694|60.721|1.00| |EfficientNet::DNNTestNetwork::OCV/OCL_FP16|33.373|33.173|1.01| |FastNeuralStyle_eccv16::DNNTestNetwork::OCV/OCL_FP16|81.840|81.724|1.00| |GoogLeNet::DNNTestNetwork::OCV/OCL_FP16|20.965|20.927|1.00| |Inception_5h::DNNTestNetwork::OCV/OCL_FP16|22.204|22.173|1.00| |Inception_v2_SSD_TensorFlow::DNNTestNetwork::OCV/OCL_FP16|47.115|47.460|0.99| |MPHand::DNNTestNetwork::OCV/OCL_FP16|6.760|6.670|1.01| |MPPalm::DNNTestNetwork::OCV/OCL_FP16|10.188|10.171|1.00| |MPPose::DNNTestNetwork::OCV/OCL_FP16|12.510|12.561|1.00| |MobileNet_SSD_Caffe::DNNTestNetwork::OCV/OCL_FP16|17.290|17.072|1.01| |MobileNet_SSD_v1_TensorFlow::DNNTestNetwork::OCV/OCL_FP16|19.473|19.306|1.01| |MobileNet_SSD_v2_TensorFlow::DNNTestNetwork::OCV/OCL_FP16|22.874|23.404|0.98| |OpenFace::DNNTestNetwork::OCV/OCL_FP16|9.568|9.517|1.01| |OpenPose_pose_mpi_faster_4_stages::DNNTestNetwork::OCV/OCL_FP16|539.899|539.845|1.00| |PPHumanSeg::DNNTestNetwork::OCV/OCL_FP16|18.015|18.769|0.96| |PPOCRv3::DNNTestNetwork::OCV/OCL_FP16|63.122|63.540|0.99| |ResNet_50::DNNTestNetwork::OCV/OCL_FP16|34.947|34.925|1.00| |SFace::DNNTestNetwork::OCV/OCL_FP16|10.249|10.206|1.00| |SSD::DNNTestNetwork::OCV/OCL_FP16|213.068|213.108|1.00| |SqueezeNet_v1_1::DNNTestNetwork::OCV/OCL_FP16|4.867|4.878|1.00| |VIT_B_32::DNNTestNetwork::OCV/OCL_FP16|200.563|190.788|1.05| |VitTrack::DNNTestNetwork::OCV/OCL_FP16|7.528|7.173|1.05| |YOLOX::DNNTestNetwork::OCV/OCL_FP16|132.858|132.701|1.00| |YOLOv3::DNNTestNetwork::OCV/OCL_FP16|209.559|208.809|1.00| 
|YOLOv4::DNNTestNetwork::OCV/OCL_FP16|221.357|220.924|1.00| |YOLOv4_tiny::DNNTestNetwork::OCV/OCL_FP16|24.446|24.382|1.00| |YOLOv5::DNNTestNetwork::OCV/OCL_FP16|43.922|44.080|1.00| |YOLOv8::DNNTestNetwork::OCV/OCL_FP16|64.159|63.842|1.00| |YuNet::DNNTestNetwork::OCV/OCL_FP16|10.177|10.231|0.99| |opencv_face_detector::DNNTestNetwork::OCV/OCL_FP16|15.121|15.445|0.98| Co-authored-by: Alexander Alekhin <alexander.a.alekhin@gmail.com>
168 lines
5.5 KiB
C++
168 lines
5.5 KiB
C++
// This file is part of OpenCV project.
|
|
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
|
// of this distribution and at http://opencv.org/license.html.
|
|
|
|
// Copyright (C) 2018, Intel Corporation, all rights reserved.
|
|
// Third party copyrights are property of their respective owners.
|
|
#include "../precomp.hpp"
|
|
#include "../op_cuda.hpp"
|
|
|
|
#ifdef HAVE_CUDA
|
|
#include "../cuda4dnn/primitives/shuffle_channel.hpp"
|
|
using namespace cv::dnn::cuda4dnn;
|
|
#endif
|
|
|
|
namespace cv { namespace dnn {
|
|
|
|
// ShuffleChannel layer (as used by ShuffleNet-style architectures).
// The NCHW input is viewed as [N, group, C/group, H*W] and the two middle
// axes are transposed via an internal PermuteLayer, which interleaves the
// channel groups. With group == 1 the layer degenerates to an identity copy.
class ShuffleChannelLayerImpl CV_FINAL : public ShuffleChannelLayer
{
public:
    ShuffleChannelLayerImpl(const LayerParams& params)
    {
        // Number of channel groups to interleave; 1 (the default) is a no-op.
        group = params.get<int>("group", 1);
        setParamsFrom(params);
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA;
    }

    // Output shape is identical to the input shape (shuffle only moves data).
    // Requires a single 4D input whose channel count is divisible by `group`.
    // Returns true (in-place execution allowed) only for the identity case.
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() == 1 && inputs[0].size() == 4);
        CV_Assert(inputs[0][1] % group == 0);
        Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
        return group == 1;
    }

    // Builds the internal PermuteLayer and caches the 4D shapes used to
    // reinterpret the input/output tensors on every forward() call.
    virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
    {
        if (group != 1)
        {
            std::vector<Mat> inputs, outputs;
            inputs_arr.getMatVector(inputs);
            outputs_arr.getMatVector(outputs);

            LayerParams lp;
            // Swap axes 1 and 2 of the [N, group, C/group, H*W] view.
            // The order entries are axis indices, hence `int` (they are
            // consumed by DictValue::arrayInt).
            int order[] = {0, 2, 1, 3};
            lp.set("order", DictValue::arrayInt(&order[0], 4));
            permute = PermuteLayer::create(lp);

            const Mat& inp = inputs[0];
            const Mat& out = outputs[0];

            // Input view: [N, group, C/group, H*W].
            permuteInpShape.resize(4);
            permuteInpShape[0] = inp.size[0];
            permuteInpShape[1] = group;
            permuteInpShape[2] = inp.size[1] / group;
            permuteInpShape[3] = inp.size[2]*inp.size[3];

            // Output view: middle axes swapped -> [N, C/group, group, H*W].
            permuteOutShape.resize(4);
            permuteOutShape[0] = permuteInpShape[0];
            permuteOutShape[1] = permuteInpShape[2];
            permuteOutShape[2] = permuteInpShape[1];
            permuteOutShape[3] = permuteInpShape[3];

            std::vector<Mat> permuteInputs(1, inp.reshape(1, permuteInpShape));
            std::vector<Mat> permuteOutputs(1, out.reshape(1, permuteOutShape));
            permute->finalize(permuteInputs, permuteOutputs);
        }
    }

#ifdef HAVE_OPENCL
    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

        // Same underlying buffer means the identity (group == 1) in-place
        // case: nothing to do.
        if (inputs[0].u != outputs[0].u)
        {
            if (!permute.empty())
            {
                // Reinterpret the buffers with the cached 4D views and let
                // the permute layer perform the shuffle on the device.
                inputs[0] = inputs[0].reshape(1, permuteInpShape.size(), &permuteInpShape[0]);
                outputs[0] = outputs[0].reshape(1, permuteOutShape.size(), &permuteOutShape[0]);
                permute->preferableTarget = preferableTarget;
                permute->forward(inputs, outputs, internals);
            }
            else
                inputs[0].copyTo(outputs[0]);
        }
        return true;
    }
#endif

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        // Half-precision tensors are handled via the generic FP32 fallback.
        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs, internals;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
        internals_arr.getMatVector(internals);

        Mat inp = inputs[0];
        Mat out = outputs[0];
        // Shared data means the identity in-place case: nothing to do.
        if (inp.data != out.data)
        {
            if (!permute.empty())
            {
                inp = inp.reshape(1, permuteInpShape);
                out = out.reshape(1, permuteOutShape);
                std::vector<Mat> permuteInputs(1, inp);
                std::vector<Mat> permuteOutputs(1, out);
                permute->forward(permuteInputs, permuteOutputs, internals);
            }
            else
                inp.copyTo(out);
        }
    }

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);
        return make_cuda_node<cuda4dnn::ShuffleChannelOp>(preferableTarget, std::move(context->stream), group);
    }
#endif

    // The shuffle only rearranges data, so it is transparent to quantization:
    // no parameters need rescaling.
    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
    {
        return true;
    }

private:
    Ptr<PermuteLayer> permute;                       // performs the actual shuffle when group != 1
    std::vector<int> permuteInpShape, permuteOutShape; // cached 4D views computed in finalize()
};
|
|
|
|
// Factory: builds a ShuffleChannel layer from the given parameters.
Ptr<Layer> ShuffleChannelLayer::create(const LayerParams& params)
{
    Ptr<Layer> layer = makePtr<ShuffleChannelLayerImpl>(params);
    return layer;
}
|
|
|
|
} // namespace dnn
|
|
} // namespace cv
|