opencv/modules/dnn/src/int8layers/convolution_layer.cpp

1542 lines
74 KiB
C++
Raw Normal View History

// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "../precomp.hpp"
#include "layers_common.hpp"
#include <opencv2/core/utils/logger.hpp>
#include "opencv2/core/hal/hal.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "../op_timvx.hpp"
Merge pull request #23987 from dkurt:openvino_int8_backend OpenVINO backend for INT8 models #23987 ### Pull Request Readiness Checklist TODO: - [x] DetectionOutput layer (https://github.com/opencv/opencv/pull/24069) - [x] Less FP32 fallbacks (i.e. Sigmoid, eltwise sum) - [x] Accuracy, performance tests (https://github.com/opencv/opencv/pull/24039) - [x] Single layer tests (convolution) - [x] ~~Fixes for OpenVINO 2022.1 (https://pullrequest.opencv.org/buildbot/builders/precommit_custom_linux/builds/100334)~~ Performace results for object detection model `coco_efficientdet_lite0_v1_1.0_quant_2021_09_06.tflite`: | backend | performance (median time) | |---|---| | OpenCV | 77.42ms | | OpenVINO 2023.0 | 10.90ms | CPU: `11th Gen Intel(R) Core(TM) i5-1135G7 @ 2.40GHz` Serialized model per-layer stats (note that Convolution should use `*_I8` primitives if they are quantized correctly): https://gist.github.com/dkurt/7772bbf1907035441bb5454f19f0feef --- See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake
2023-09-28 21:24:43 +08:00
#include "../ie_ngraph.hpp"
#include <iostream>
#include <numeric>
namespace cv
{
namespace dnn
{
Merge pull request #24325 from hanliutong:rewrite Rewrite Universal Intrinsic code: float related part #24325 The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API. The series of PRs is listed below: #23885 First patch, an example #23980 Core module #24058 ImgProc module, part 1 #24132 ImgProc module, part 2 #24166 ImgProc module, part 3 #24301 Features2d and calib3d module #24324 Gapi module This patch (hopefully) is the last one in the series. This patch mainly involves 3 parts 1. Add some modifications related to float (CV_SIMD_64F) 2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`, then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD` 3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments - Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....` - Some blocks can not be rewrited directly. (Not commented in the source code, just listed here) - ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct) - ./modules/imgproc/src/color_lab.cpp (Array of vector type) - ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type) - ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`) These algorithms will need to be redesigned to accommodate scalable backends. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [ ] I agree to contribute to the project under Apache 2 License. - [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
#if CV_SIMD128
static inline void v_expand_mul_add(const v_int8x16& a, const v_int8x16& b,
v_int32x4& out0, v_int32x4& out1, v_int32x4& out2, v_int32x4& out3)
{
v_int16x8 a0, a1, b0, b1;
v_expand(a, a0, a1);
v_expand(b, b0, b1);
v_int32x4 t0, t1;
v_mul_expand(a0, b0, t0, t1);
2023-10-13 19:23:30 +08:00
out0 = v_add(out0, t0); out1 = v_add(out1, t1);
v_mul_expand(a1, b1, t0, t1);
2023-10-13 19:23:30 +08:00
out2 = v_add(out2, t0); out3 = v_add(out3, t1);
}
#endif
class BaseConvolutionLayerInt8Impl : public ConvolutionLayerInt8
{
public:
BaseConvolutionLayerInt8Impl(const LayerParams &params)
{
setParamsFrom(params);
2022-10-09 09:33:07 +08:00
getConvolutionKernelParams(params, kernel_size, pads_begin, pads_end, strides, dilations, padMode, adjust_pads, useWinograd);
numOutput = params.get<int>("num_output");
int ngroups = params.get<int>("group", 1);
CV_Assert(numOutput % ngroups == 0);
input_sc = params.get<float>("input_scale");
input_zp = params.get<int>("input_zeropoint");
output_zp = params.get<int>("zeropoints");
output_sc = params.get<float>("scales");
per_channel = params.get<bool>("per_channel", true);
if (kernel_size.size() == 2) {
kernel = Size(kernel_size[1], kernel_size[0]);
stride = Size(strides[1], strides[0]);
for (int i = 0; i < pads_begin.size(); i++) {
if (pads_begin[i] != pads_end[i])
CV_Error(Error::StsNotImplemented, "Unsupported asymmetric padding in convolution layer");
}
pad = Size(pads_begin[1], pads_begin[0]);
dilation = Size(dilations[1], dilations[0]);
adjustPad.height = adjust_pads[0];
adjustPad.width = adjust_pads[1];
}
for (int i = 0; i < adjust_pads.size(); i++) {
CV_Assert(adjust_pads[i] < strides[i]);
}
}
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
{
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
// blobs[0] - Weights (INT8)
// blobs[1] - Biases (INT32)
// blobs[2] - Multipliers for convolution output stage (FP32)
CV_Assert(!inputs.empty() && blobs.size() == 3);
MatSize weightShape = blobs[0].size;
CV_Assert(inputs[0].dims == outputs[0].dims);
if (weightShape.dims() == 3)
{
kernel_size.resize(1, kernel_size[0]);
strides.resize(1, strides[0]);
dilations.resize(1, dilations[0]);
pads_begin.resize(1, pads_begin[0]);
pads_end.resize(1, pads_end[0]);
}
CV_Assert(weightShape.dims() == kernel_size.size() + 2);
for (int i = 0; i < kernel_size.size(); i++) {
CV_Assert(weightShape[i + 2] == kernel_size[i]);
}
const Mat &input = inputs[0];
CV_Assert(((input.dims == 3 && kernel_size.size() == 1) || input.dims == 4 || input.dims == 5) && input.type() == CV_8S);
for (size_t i = 0; i < outputs.size(); i++)
{
CV_Assert(inputs[i].type() == input.type());
CV_Assert(((input.dims == 3 && kernel_size.size() == 1) || inputs[i].dims == 4 || inputs[i].dims == 5) && inputs[i].size[1] == input.size[1]);
for (int j = 0; j < inputs[i].dims; j++) {
CV_Assert(inputs[i].size[j] == input.size[j]);
}
}
std::vector<int> inpShape;
std::vector<int> outShape;
for (int i = 2; i < inputs[0].dims; i++) {
inpShape.push_back(inputs[0].size[i]);
outShape.push_back(outputs[0].size[i]);
}
getConvPoolPaddings(inpShape, kernel_size, strides, padMode, pads_begin, pads_end);
if (pads_begin.size() == 2) {
for (int i = 0; i < pads_begin.size(); i++) {
if (pads_begin[i] != pads_end[i])
CV_Error(Error::StsNotImplemented, "Unsupported asymmetric padding in convolution layer");
}
pad = Size(pads_begin[1], pads_begin[0]);
}
}
virtual MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const = 0;
bool is1x1() const
{
return (kernel.height == 1 && kernel.width == 1) &&
(stride.height == 1 && stride.width == 1) &&
(dilation.height == 1 && dilation.width == 1);
}
virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
{
Mat w, b;
top->getScaleShift(w, b);
if (w.empty() && b.empty())
return false;
CV_Assert((w.empty() || w.type() == CV_32F) &&
(b.empty() || b.type() == CV_32F));
float new_sc;
int new_zp;
top->getScaleZeropoint(new_sc, new_zp);
fuseWeights(w, b, new_sc);
output_sc = new_sc;
output_zp = new_zp;
return true;
}
virtual void fuseWeights(const Mat& w_, const Mat& b_, const float& new_sc) = 0;
};
//TODO: simultaneously convolution and bias addition for cache optimization
class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
{
public:
enum { VEC_ALIGN = 32, DFT_TYPE = CV_8S };
Mat weightsMat;
std::vector<int> biasvec;
std::vector<float> outputMultiplier;
Mat activationLUT;
Ptr<ActivationLayerInt8> activ;
ConvolutionLayerInt8Impl(const LayerParams &params) : BaseConvolutionLayerInt8Impl(params){}
MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const CV_OVERRIDE
{
CV_Assert(!blobs.empty());
int dims = inpShape.size();
int inpD = dims == 5 ? inpShape[2] : 1;
int inpH = inpShape[dims - 2];
int inpW = inpShape.back();
int inpGroupCn = blobs[0].size[1];
int ksize = inpGroupCn * std::accumulate(kernel_size.begin(), kernel_size.end(),
1, std::multiplies<size_t>());
return shape(inpD * inpH * inpW, ksize);
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
size_t ksize = kernel_size.size();
#ifdef HAVE_TIMVX
if (backendId == DNN_BACKEND_TIMVX)
{
/* only Conv1d and Conv2d supported. */
if (ksize == 2 || ksize == 1)
return true;
return false;
}
#endif
// Only default backend and Conv1D/Conv2D/Conv3D are supported
Merge pull request #23987 from dkurt:openvino_int8_backend OpenVINO backend for INT8 models #23987 ### Pull Request Readiness Checklist TODO: - [x] DetectionOutput layer (https://github.com/opencv/opencv/pull/24069) - [x] Less FP32 fallbacks (i.e. Sigmoid, eltwise sum) - [x] Accuracy, performance tests (https://github.com/opencv/opencv/pull/24039) - [x] Single layer tests (convolution) - [x] ~~Fixes for OpenVINO 2022.1 (https://pullrequest.opencv.org/buildbot/builders/precommit_custom_linux/builds/100334)~~ Performace results for object detection model `coco_efficientdet_lite0_v1_1.0_quant_2021_09_06.tflite`: | backend | performance (median time) | |---|---| | OpenCV | 77.42ms | | OpenVINO 2023.0 | 10.90ms | CPU: `11th Gen Intel(R) Core(TM) i5-1135G7 @ 2.40GHz` Serialized model per-layer stats (note that Convolution should use `*_I8` primitives if they are quantized correctly): https://gist.github.com/dkurt/7772bbf1907035441bb5454f19f0feef --- See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake
2023-09-28 21:24:43 +08:00
return (backendId == DNN_BACKEND_OPENCV && ksize >= 1 && ksize <= 3) ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(!blobs.empty());
const int* weightShape = blobs[0].size.p;
CV_Assert(blobs[1].total() == (size_t)weightShape[0]);
internals.clear();
CV_Assert(inputs.size() != 0);
std::vector<int> inpShape(inputs[0].begin() + 2, inputs[0].end());
int outCn = weightShape[0];
std::vector<int> outShape;
outShape.push_back(inputs[0][0]);
outShape.push_back(outCn);
int inpCn = inputs[0][1];
if (padMode.empty())
{
for (int i = 0; i < inpShape.size(); i++)
outShape.push_back((inpShape[i] + pads_begin[i] + pads_end[i] - dilations[i] * (kernel_size[i] - 1) - 1) / strides[i] + 1);
}
else
{
getConvPoolOutParams(inpShape, kernel_size, strides, padMode, dilations, outShape);
}
int ngroups = inpCn / weightShape[1];
if (ngroups == 0 || ngroups * weightShape[1] != inpCn)
CV_Error(Error::StsError, format("Number of input channels should "
"be multiple of %d but got %d", weightShape[1], inpCn));
CV_Assert(ngroups > 0 && inpCn % ngroups == 0 && outCn % ngroups == 0);
outputs.resize(1, outShape);
return false;
}
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
{
BaseConvolutionLayerInt8Impl::finalize(inputs_arr, outputs_arr);
std::vector<Mat> inputs;
inputs_arr.getMatVector(inputs);
// prepare weightsMat where each row is aligned and has enough zero padding on the right to
// use vectorized (i.e. with intrinsics) loops without tail processing
Mat wm = blobs[0].reshape(1, numOutput);
if( wm.step1() % VEC_ALIGN != 0 )
{
int newcols = (int)alignSize(wm.step1(), VEC_ALIGN);
Mat wm_buffer = Mat(numOutput, newcols, wm.type());
Mat wm_padding = wm_buffer.colRange(wm.cols, newcols);
wm_padding.setTo(Scalar::all(0));
Mat wm_aligned = wm_buffer.colRange(0, wm.cols);
wm.copyTo(wm_aligned);
wm = wm_aligned;
}
weightsMat = wm;
Mat biasMat = blobs[1];
biasvec.resize(numOutput+2);
Mat outMult = blobs[2];
outputMultiplier.resize(numOutput+2);
for(int i = 0; i < numOutput; i++ )
{
biasvec[i] = biasMat.at<int>(i);
outputMultiplier[i] = outMult.at<float>(i);
}
}
bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
{
// TODO! add activation in convolution.
#ifdef HAVE_TIMVX
if (preferableTarget == DNN_TARGET_NPU)
return false;
#endif
Ptr<ActivationLayerInt8> activ_int8 = layer.dynamicCast<ActivationLayerInt8>();
if (!activ_int8.empty())
{
activ = activ_int8;
if (!activ_int8->blobs.empty())
activ_int8->blobs[0].convertTo(activationLUT, CV_32S);
return true;
}
return false;
}
virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
{
return BaseConvolutionLayerInt8Impl::tryFuse(top);
}
void fuseWeights(const Mat& w_, const Mat& b_, const float& new_sc) CV_OVERRIDE
{
const int outCn = weightsMat.size[0];
Mat w = w_.total() == 1 ? Mat(1, outCn, CV_32F, Scalar(w_.at<float>(0))) : w_;
Mat b = b_.total() == 1 ? Mat(1, outCn, CV_32F, Scalar(b_.at<float>(0))) : b_;
CV_Assert_N(!weightsMat.empty(), biasvec.size() == outCn + 2,
w.empty() || outCn == w.total(), b.empty() || outCn == b.total());
for (int i = 0; i < outCn; ++i)
{
float off = outputMultiplier[i] * output_sc;
if (!w.empty())
off *= w.at<float>(i);
if (!b.empty())
biasvec[i] += (int)std::round(b.at<float>(i)/off);
outputMultiplier[i] = off/new_sc;
}
biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1];
outputMultiplier[outCn] = outputMultiplier[outCn+1] = outputMultiplier[outCn-1];
}
virtual Ptr<BackendNode> initTimVX(void* timVXInfo_,
const std::vector<Ptr<BackendWrapper> > &inputsWrapper,
const std::vector<Ptr<BackendWrapper> > &outputsWrapper,
bool isLast) CV_OVERRIDE
{
#ifdef HAVE_TIMVX
/* TODO :support GroupConv;
Ref:
https://github.com/VeriSilicon/TIM-VX/blob/main/docs/Operators.md#conv2d
Link Reference: https://github.com/VeriSilicon/TIM-VX/blob/main/src/tim/vx/ops/conv1d_test.cc
*/
// tvGraph Initialization.
auto timVxInfo = reinterpret_cast<TimVXInfo *>(timVXInfo_);
CV_Assert(timVxInfo);
Ptr<TimVXGraph> tvGraph = timVxInfo->getGraph();
CV_Assert(tvGraph);
Ptr<tim::vx::Graph> graph = tvGraph->graph;
Mat tvWeightMat = blobs[0];
std::vector<int> tvBiasVec;
tvBiasVec.assign(biasvec.begin(), biasvec.end() - 2);
Mat tvBiasMat(tvBiasVec);
for (int i = 0; i < numOutput; i++)
{
tvBiasVec[i] += input_zp * (cv::sum(blobs[0].row(i))[0]);
}
// Padding Type
tim::vx::PadType tvPadType;
if (padMode.empty())
{
tvPadType = tim::vx::PadType::AUTO; // TODO! check the padding type.
}
else if(padMode == "VALID")
{
tvPadType = tim::vx::PadType::VALID;
}
else if (padMode == "SAME")
{
tvPadType = tim::vx::PadType::SAME;
}
else
{
CV_Error(Error::StsError, "Unsupported padding mode in TimVXBackend!");
}
size_t ksize = kernel_size.size();
std::vector<int> inputsIndex;
std::vector<int> outputsIndex;
CV_Assert(inputsWrapper.size() == 1);
CV_Assert(ksize == 2 || ksize == 1);
std::vector<float> weight_scs, bias_scs;
std::vector<int32_t> weight_zps, bias_zps;
weight_scs.resize(numOutput);
bias_scs.resize(numOutput);
for (int i = 0; i < numOutput; i++)
{
bias_scs[i] = outputMultiplier[i] * output_sc;
weight_scs[i] = bias_scs[i] / input_sc;
}
weight_zps.assign(numOutput, 0);
bias_zps.assign(numOutput, 0);
bool tvSymmetric;
tvSymmetric = getQuantType(weight_scs, numOutput);
// input Tensor
auto inputWrapper = inputsWrapper[0].dynamicCast<TimVXBackendWrapper>();
int input_index = -1, weight_index = -1, bias_index = -1, output_index = -1;
if (inputWrapper->isTensor())
{
input_index = tvGraph->getTensorIndex(inputWrapper->getTensor());
if (input_index == -1)
{
// Copy To New inputWrapper
Mat tmp = inputWrapper->getMat();
inputWrapper = Ptr<TimVXBackendWrapper>(new TimVXBackendWrapper(tmp));
}
}
if (!inputWrapper->isTensor())
{
Ptr<tim::vx::Quantization> tvInputQuant = Ptr<tim::vx::Quantization>(
new tim::vx::Quantization(tim::vx::QuantType::ASYMMETRIC, input_sc, input_zp));
inputWrapper->createTensor(graph, tim::vx::TensorAttribute::INPUT, tvInputQuant);
input_index = tvGraph->addWrapper(inputWrapper);
}
inputsIndex.push_back(input_index);
// weight Tensor
auto tvConvWeightShape = shape(tvWeightMat);
Mat tvInputMat = inputWrapper->getMat();
// calculate group value.
int group = tvInputMat.size[1] / tvWeightMat.size[1];
// TODO! It will be supported in future.
if (tvSymmetric && tvWeightMat.total() == tvConvWeightShape[0])
return Ptr<TimVXBackendNode>();
// Reverse weight shape From OpenCV NCHW to TimVX WHCN.
std::reverse(tvConvWeightShape.begin(), tvConvWeightShape.end());
Ptr<TimVXBackendWrapper> weightWrapper = Ptr<TimVXBackendWrapper>(new TimVXBackendWrapper(tvWeightMat));
Ptr<tim::vx::Quantization> weightQuant;
if (tvSymmetric)
{
int wtChanneldim = tvWeightMat.dims - 1;
weightQuant = Ptr<tim::vx::Quantization>(
new tim::vx::Quantization(tim::vx::QuantType::SYMMETRIC_PER_CHANNEL, wtChanneldim,
weight_scs, weight_zps));
}
else
{
weightQuant = Ptr<tim::vx::Quantization>(
new tim::vx::Quantization(tim::vx::QuantType::ASYMMETRIC, weight_scs[0], 0));
}
weightWrapper->createTensor(graph,tim::vx::TensorAttribute::CONSTANT, weightQuant);
weight_index = tvGraph->addWrapper(weightWrapper);
inputsIndex.push_back(weight_index);
// Bias Tensor
Ptr<TimVXBackendWrapper> biasWrapper = Ptr<TimVXBackendWrapper>(new TimVXBackendWrapper(tvBiasMat));
Ptr<tim::vx::Quantization> biasQuant;
if (tvSymmetric)
{
biasQuant = Ptr<tim::vx::Quantization>(
new tim::vx::Quantization(tim::vx::QuantType::SYMMETRIC_PER_CHANNEL, 0,
bias_scs, bias_zps));
}
else
{
biasQuant = Ptr<tim::vx::Quantization>(
new tim::vx::Quantization(tim::vx::QuantType::ASYMMETRIC, weight_scs[0] * input_sc, 0));
}
biasWrapper->createTensor(graph, tim::vx::TensorAttribute::CONSTANT, biasQuant);
bias_index = tvGraph->addWrapper(biasWrapper);
inputsIndex.push_back(bias_index);
// Output tensor
CV_Assert(outputsWrapper.size() == 1);
auto outputWrapper = outputsWrapper[0].dynamicCast<TimVXBackendWrapper>();
Ptr<tim::vx::Quantization> outputQuant = Ptr<tim::vx::Quantization>(
new tim::vx::Quantization(tim::vx::QuantType::ASYMMETRIC, output_sc, output_zp));
if (isLast)
{
// From OpenCV NCHW, to TimVX WHCN
auto shapeType = getShapeTypeFromMat(outputWrapper->getMat());
// For Graph Output tensor, we need to set tensor shape before createTensor().
outputWrapper->setTensorShape(shapeType);
outputWrapper->createTensor(graph, tim::vx::TensorAttribute::OUTPUT, outputQuant);
}
else
{
outputWrapper->createTensor(graph, tim::vx::TensorAttribute::TRANSIENT, outputQuant);
}
output_index = tvGraph->addWrapper(outputWrapper);
outputsIndex.push_back(output_index);
std::shared_ptr<tim::vx::Operation> tvConv;
if (ksize == 2) // for conv2d
{
int multiplier = 0;
if(group == tvConvWeightShape[3] && group != 1)
multiplier = 1;
if (group == 1 || (group == tvConvWeightShape[3] && group != 1)) // Conv2D || DeConv2D
{
if (tvPadType == tim::vx::PadType::AUTO) {
tvConv = graph->CreateOperation<tim::vx::ops::Conv2d>(
tvConvWeightShape[3], tvPadType,
std::array<uint32_t, 2>({(uint32_t) kernel_size[1], (uint32_t) kernel_size[0]}),
std::array<uint32_t, 2>({(uint32_t) strides[1], (uint32_t) strides[0]}),
std::array<uint32_t, 2>({(uint32_t) dilations[1], (uint32_t) dilations[0]}),
std::array<uint32_t, 4>({(uint32_t) pads_begin[1], (uint32_t) pads_end[1],
(uint32_t) pads_begin[0], (uint32_t) pads_end[0]}),
multiplier);
}
else
{
tvConv = graph->CreateOperation<tim::vx::ops::Conv2d>(
tvPadType,
std::array<uint32_t, 2>({(uint32_t) strides[1], (uint32_t) strides[0]}),
std::array<uint32_t, 2>({(uint32_t) dilations[1], (uint32_t) dilations[0]}),
multiplier);
}
}
else
{
// GroupedConv2d
if (tvPadType == tim::vx::PadType::AUTO)
{
tvConv = graph->CreateOperation<tim::vx::ops::GroupedConv2d>(
std::array<uint32_t, 4>({(uint32_t) pads_begin[1], (uint32_t) pads_end[1],
(uint32_t) pads_begin[0], (uint32_t) pads_end[0]}),
std::array<uint32_t, 2>({(uint32_t)strides[1], (uint32_t)strides[0]}),
std::array<uint32_t, 2>({(uint32_t)dilations[1], (uint32_t)dilations[0]}),
group);
}
else
{
tvConv = graph->CreateOperation<tim::vx::ops::GroupedConv2d>(
tvPadType,
std::array<uint32_t, 2>({(uint32_t)strides[1], (uint32_t)strides[0]}),
std::array<uint32_t, 2>({(uint32_t)dilations[1], (uint32_t)dilations[0]}),
group);
}
}
}
else
{
// for Conv1d
if (group != 1)
CV_Error( CV_StsNotImplemented, " Grouped Conv1d or Depth-Wise Conv1d are not supported by "
"TimVX Backend. Please try OpenCV Backend.");
tvConv = graph->CreateOperation<tim::vx::ops::Conv1d>(
tvConvWeightShape[2], tvPadType, (uint32_t)kernel_size[0],
(uint32_t)strides[0],(uint32_t)dilations[0],
std::array<uint32_t, 2>({(uint32_t)pads_begin[0], (uint32_t)pads_end[0]}));
}
// Create TimVXBackendNode
Ptr<TimVXBackendNode> tvBackendNode = new TimVXBackendNode(tvGraph, tvConv, inputsIndex, outputsIndex);
return tvBackendNode;
#endif // HAVE_TIMVX
return Ptr<BackendNode>();
}
Merge pull request #23987 from dkurt:openvino_int8_backend OpenVINO backend for INT8 models #23987 ### Pull Request Readiness Checklist TODO: - [x] DetectionOutput layer (https://github.com/opencv/opencv/pull/24069) - [x] Less FP32 fallbacks (i.e. Sigmoid, eltwise sum) - [x] Accuracy, performance tests (https://github.com/opencv/opencv/pull/24039) - [x] Single layer tests (convolution) - [x] ~~Fixes for OpenVINO 2022.1 (https://pullrequest.opencv.org/buildbot/builders/precommit_custom_linux/builds/100334)~~ Performace results for object detection model `coco_efficientdet_lite0_v1_1.0_quant_2021_09_06.tflite`: | backend | performance (median time) | |---|---| | OpenCV | 77.42ms | | OpenVINO 2023.0 | 10.90ms | CPU: `11th Gen Intel(R) Core(TM) i5-1135G7 @ 2.40GHz` Serialized model per-layer stats (note that Convolution should use `*_I8` primitives if they are quantized correctly): https://gist.github.com/dkurt/7772bbf1907035441bb5454f19f0feef --- See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake
2023-09-28 21:24:43 +08:00
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
CV_Assert(!blobs.empty());
CV_Assert_N(inputs.size() >= 1, nodes.size() >= 1);
CV_CheckTypeEQ(weightsMat.type(), CV_8S, "");
auto ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
std::vector<size_t> dims = ieInpNode.get_shape();
CV_Check(dims.size(), dims.size() >= 3 && dims.size() <= 5, "");
CV_Assert(ieInpNode.get_element_type() == ngraph::element::f32);
ngraph::Output<ngraph::Node> ieWeights;
if (nodes.size() > 1)
ieWeights = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
const int inpCn = dims[1];
const int inpGroupCn = nodes.size() > 1 ? ieWeights.get_shape()[1] : blobs[0].size[1];
const int group = inpCn / inpGroupCn;
std::vector<size_t> kernel_shape;
if (group != 1)
{
kernel_shape.push_back(group);
}
kernel_shape.push_back(numOutput / group);
kernel_shape.push_back(inpCn / group);
std::copy(kernel_size.begin(), kernel_size.end(), back_inserter(kernel_shape));
if (nodes.size() == 1)
{
ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::i8, kernel_shape, blobs[0].data);
}
else
{
auto shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
ngraph::Shape{kernel_shape.size()}, std::vector<int64_t>(kernel_shape.begin(), kernel_shape.end()));
ieWeights = std::make_shared<ngraph::op::v1::Reshape>(ieWeights, shape, true);
}
ngraph::op::PadType pad_type = ngraph::op::PadType::EXPLICIT;
if (!padMode.empty())
pad_type = padMode == "VALID" ? ngraph::op::PadType::VALID : ngraph::op::PadType::SAME_UPPER;
ieInpNode = ngraphDequantize(ieInpNode, input_sc, input_zp);
const float low = -128, high = 127;
std::vector<float> inpLows(numOutput, low);
std::vector<float> inpHighs(numOutput, high);
std::vector<float> outLows(numOutput);
std::vector<float> outHighs(numOutput);
std::vector<size_t> quantShape(kernel_shape.size(), 1);
if (group != 1)
{
quantShape[0] = group;
quantShape[1] = numOutput / group;
}
else
{
quantShape[0] = numOutput;
}
for (int i = 0; i < numOutput; ++i) {
outLows[i] = low * outputMultiplier[i] * output_sc / input_sc;
outHighs[i] = high * outputMultiplier[i] * output_sc / input_sc;
}
ieWeights = std::make_shared<ngraph::op::Convert>(ieWeights, ngraph::element::f32);
ieWeights = std::make_shared<ngraph::op::FakeQuantize>(ieWeights,
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, quantShape, inpLows.data()),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, quantShape, inpHighs.data()),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, quantShape, outLows.data()),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, quantShape, outHighs.data()),
256 // levels
);
ngraph::Output<ngraph::Node> conv_node;
if (group != 1) {
conv_node = std::make_shared<ngraph::op::v1::GroupConvolution>(
ieInpNode, ieWeights,
ngraph::Strides(strides),
ngraph::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_begin.begin(), pads_begin.end())),
ngraph::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_end.begin(), pads_end.end())),
ngraph::Strides(dilations),
pad_type);
} else {
conv_node = std::make_shared<ngraph::op::v1::Convolution>(
ieInpNode, ieWeights,
ngraph::Strides(strides),
ngraph::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_begin.begin(), pads_begin.end())),
ngraph::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_end.begin(), pads_end.end())),
ngraph::Strides(dilations),
pad_type);
}
std::vector<size_t> shape(conv_node.get_shape().size(), 1);
shape[1] = conv_node.get_shape()[1];
if (biasvec.size() || nodes.size() == 3)
{
std::shared_ptr<ngraph::Node> bias;
if (nodes.size() == 3)
{
auto bias_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
ngraph::Shape{shape.size()}, std::vector<int64_t>(shape.begin(), shape.end()));
bias = std::make_shared<ngraph::op::v1::Reshape>(nodes[2].dynamicCast<InfEngineNgraphNode>()->node, bias_shape, true);
}
else
{
std::vector<float> ovBias(numOutput);
for (int i = 0; i < numOutput; ++i) {
ovBias[i] = (biasvec[i] + input_zp * cv::sum(blobs[0].row(i))[0]) * outputMultiplier[i] * output_sc;
}
bias = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), ovBias.data());
}
conv_node = std::make_shared<ngraph::op::v1::Add>(conv_node, bias, ngraph::op::AutoBroadcastType::NUMPY);
}
conv_node = ngraphQuantize(conv_node, output_sc, output_zp);
return new InfEngineNgraphNode(conv_node);
}
#endif // HAVE_DNN_NGRAPH
class ParallelConv : public cv::ParallelLoopBody
{
public:
enum { BLK_SIZE = 32, BLK_SIZE_CN = 64 };
const Mat* input_;
const Mat* weights_;
Mat* output_;
int outShape[4]; // used only for conv2d
std::vector<size_t> kernel_size, pads_begin, pads_end, strides, dilations;
int ngroups_, nstripes_;
std::vector<int> ofstab_;
const std::vector<int>* biasvec_;
const Mat* activLUT_;
const ActivationLayerInt8* activ_;
bool is1x1_;
bool useAVX2;
bool useAVX512;
bool useLASX;
int blk_size_cn;
int inpZp, outZp;
const std::vector<float>* multiplier;
ParallelConv()
: input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false), useLASX(false)
, blk_size_cn(0), inpZp(0), outZp(0), multiplier(0)
{}
static void run( const Mat& input, Mat& output, const Mat& weights, const std::vector<float>& multipliers,
const std::vector<int>& biasvec, const Mat& activLUT,
const std::vector<size_t>& kernel_size, const std::vector<size_t>& strides,
const std::vector<size_t>& pads_begin, const std::vector<size_t>& pads_end,
const std::vector<size_t>& dilations,
const ActivationLayerInt8* activ, int ngroups, int nstripes, int inp_Zp, int out_Zp)
{
size_t karea = std::accumulate(kernel_size.begin(), kernel_size.end(),
1, std::multiplies<size_t>());
bool isConv1D = input.dims == 3;
bool isConv2D = input.dims == 4;
bool isConv3D = input.dims == 5;
CV_CheckEQ(static_cast<int>(kernel_size.size()), input.dims - 2, "");
CV_Assert_N(input.dims == output.dims,
input.size[0] == output.size[0],
weights.rows == output.size[1],
weights.cols == (input.size[1]/ngroups)*karea,
input.type() == CV_8SC1,
output.type() == CV_32SC1,
input.type() == weights.type(),
input.isContinuous(),
output.isContinuous(),
biasvec.size() == (size_t)output.size[1]+2);
CV_Check(weights.step1(), weights.step1() % VEC_ALIGN == 0, "");
ParallelConv p;
p.input_ = &input;
p.weights_ = &weights;
p.output_ = &output;
int max_ind = isConv1D? 3: 4;
for( int i = 0; i < max_ind; i++ ) p.outShape[i] = output.size[i];
p.outShape[1] /= ngroups;
p.kernel_size = kernel_size; p.strides = strides; p.dilations = dilations;
p.pads_begin = pads_begin; p.pads_end = pads_end;
p.ngroups_ = ngroups;
p.nstripes_ = nstripes;
int inpCnAll = input.size[1];
int depth = (input.dims == 5) ? input.size[2] : 1;
int width = input.size[input.dims - 1];
int height = isConv1D? 1 : input.size[input.dims - 2];
int inpCn = inpCnAll / ngroups;
p.is1x1_ = (isConv2D && kernel_size[0] == 1 && kernel_size[1] == 1 &&
pads_begin[0] == 0 && pads_begin[1] == 0) ||
(isConv1D && pads_begin[0] == 0 && kernel_size[0] == 1);
p.useAVX2 = checkHardwareSupport(CPU_AVX2) && isConv2D;
p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D;
p.useLASX = checkHardwareSupport(CPU_LASX) && isConv2D;
int kernel_d = isConv3D? kernel_size[0] : 1;
int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
int kernel_w = kernel_size.back();
int blk_size_cn0 = cvCeil(1600./(kernel_w*kernel_h));
int ncn = 32;
while (ncn*2 < blk_size_cn0 && ncn < inpCn)
ncn *= 2;
ncn = std::min(ncn, inpCn);
p.blk_size_cn = ncn;
int dil_d = isConv3D? dilations[0] : 1;
int dil_h = isConv1D? 1 : dilations[dilations.size() - 2];
int dil_w = dilations.back();
p.inpZp = inp_Zp;
p.outZp = out_Zp;
p.multiplier = &multipliers;
p.ofstab_.resize(karea * ncn);
int* ofstab = &p.ofstab_[0];
if (isConv1D)
{
for( int k = 0; k < ncn; k++ )
for( int k_c = 0; k_c < kernel_w; k_c++ )
ofstab[k*kernel_w + k_c] = k*width + k_c*dil_w;
}
else if (isConv2D)
{
for( int k = 0; k < ncn; k++ )
for( int k_r = 0; k_r < kernel_h; k_r++ )
for( int k_c = 0; k_c < kernel_w; k_c++ )
ofstab[(k*kernel_h + k_r)*kernel_w + k_c] =
(k*height + k_r*dil_h)*width + k_c*dil_w;
}
else
{
for( int k = 0; k < ncn; k++ )
for (int k_d = 0; k_d < kernel_d; k_d++)
for( int k_r = 0; k_r < kernel_h; k_r++ )
for( int k_c = 0; k_c < kernel_w; k_c++ )
ofstab[(k*kernel_d*kernel_h + k_d*kernel_h + k_r)*kernel_w + k_c] =
(k*depth*height + k_d*dil_d*height + k_r*dil_h)*width + k_c*dil_w;
}
p.biasvec_ = &biasvec;
p.activLUT_ = &activLUT;
p.activ_ = !activLUT.empty() ? activ : 0;
parallel_for_(Range(0, nstripes), p, nstripes);
}
virtual void operator ()(const Range &r0) const CV_OVERRIDE
{
const int valign = ConvolutionLayerInt8Impl::VEC_ALIGN;
int ngroups = ngroups_, batchSize = input_->size[0]*ngroups;
bool isConv1D = input_->dims == 3;
bool isConv2D = input_->dims == 4;
bool isConv3D = input_->dims == 5;
int outW = output_->size[output_->dims - 1];
int outH = isConv1D? 1 : output_->size[output_->dims - 2];
int outCn = output_->size[1]/ngroups;
int depth = isConv3D? input_->size[2] : 1;
int height = isConv1D? 1 : input_->size[input_->dims - 2];
int width = input_->size[input_->dims - 1];
int inpCn = input_->size[1]/ngroups;
const int nstripes = nstripes_;
int kernel_d = isConv3D? kernel_size[0] : 1;
int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
int kernel_w = kernel_size.back();
int karea = kernel_w*kernel_h*kernel_d;
int pad_d = isConv3D? pads_begin[0] : 0;
int pad_t = isConv1D? 0 : pads_begin[pads_begin.size() - 2];
int pad_l = pads_begin.back();
int stride_d = isConv3D? strides[0] : 0;
int stride_h = isConv1D? 0 : strides[strides.size() - 2];
int stride_w = strides.back();
int dilation_d = isConv3D? dilations[0] : 1;
int dilation_h = isConv1D? 1 : dilations[dilations.size() - 2];
int dilation_w = dilations.back();
int i, j, k, d;
int inpPlaneSize = (int)input_->total(2);
int outPlaneSize = (int)output_->total(2);
bool is1x1 = is1x1_;
int stripesPerSample;
int stripeSize;
Range r = r0;
bool depthWiseConvolution = !is1x1 && isConv2D && ngroups > 1 && inpCn == 1 &&
outCn == 1 && kernel_d == 1 && dilation_d == 1 && stride_d == 0 && pad_d == 0 &&
width >= 16 + dilation_w*(kernel_w - 1);
// for now only 3x3 depth-wise convolutions are supported
depthWiseConvolution = depthWiseConvolution && kernel_w == 3 && kernel_h == 3 &&
// computing at most 1 pixel from each side can involve padding
max(stride_w, dilation_w) >= pad_l && max(stride_h, dilation_h) >= pad_t &&
pad_l <= 1 && pad_t <= 1;
if( !depthWiseConvolution && nstripes >= batchSize*2 )
{
stripesPerSample = nstripes/batchSize;
stripeSize = (int)alignSize((outPlaneSize + stripesPerSample - 1)/stripesPerSample, 8);
stripeSize = std::min(stripeSize, outPlaneSize);
}
else
{
stripesPerSample = 1;
int samplesPerStripe = std::max((batchSize + nstripes - 1)/nstripes, 1);
r.start *= samplesPerStripe;
r.end *= samplesPerStripe;
stripeSize = outPlaneSize;
}
const int8_t* data_inp0_ = input_->ptr<int8_t>();
const int* ofstab = &ofstab_[0];
const int8_t* wptr_orig_ = weights_->ptr<int8_t>();
size_t wstep = weights_->step1();
const int* biasptr_ = &biasvec_->at(0);
const float* multptr_ = &multiplier->at(0);
const int* lutptr_ = !activLUT_->empty() ? activLUT_->ptr<int>() : 0;
int* data_out0_ = output_->ptr<int>();
AutoBuffer<int8_t> rowbuf0_;
int8_t* rowbuf0 = 0;
bool use_rowbuf = !depthWiseConvolution;
int blk_size = depthWiseConvolution ? outPlaneSize : min((int)BLK_SIZE, stripeSize);
// im2row buffer is not used for depth-wise convolution
if(use_rowbuf)
{
size_t rowbufsz = alignSize(karea*blk_size_cn, valign)*min((int)BLK_SIZE, blk_size);
//printf("karea=%d, blk_size_cn=%d, rowbufsz=%d, stripeSize=%d\n", karea, blk_size_cn, (int)rowbufsz, stripeSize);
rowbuf0_.allocate(rowbufsz + valign);
rowbuf0 = alignPtr(rowbuf0_.data(), (int)(valign*sizeof(int8_t)));
// we clear the buffer once; ultimately, it lets us to avoid
// tail processing after running the unrolled/vectorized loop.
// the main idea is to make sure that the tail (a.k.a. padding) of each row
// (i.e. the elements with indices between vsz=karea*ncn and vsz_a)
// does not contain NaNs or Infs. Because the padding in the weights
// matrix is explicitly initialized with 0's, we handle all other
// cases nicely, i.e. we can skip expliciting re-initialization
// of the padding - we just retain elements from the previous iteration
// of the loop over channels (cn0).
memset(rowbuf0, (int8_t)inpZp, rowbufsz*sizeof(rowbuf0[0]) );
}
for( int stripe = r.start; stripe < r.end; stripe++ )
{
int subsampleIdx = stripe/stripesPerSample;
if( subsampleIdx >= batchSize )
break;
int stripeStart = (int)((stripe - subsampleIdx*stripesPerSample)*stripeSize);
int stripeEnd = (int)std::min(stripeStart + stripeSize, outPlaneSize);
const int8_t* data_inp0 = data_inp0_ + subsampleIdx*inpPlaneSize*inpCn;
int* data_out0 = data_out0_ + subsampleIdx*outPlaneSize*outCn;
int startOutCn = (subsampleIdx % ngroups)*outCn;
const int8_t* wptr_orig = wptr_orig_ + wstep*startOutCn;
const int* biasptr = biasptr_ + startOutCn;
const float* multptr = multptr_ + startOutCn;
for( int cn0 = 0; cn0 < inpCn; cn0 += blk_size_cn )
{
int cn1 = std::min(cn0 + blk_size_cn, inpCn);
int ncn = cn1 - cn0, vsz = karea*ncn;
int vsz_a = (int)alignSize(vsz, valign);
const int8_t* wptr = wptr_orig + cn0*karea;
for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += blk_size )
{
int ofs, ofs1 = std::min(ofs0 + blk_size, stripeEnd);
int bsz = ofs1 - ofs0;
int out_d = ofs0 / (outH * outW);
int out_i = (ofs0 - out_d * outH * outW) / outW;
int out_j = ofs0 % outW;
if (depthWiseConvolution)
{
CV_Assert(out_i == 0 && out_j == 0);
int in_d = out_d * stride_d - pad_d;
const int8_t* inptr_ = data_inp0 + (cn0*depth*height + in_d*height)*width;
int* outptr_ = data_out0 + ofs0;
#if CV_TRY_AVX2
if(useAVX2)
opt_AVX2::fastDepthwiseConv(wptr, kernel_h, kernel_w,
stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
else
#endif
#if CV_TRY_LASX
if(useLASX)
opt_LASX::fastDepthwiseConv(wptr, kernel_h, kernel_w,
stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
else
#endif
{
const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
int bias = biasptr[out_d], biasCopy;
float mult = multptr[out_d];
for (int out_i = 0; out_i < outH; out_i++)
{
int in_i = out_i * stride_h - pad_t, out_j = 0;
const int8_t* imgptr0 = inptr_ + in_i*width;
const int8_t* imgptr1 = imgptr0 + dilation_h*width;
const int8_t* imgptr2 = imgptr0 + (dilation_h*2)*width;
int8_t w00 = w00_, w01 = w01_, w02 = w02_;
int8_t w20 = w20_, w21 = w21_, w22 = w22_;
int out, out1;
// Bias has a fused offset component. bias = bias_quantized - input_zeropoint*sum_of_weights.
// In some cases below, certain weights are not used for convolution or set to zero.
// So we create a copy of bias at the start and remove the weight's components as necessary.
biasCopy = bias;
if (in_i < 0)
{
biasCopy += inpZp * (w00 + w01 + w02);
w00 = w01 = w02 = 0;
imgptr0 = imgptr1;
}
else if (in_i + dilation_h*(kernel_h-1) >= height)
{
biasCopy += inpZp * (w20 + w21 + w22);
w20 = w21 = w22 = 0;
imgptr2 = imgptr1;
}
int* outptr = outptr_ + out_i*outW;
if (pad_l > 0)
{
out = (int)imgptr0[0]*w01 + (int)imgptr0[dilation_w]*w02 +
(int)imgptr1[0]*w11 + (int)imgptr1[dilation_w]*w12 +
(int)imgptr2[0]*w21 + (int)imgptr2[dilation_w]*w22 +
biasCopy + inpZp*(w00 + w10 + w20);
out1 = outZp + (int)std::round(out*mult);
outptr[0] = std::min(std::max(out1, -128), 127);
out_j = 1;
}
Merge pull request #24325 from hanliutong:rewrite Rewrite Universal Intrinsic code: float related part #24325 The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API. The series of PRs is listed below: #23885 First patch, an example #23980 Core module #24058 ImgProc module, part 1 #24132 ImgProc module, part 2 #24166 ImgProc module, part 3 #24301 Features2d and calib3d module #24324 Gapi module This patch (hopefully) is the last one in the series. This patch mainly involves 3 parts 1. Add some modifications related to float (CV_SIMD_64F) 2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`, then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD` 3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments - Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....` - Some blocks can not be rewrited directly. (Not commented in the source code, just listed here) - ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct) - ./modules/imgproc/src/color_lab.cpp (Array of vector type) - ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type) - ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`) These algorithms will need to be redesigned to accommodate scalable backends. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [ ] I agree to contribute to the project under Apache 2 License. - [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
#if CV_SIMD128
if( stride_w == 1 )
{
const int out_delta = 16;
v_int8x16 vw00 = v_setall_s8(w00), vw01 = v_setall_s8(w01), vw02 = v_setall_s8(w02),
vw10 = v_setall_s8(w10), vw11 = v_setall_s8(w11), vw12 = v_setall_s8(w12),
vw20 = v_setall_s8(w20), vw21 = v_setall_s8(w21), vw22 = v_setall_s8(w22);
v_int32x4 vout0, vout1, vout2, vout3, vbias = v_setall_s32(biasCopy), voutzp = v_setall_s32(outZp),
outmin = v_setall_s32(-128), outmax = v_setall_s32(127);
v_float32x4 vmult = v_setall_f32(mult);
for( ; out_j < outW1; out_j += out_delta )
{
if (out_j + out_delta > outW1)
{
if (out_j <= pad_l)
break;
out_j = outW1 - out_delta;
}
int in_j = out_j * stride_w - pad_l;
v_int8x16 v00 = v_load(imgptr0 + in_j),
v01 = v_load(imgptr0 + in_j + dilation_w),
v02 = v_load(imgptr0 + in_j + dilation_w*2),
v10 = v_load(imgptr1 + in_j),
v11 = v_load(imgptr1 + in_j + dilation_w),
v12 = v_load(imgptr1 + in_j + dilation_w*2),
v20 = v_load(imgptr2 + in_j),
v21 = v_load(imgptr2 + in_j + dilation_w),
v22 = v_load(imgptr2 + in_j + dilation_w*2);
vout0 = vout1 = vout2 = vout3 = vbias;
v_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3);
v_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3);
v_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3);
v_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3);
v_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3);
v_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3);
v_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3);
v_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3);
v_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3);
2023-10-13 19:23:30 +08:00
vout0 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout0), vmult)));
vout1 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout1), vmult)));
vout2 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout2), vmult)));
vout3 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout3), vmult)));
vout0 = v_min(v_max(vout0, outmin), outmax);
vout1 = v_min(v_max(vout1, outmin), outmax);
vout2 = v_min(v_max(vout2, outmin), outmax);
vout3 = v_min(v_max(vout3, outmin), outmax);
v_store(outptr + out_j, vout0);
v_store(outptr + out_j + 4, vout1);
v_store(outptr + out_j + 8, vout2);
v_store(outptr + out_j + 12, vout3);
}
}
#endif
for (; out_j < outW1; out_j++)
{
int in_j = out_j * stride_w - pad_l;
out = (int)imgptr0[in_j]*w00 + (int)imgptr0[in_j + dilation_w]*w01 + (int)imgptr0[in_j + dilation_w*2]*w02 +
(int)imgptr1[in_j]*w10 + (int)imgptr1[in_j + dilation_w]*w11 + (int)imgptr1[in_j + dilation_w*2]*w12 +
(int)imgptr2[in_j]*w20 + (int)imgptr2[in_j + dilation_w]*w21 + (int)imgptr2[in_j + dilation_w*2]*w22 + biasCopy;
out1 = outZp + (int)std::round(out*mult);
outptr[out_j] = std::min(std::max(out1, -128), 127);
}
for (; out_j < outW; out_j++ )
{
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
int s0 = 1, s1 = 1, s2 = 1;
if (in_j0 >= width)
{
in_j0 = 0;
s0 = 0;
biasCopy += inpZp*(w00 + w10 + w20);
}
if (in_j1 >= width)
{
in_j1 = 0;
s1 = 0;
biasCopy += inpZp*(w01 + w11 + w21);
}
if (in_j2 >= width)
{
in_j2 = 0;
s2 = 0;
biasCopy += inpZp*(w02 + w12 + w22);
}
out = (int)imgptr0[in_j0]*w00*s0 + (int)imgptr0[in_j1]*w01*s1 + (int)imgptr0[in_j2]*w02*s2 +
(int)imgptr1[in_j0]*w10*s0 + (int)imgptr1[in_j1]*w11*s1 + (int)imgptr1[in_j2]*w12*s2 +
(int)imgptr2[in_j0]*w20*s0 + (int)imgptr2[in_j1]*w21*s1 + (int)imgptr2[in_j2]*w22*s2 + biasCopy;
out1 = outZp + (int)std::round(out*mult);
outptr[out_j] = std::min(std::max(out1, -128), 127);
}
}
}
continue;
}
// do im2row for a part of input tensor
int8_t* rowbuf = rowbuf0;
if (isConv1D)
{
for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i )
{
int delta = std::min(ofs1 - ofs, outW - out_j);
int out_j1 = out_j + delta;
int in_j = out_j * stride_w - pad_l;
const int8_t* imgptr = data_inp0 + cn0*width + in_j;
ofs += delta;
// do im2row for a part of input tensor
if( is1x1 )
{
for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w )
{
for( k = 0; k < vsz; k++ )
rowbuf[k] = imgptr[k*inpPlaneSize];
}
}
else
{
for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w )
{
// this condition should be true for most of the tensor elements, i.e.
// most of the time the kernel aperture is inside the tensor X-Y plane.
if( out_j + 2 <= out_j1 && 0 <= in_j && in_j + stride_w*2 <= width - (kernel_w-1)*dilation_w )
{
for( k = 0; k < vsz; k++ )
{
int k1 = ofstab[k];
int8_t v0 = imgptr[k1];
int8_t v1 = imgptr[k1 + stride_w];
rowbuf[k] = v0;
rowbuf[k+vsz_a] = v1;
}
out_j++;
rowbuf += vsz_a;
imgptr += stride_w;
in_j += stride_w;
}
else
{
int i0 = std::max(0, (-in_j + dilation_w-1)/dilation_w);
int i1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w);
// here some non-continuous sub-row of the row will not be
// filled from the tensor; we need to make sure that the uncovered
// elements are explicitly set to 0's. the easiest way is to
// set all the elements to 0's before the loop.
memset(rowbuf, (int8_t)inpZp, vsz*sizeof(rowbuf[0]));
for( k = 0; k < ncn; k++ )
{
for( i = i0; i < i1; i++ )
{
int imgofs = k*width + i*dilation_w;
rowbuf[k*kernel_w + i] = imgptr[imgofs];
}
}
}
}
}
}
}
else if (isConv2D)
{
if( is1x1 && stride_w == 1 && stride_h == 1 )
{
const int8_t* imgptr = data_inp0 + (cn0*height + out_i)*width + out_j;
for( int j = 0; j < bsz; j++, rowbuf += vsz_a )
{
if( j + 4 <= bsz )
{
k = 0;
for( ; k < vsz; k++ )
{
const int8_t* inp = imgptr + j + k*inpPlaneSize;
int8_t v0 = inp[0], v1 = inp[1], v2 = inp[2], v3 = inp[3];
rowbuf[k] = v0;
rowbuf[k + vsz_a] = v1;
rowbuf[k + vsz_a*2] = v2;
rowbuf[k + vsz_a*3] = v3;
}
j += 3;
rowbuf += vsz_a*3;
}
else
{
for( k = 0; k < vsz; k++ )
{
rowbuf[k] = imgptr[j + k*inpPlaneSize];
}
}
}
}
else
for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i )
{
int delta = std::min(ofs1 - ofs, outW - out_j);
int out_j1 = out_j + delta;
int in_i = out_i * stride_h - pad_t;
int in_j = out_j * stride_w - pad_l;
const int8_t* imgptr = data_inp0 + (cn0*height + in_i)*width + in_j;
ofs += delta;
// do im2row for a part of input tensor
if( is1x1 )
{
for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w )
{
for( k = 0; k < vsz; k++ )
rowbuf[k] = imgptr[k*inpPlaneSize];
}
}
else
{
bool ok_i = 0 <= in_i && in_i < height - (kernel_h-1)*dilation_h;
int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h);
int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h);
for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w )
{
// this condition should be true for most of the tensor elements, i.e.
// most of the time the kernel aperture is inside the tensor X-Y plane.
if( ok_i && out_j + 2 <= out_j1 && 0 <= in_j && in_j + stride_w*2 <= width - (kernel_w-1)*dilation_w )
{
for( k = 0; k < vsz; k++ )
{
int k1 = ofstab[k];
int8_t v0 = imgptr[k1];
int8_t v1 = imgptr[k1 + stride_w];
rowbuf[k] = v0;
rowbuf[k+vsz_a] = v1;
}
out_j++;
rowbuf += vsz_a;
imgptr += stride_w;
in_j += stride_w;
}
else
{
int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w);
int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w);
// here some non-continuous sub-row of the row will not be
// filled from the tensor; we need to make sure that the uncovered
// elements are explicitly set to 0's. the easiest way is to
// set all the elements to 0's before the loop.
memset(rowbuf, (int8_t)inpZp, vsz*sizeof(rowbuf[0]));
for( k = 0; k < ncn; k++ )
{
for( i = i0; i < i1; i++ )
{
for( j = j0; j < j1; j++ )
{
int imgofs = k*(width*height) + i*(dilation_h*width) + j*dilation_w;
rowbuf[(k*kernel_h + i)*kernel_w + j] = imgptr[imgofs];
}
}
}
}
}
}
}
}
else
{
for( ofs = ofs0; ofs < ofs1; out_d += (out_i + 1) / outH, out_i = (out_i + 1) % outH, out_j = 0 )
{
int delta = std::min(ofs1 - ofs, outW - out_j);
int out_j1 = out_j + delta;
int in_d = out_d * stride_d - pad_d;
int in_i = out_i * stride_h - pad_t;
int in_j = out_j * stride_w - pad_l;
const int8_t* imgptr = data_inp0 + (cn0*depth*height + in_d*height + in_i)*width + in_j;
ofs += delta;
int d0 = std::max(0, (-in_d + dilation_d - 1) / dilation_d);
int d1 = std::min(kernel_d, (depth - in_d + dilation_d - 1) / dilation_d);
int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h);
int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h);
for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w )
{
int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w);
int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w);
// here some non-continuous sub-row of the row will not be
// filled from the tensor; we need to make sure that the uncovered
// elements are explicitly set to 0's. the easiest way is to
// set all the elements to 0's before the loop.
memset(rowbuf, (int8_t)inpZp, vsz*sizeof(rowbuf[0]));
for( k = 0; k < ncn; k++ )
{
for ( d = d0; d < d1; d++)
{
for( i = i0; i < i1; i++ )
{
for( j = j0; j < j1; j++ )
{
int imgofs = k*(depth*width*height) + d*dilation_d*width*height + i*(dilation_h*width) + j*dilation_w;
rowbuf[(k*kernel_d*kernel_h + d*kernel_h + i)*kernel_w + j] = imgptr[imgofs];
}
}
}
}
}
}
}
// now compute dot product of the weights
// and im2row-transformed part of the tensor
#if CV_TRY_AVX512_SKX
if(useAVX512)
opt_AVX2::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
else
#endif
#if CV_TRY_AVX2
if(useAVX2)
opt_AVX2::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
else
#endif
#if CV_TRY_LASX
if(useLASX)
opt_LASX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
else
#endif
for( int i = 0; i < outCn; i += 2 )
{
const int8_t* wptr0 = wptr + i*wstep;
const int8_t* wptr1 = wptr0 + wstep;
int* outptr0 = data_out0 + ofs0 + i*outPlaneSize;
int* outptr1 = outptr0 + outPlaneSize;
int bias0 = biasptr[i], bias1 = biasptr[i+1];
float mult0 = multptr[i], mult1 = multptr[i+1];
if( i+1 >= outCn )
{
wptr1 = wptr0;
outptr1 = outptr0;
bias1 = bias0;
mult1 = mult0;
}
int j = 0;
#if CV_SIMD128
v_int32x4 voutzp = v_setall_s32(outZp), outmin = v_setall_s32(-128), outmax = v_setall_s32(127);
v_float32x4 vmult0 = v_setall_f32(mult0), vmult1 = v_setall_f32(mult1);
for( ; j <= bsz - 4; j += 4 )
{
const int8_t* rptr = rowbuf0 + j*vsz_a;
v_int32x4 s0, s1;
if( cn0 == 0 )
{
s0 = v_setall_s32(bias0);
s1 = v_setall_s32(bias1);
}
else
{
s0 = v_load(outptr0 + j);
s1 = v_load(outptr1 + j);
}
v_int32x4 vs00 = v_setzero_s32(), vs01 = v_setzero_s32(),
vs02 = v_setzero_s32(), vs03 = v_setzero_s32(),
vs10 = v_setzero_s32(), vs11 = v_setzero_s32(),
vs12 = v_setzero_s32(), vs13 = v_setzero_s32();
for( k = 0; k < vsz; k += 16, rptr += 16 )
{
v_int8x16 w0 = v_load_aligned(wptr0 + k);
v_int8x16 w1 = v_load_aligned(wptr1 + k);
v_int8x16 r0 = v_load_aligned(rptr);
v_int8x16 r1 = v_load_aligned(rptr + vsz_a);
v_int8x16 r2 = v_load_aligned(rptr + vsz_a*2);
v_int8x16 r3 = v_load_aligned(rptr + vsz_a*3);
vs00 = v_dotprod_expand_fast(w0, r0, vs00);
vs01 = v_dotprod_expand_fast(w0, r1, vs01);
vs02 = v_dotprod_expand_fast(w0, r2, vs02);
vs03 = v_dotprod_expand_fast(w0, r3, vs03);
vs10 = v_dotprod_expand_fast(w1, r0, vs10);
vs11 = v_dotprod_expand_fast(w1, r1, vs11);
vs12 = v_dotprod_expand_fast(w1, r2, vs12);
vs13 = v_dotprod_expand_fast(w1, r3, vs13);
}
2023-10-13 19:23:30 +08:00
s0 = v_add(s0, v_int32x4(v_reduce_sum(vs00), v_reduce_sum(vs01), v_reduce_sum(vs02), v_reduce_sum(vs03)));
s1 = v_add(s1, v_int32x4(v_reduce_sum(vs10), v_reduce_sum(vs11), v_reduce_sum(vs12), v_reduce_sum(vs13)));
if( cn1 == inpCn )
{
2023-10-13 19:23:30 +08:00
s0 = v_add(voutzp, v_round(v_mul(v_cvt_f32(s0), vmult0)));
s1 = v_add(voutzp, v_round(v_mul(v_cvt_f32(s1), vmult1)));
s0 = v_min(v_max(s0, outmin), outmax);
s1 = v_min(v_max(s1, outmin), outmax);
}
v_store(outptr0 + j, s0);
v_store(outptr1 + j, s1);
}
#endif
for( ; j < bsz; j++ )
{
const int8_t* rptr = rowbuf0 + j*vsz_a;
int s00, s10;
if( cn0 == 0 )
{
s00 = bias0;
s10 = bias1;
}
else
{
s00 = outptr0[j];
s10 = outptr1[j];
}
for( k = 0; k < vsz; k++ )
{
int8_t r0 = rptr[k];
s00 += (int)wptr0[k] * r0;
s10 += (int)wptr1[k] * r0;
}
if( cn1 == inpCn )
{
int out0 = outZp + (int)std::round(s00*mult0);
int out1 = outZp + (int)std::round(s10*mult1);
s00 = std::min(std::max(out0, -128), 127);
s10 = std::min(std::max(out1, -128), 127);
}
outptr0[j] = s00;
outptr1[j] = s10;
}
}
}
}
if( activ_ )
activ_->forwardSlice(data_out0 + stripeStart, lutptr_,
data_out0 + stripeStart, (int)(stripeEnd - stripeStart),
outPlaneSize, startOutCn, startOutCn + outCn);
}
}
};
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
#if CV_SSE3
uint32_t ftzMode = _MM_GET_FLUSH_ZERO_MODE();
uint32_t dazMode = _MM_GET_DENORMALS_ZERO_MODE();
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
/*if (inputs[0].dims > 3) {
printf("conv %s: input (%d x %d x %d x %d), kernel (%d x %d), pad (%d x %d), stride (%d x %d), dilation (%d x %d)\n",
name.c_str(), inputs[0].size[0], inputs[0].size[1], inputs[0].size[2], inputs[0].size[3],
kernel.width, kernel.height, pad.width, pad.height,
stride.width, stride.height, dilation.width, dilation.height);
}
else {
printf("conv %s: input (%d x %d x %d), kernel (%d x %d), pad (%d x %d), stride (%d x %d), dilation (%d x %d)\n",
name.c_str(), inputs[0].size[0], inputs[0].size[1], inputs[0].size[2],
kernel.width, kernel.height, pad.width, pad.height,
stride.width, stride.height, dilation.width, dilation.height);
}*/
int inpGroupCn = blobs[0].size[1];
CV_Assert_N(inputs.size() == (size_t)1, inputs[0].size[1] % inpGroupCn == 0,
outputs.size() == 1, inputs[0].data != outputs[0].data);
int ngroups = inputs[0].size[1] / inpGroupCn;
CV_Assert(outputs[0].size[1] % ngroups == 0);
int nstripes = std::max(getNumThreads(), 1);
Mat outputInt32 = Mat(shape(outputs[0]), CV_32S);
ParallelConv::run(inputs[0], outputInt32, weightsMat, outputMultiplier, biasvec, activationLUT, kernel_size, strides,
pads_begin, pads_end, dilations, activ.get(), ngroups, nstripes, input_zp, output_zp);
outputInt32.convertTo(outputs[0], CV_8S);
#if CV_SSE3
_MM_SET_FLUSH_ZERO_MODE(ftzMode);
_MM_SET_DENORMALS_ZERO_MODE(dazMode);
#endif
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
CV_Assert(inputs.size() == outputs.size());
int64 flops = 0;
int karea = std::accumulate(kernel_size.begin(), kernel_size.end(), 1, std::multiplies<size_t>());
for (int i = 0; i < outputs.size(); i++)
{
flops += total(outputs[i])*(CV_BIG_INT(2)*karea*inputs[i][1] + 1);
}
return flops;
}
};
Ptr<BaseConvolutionLayer> ConvolutionLayerInt8::create(const LayerParams &params)
{
return Ptr<BaseConvolutionLayer>(new ConvolutionLayerInt8Impl(params));
}
}
}