opencv/modules/dnn/src/int8layers/fully_connected_layer.cpp
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_timvx.hpp"
#include "../ie_ngraph.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace cv
{
namespace dnn
{
class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8
{
public:
    enum { VEC_ALIGN = 32 };

    FullyConnectedLayerInt8Impl(const LayerParams& params)
    {
        setParamsFrom(params);

        input_sc = params.get<float>("input_scale");
        input_zp = params.get<int>("input_zeropoint");
        output_zp = params.get<int>("zeropoints");
        output_sc = params.get<float>("scales");
        axis = params.get<int>("axis", 1);
        per_channel = params.get<bool>("per_channel", true);

        if (blobs.size() == 3)
        {
            // blobs[0] - Weights
            // blobs[1] - Bias fused with offset
            // blobs[2] - Multipliers for output stage
            int numOutput = params.get<int>("num_output");
            int innerSize = (int)blobs[0].total() / numOutput;

            CV_Assert(blobs[0].dims >= 2 && (size_t)(innerSize * numOutput) == blobs[0].total());
            CV_Assert((size_t)numOutput == blobs[1].total());

            weightsMat = blobs[0] = blobs[0].reshape(1, numOutput);
            int vecsize = weightsMat.cols;
            if (vecsize % VEC_ALIGN != 0)
            {
                // Zero-pad each weight row up to a multiple of VEC_ALIGN so the
                // vectorized kernels can safely read whole blocks past the logical row end.
                int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
                Mat weightsBuf(weightsMat.rows, vecsize_aligned, weightsMat.type());
                Mat wpadding = weightsBuf.colRange(vecsize, vecsize_aligned);
                wpadding.setTo(Scalar::all(0));
                weightsMat = weightsBuf.colRange(0, vecsize);
                blobs[0].copyTo(weightsMat);
            }
            biasMat = blobs[1] = blobs[1].reshape(1, 1);
            outputMultiplier = blobs[2];
        }
    }
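
    // The output shape keeps the input dimensions before `axis` and replaces the
    // remaining ones with a single dimension of size numOutput (blobs[0].size[0]).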
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &) const CV_OVERRIDE
    {
        int numOutput, cAxis;
        CV_CheckEQ(inputs.size(), (size_t)1, "");
        CV_CheckEQ(blobs[0].dims, 2, "");
        numOutput = blobs[0].size[0];
        CV_Assert((size_t)numOutput == blobs[1].total());
        cAxis = normalize_axis(axis, inputs[0]);

        MatShape outShape(cAxis + 1);
        for (int i = 0; i < cAxis; ++i)
            outShape[i] = inputs[0][i];
        outShape.back() = numOutput;

        outputs.resize(1, outShape);
        return false;
    }
    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        if (backendId == DNN_BACKEND_TIMVX && haveTimVX())
        {
            // The TimVX backend currently handles only the bias-free case.
            if (biasMat.empty())
                return true;
            else
                return false;
        }

        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
    }
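
    // Int8 activations are fused by keeping the activation layer and its lookup table
    // (activationLUT); the table is applied to the int32 accumulator in forwardSlice().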
    virtual bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
    {
        // TODO: add activation fusion for the fully connected layer on NPU.
#ifdef HAVE_TIMVX
        if (preferableTarget == DNN_TARGET_NPU)
            return false;
#endif
        Ptr<ActivationLayerInt8> activ_int8 = layer.dynamicCast<ActivationLayerInt8>();
        if (!activ_int8.empty())
        {
            activ = activ_int8;
            if (!activ_int8->blobs.empty())
                activ_int8->blobs[0].convertTo(activationLUT, CV_32S);
            return true;
        }
        return false;
    }
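
    // Builds a TimVX MatMul node for the NPU backend. Per-output-channel scales are
    // reconstructed from outputMultiplier: bias_scale[i] = multiplier[i] * output_scale
    // and weight_scale[i] = bias_scale[i] / input_scale.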
    virtual Ptr<BackendNode> initTimVX(void* timVXInfo_,
                                       const std::vector<Ptr<BackendWrapper> > &inputsWrapper,
                                       const std::vector<Ptr<BackendWrapper> > &outputsWrapper,
                                       bool isLast) CV_OVERRIDE
    {
#ifdef HAVE_TIMVX
        // tvGraph initialization.
        auto timVxInfo = reinterpret_cast<TimVXInfo *>(timVXInfo_);
        CV_Assert(timVxInfo);
        Ptr<TimVXGraph> tvGraph = timVxInfo->getGraph();
        CV_Assert(tvGraph);
        Ptr<tim::vx::Graph> graph = tvGraph->graph;

        int numOutput = blobs[0].size[0];
        Mat weightMat = blobs[0];

        std::vector<int> inputsIndex;
        std::vector<int> outputsIndex;

        std::vector<float> weight_scs, bias_scs;
        std::vector<int32_t> weight_zps;

        bias_scs.resize(numOutput);
        weight_scs.resize(numOutput);
        for (int i = 0; i < numOutput; i++)
        {
            bias_scs[i] = outputMultiplier.at<float>(i) * output_sc;
            weight_scs[i] = bias_scs[i] / input_sc;
        }
        weight_zps.assign(numOutput, 0);

        // Input tensor.
        auto inputWrapper = inputsWrapper[0].dynamicCast<TimVXBackendWrapper>();
        int input_index = -1, weight_index = -1, output_index = -1;

        if (inputWrapper->isTensor())
        {
            input_index = tvGraph->getTensorIndex(inputWrapper->getTensor());
            if (input_index == -1)
            {
                // Copy to a new inputWrapper.
                Mat tmp = inputWrapper->getMat();
                inputWrapper = Ptr<TimVXBackendWrapper>(new TimVXBackendWrapper(tmp));
            }
        }

        if (!inputWrapper->isTensor() || input_index == -1)
        {
            Ptr<tim::vx::Quantization> tvInputQuant = Ptr<tim::vx::Quantization>(
                    new tim::vx::Quantization(tim::vx::QuantType::ASYMMETRIC, input_sc, input_zp));
            inputWrapper->createTensor(graph, tim::vx::TensorAttribute::INPUT, tvInputQuant);
            input_index = tvGraph->addWrapper(inputWrapper);
        }
        inputsIndex.push_back(input_index);

        // Weight tensor.
        Ptr<TimVXBackendWrapper> weightWrapper = Ptr<TimVXBackendWrapper>(new TimVXBackendWrapper(weightMat));
        Ptr<tim::vx::Quantization> weightQuant;

        bool tvSymmetric;
        tvSymmetric = getQuantType(weight_scs, numOutput);

        if (tvSymmetric)
        {
            // TODO: fix the following issue.
            // TimVX does not support SYMMETRIC PER CHANNEL MatMul.
            return Ptr<BackendNode>();
        }
        else
        {
            weightQuant = Ptr<tim::vx::Quantization>(
                    new tim::vx::Quantization(tim::vx::QuantType::ASYMMETRIC, weight_scs[0], 0));
        }
        weightWrapper->createTensor(graph, tim::vx::TensorAttribute::CONSTANT, weightQuant);

        weight_index = tvGraph->addWrapper(weightWrapper);
        inputsIndex.push_back(weight_index);

        // Output tensor.
        CV_Assert(outputsWrapper.size() == 1);
        Ptr<TimVXBackendWrapper> outputWrapper = outputsWrapper[0].dynamicCast<TimVXBackendWrapper>();
        Ptr<tim::vx::Quantization> outputQuant = Ptr<tim::vx::Quantization>(
                new tim::vx::Quantization(tim::vx::QuantType::ASYMMETRIC, output_sc, output_zp));

        if (isLast)
        {
            auto shapeType = getShapeTypeFromMat(outputWrapper->getMat());

            // For a graph output tensor, the tensor shape must be set before createTensor().
            outputWrapper->setTensorShape(shapeType);
            outputWrapper->createTensor(graph, tim::vx::TensorAttribute::OUTPUT, outputQuant);
        }
        else
        {
            outputWrapper->createTensor(graph, tim::vx::TensorAttribute::TRANSIENT, outputQuant);
        }
        output_index = tvGraph->addWrapper(outputWrapper);
        outputsIndex.push_back(output_index);

        std::shared_ptr<tim::vx::Operation> tvMatmul;
        // MatMul with the second input (weights) transposed: y = x * W^T.
        tvMatmul = graph->CreateOperation<tim::vx::ops::Matmul>(false, true);

        Ptr<TimVXBackendNode> tvBackendNode = new TimVXBackendNode(tvGraph, tvMatmul, inputsIndex, outputsIndex);

        return tvBackendNode;
#endif  // HAVE_TIMVX
        return Ptr<BackendNode>();
    }
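
    // Parallel int8 fully connected kernel. Work is split into `nstripes` stripes over
    // the (sample, output neuron) pairs; each output is accumulated in int32 and then
    // requantized as clamp(outZp + round(sum * multiplier), -128, 127).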
    class FullyConnected : public ParallelLoopBody
    {
    public:
        FullyConnected() : srcMat(0), weights(0), biasMat(0), outputMultiplier(0), activationLUT(0), activ(0),
                           dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false), useLASX(false) {}

        static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat, const Mat& outputMultiplier,
                        const Mat& activationLUT, Mat& dstMat, const ActivationLayerInt8* activ, int nstripes, int outZp)
        {
            CV_Assert( srcMat.dims == 2 && srcMat.cols == weights.cols &&
                       dstMat.rows == srcMat.rows && dstMat.cols == weights.rows &&
                       srcMat.type() == weights.type() && srcMat.type() == CV_8S &&
                       dstMat.type() == CV_32S && biasMat.type() == CV_32S &&
                       biasMat.isContinuous() && (int)biasMat.total() == dstMat.cols );

            FullyConnected p;

            p.srcMat = &srcMat;
            p.weights = &weights;
            p.biasMat = &biasMat;
            p.outputMultiplier = &outputMultiplier;
            p.activationLUT = &activationLUT;
            p.dstMat = &dstMat;
            p.nstripes = nstripes;
            p.outZp = outZp;
            p.activ = !activationLUT.empty() ? activ : 0;
            p.useAVX2 = checkHardwareSupport(CPU_AVX2);
            p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
            p.useLASX = checkHardwareSupport(CPU_LASX);

            parallel_for_(Range(0, nstripes), p, nstripes);
        }
        void operator()(const Range& r) const CV_OVERRIDE
        {
            int valign = FullyConnectedLayerInt8Impl::VEC_ALIGN;
            int nsamples = srcMat->rows;
            int nw0 = weights->rows;
            int k, vecsize = srcMat->cols;
            int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
            size_t total = (size_t)nsamples*nw0;
            size_t stripeSize = (total + nstripes - 1)/nstripes;
            size_t stripeStart = r.start*stripeSize;
            size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total);
            size_t wstep = weights->step1();
            AutoBuffer<int8_t> srcbuf(vecsize_aligned + valign);
            int8_t* sptr = alignPtr(srcbuf.data(), (int)(valign*sizeof(int8_t)));
            const int* lutptr = !activationLUT->empty() ? activationLUT->ptr<int>() : 0;

            // Zero the tail of the aligned source buffer so SIMD loads past `vecsize`
            // contribute nothing to the dot products.
            for( k = vecsize; k < vecsize_aligned; k++ )
                sptr[k] = 0;

            for( size_t ofs = stripeStart; ofs < stripeEnd; )
            {
                int sampleIdx = (int)(ofs / nw0);
                int delta = (int)(ofs - (size_t)sampleIdx*nw0);
                const int8_t* sptr_ = srcMat->ptr<int8_t>(sampleIdx);
                const int8_t* wptr = weights->ptr<int8_t>(delta);
                int* dptr = dstMat->ptr<int>(sampleIdx) + delta;
                const int* biasptr = biasMat->ptr<int>() + delta;
                const float* multptr = outputMultiplier->ptr<float>() + delta;
                int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs));

                memcpy(sptr, sptr_, vecsize*sizeof(sptr[0]));

#if CV_TRY_AVX512_SKX
                if( useAVX512 )
                    opt_AVX512_SKX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
                else
#endif
#if CV_TRY_AVX2
                if( useAVX2 )
                    opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
                else
#endif
#if CV_TRY_LASX
                if( useLASX )
                    opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
                else
#endif
                {
                    int i = 0;

#if CV_SIMD128
                    // Process four output neurons at a time using 16-lane int8 dot products.
                    for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
                    {
                        v_int32x4 vs0 = v_setzero_s32(), vs1 = v_setzero_s32(),
                                  vs2 = v_setzero_s32(), vs3 = v_setzero_s32();
                        v_int32x4 outzp = v_setall_s32(outZp), outmin = v_setall_s32(-128), outmax = v_setall_s32(127);
                        v_int32x4 s = v_load(biasptr + i);
                        v_float32x4 mult = v_load(multptr + i);

                        for( k = 0; k < vecsize; k += 16 )
                        {
                            v_int8x16 v = v_load_aligned(sptr + k);
                            vs0 = v_dotprod_expand_fast(v, v_load_aligned(wptr + k), vs0);
                            vs1 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep + k), vs1);
                            vs2 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep*2 + k), vs2);
                            vs3 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep*3 + k), vs3);
                        }

                        s = v_add(s, v_int32x4(v_reduce_sum(vs0), v_reduce_sum(vs1), v_reduce_sum(vs2), v_reduce_sum(vs3)));
                        v_int32x4 out = v_add(outzp, v_round(v_mul(v_cvt_f32(s), mult)));
                        v_store(dptr + i, v_min(v_max(out, outmin), outmax));
                    }
#endif

                    // Scalar tail: plain int32 accumulation followed by requantization.
                    for( ; i < nw; i++, wptr += wstep )
                    {
                        int s0 = biasptr[i];
                        float mult0 = multptr[i];

                        for( k = 0; k < vecsize; k++ )
                        {
                            int8_t v = sptr[k];
                            s0 += (int)v*wptr[k];
                        }
                        int out0 = outZp + (int)std::round(s0*mult0);
                        dptr[i] = std::min(std::max(out0, -128), 127);
                    }
                }

                if(activ)
                    activ->forwardSlice(dptr, lutptr, dptr, 1, 1, delta, delta + nw);

                ofs += nw;
            }
        }
        const Mat *srcMat, *weights, *biasMat, *outputMultiplier, *activationLUT;
        const ActivationLayerInt8* activ;
        Mat* dstMat;
        int nstripes, outZp;
        bool useAVX2;
        bool useAVX512;
        bool useLASX;
    };
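
    // Flattens the input to a 2D [outerSize x innerSize] matrix, runs the int8 kernel with
    // int32 accumulation, and converts the already-clamped results to CV_8S.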
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        std::vector<Mat> input, output;
        inputs_arr.getMatVector(input);
        outputs_arr.getMatVector(output);

        int axisCan = normalize_axis(axis, input[0].dims);
        int outerSize = input[0].total(0, axisCan);

        Mat srcMat = input[0].reshape(1, outerSize);
        Mat dstMat = output[0].reshape(1, outerSize);
        Mat dstMatInt32 = Mat(shape(dstMat), CV_32S);

        const int nstripes = getNumThreads();
        FullyConnected::run(srcMat, weightsMat, biasMat, outputMultiplier, activationLUT, dstMatInt32, activ.get(), nstripes, output_zp);
        dstMatInt32.convertTo(dstMat, CV_8S);
    }
    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
        CV_UNUSED(inputs); // suppress unused variable warning
        int64 flops = 0;

        int innerSize = blobs[0].size[1];
        for (int i = 0; i < (int)outputs.size(); i++)
        {
            flops += CV_BIG_INT(3)*innerSize*total(outputs[i]);
        }

        return flops;
    }
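
    // OpenVINO (nGraph) path: the input is dequantized, the int8 weights are dequantized
    // per output channel through FakeQuantize with ranges derived from outputMultiplier,
    // the MatMul runs in float, and the result is re-quantized to the layer's output
    // scale and zero point.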
#ifdef HAVE_DNN_NGRAPH
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        CV_CheckTypeEQ(blobs[0].type(), CV_8S, "");  // weights
        CV_CheckTypeEQ(blobs[1].type(), CV_32S, "");  // bias
        CV_CheckTypeEQ(outputMultiplier.type(), CV_32F, "");

        ngraph::Output<ngraph::Node> input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        ngraph::Output<ngraph::Node> ieWeights, ieBias, matmul;
        bool transA = false, transB = true;
        size_t numOutput = blobs[0].size[0];

        if (nodes.size() == 2)
        {
            CV_Error(Error::StsNotImplemented, "");
            // auto inp2 = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
            // matmul = std::make_shared<ngraph::op::MatMul>(ieInpNode, inp2, transA, transB);
        }
        else
        {
            // Flatten the input to 2D: keep the dimensions before `axis`, infer the rest.
            std::vector<int> shape(1 + normalize_axis(axis, input.get_shape().size()), 0);
            shape[shape.size() - 1] = -1;
            input = std::make_shared<ngraph::op::v1::Reshape>(
                input,
                std::make_shared<ngraph::op::Constant>(ngraph::element::i32, ngraph::Shape{shape.size()}, shape.data()),
                true
            );

            input = ngraphDequantize(input, input_sc, input_zp);

            const float low = -128, high = 127;
            std::vector<float> inpLows(numOutput, low);
            std::vector<float> inpHighs(numOutput, high);
            std::vector<float> outLows(numOutput);
            std::vector<float> outHighs(numOutput);
            for (int i = 0; i < numOutput; ++i) {
                outLows[i] = low * outputMultiplier.ptr<float>()[i] * output_sc / input_sc;
                outHighs[i] = high * outputMultiplier.ptr<float>()[i] * output_sc / input_sc;
            }

            std::vector<size_t> weight_shape{(size_t)blobs[0].size[0], (size_t)blobs[0].size[1]};
            ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::i8, weight_shape, blobs[0].data);
            ieWeights = std::make_shared<ngraph::op::Convert>(ieWeights, ngraph::element::f32);
            ieWeights = std::make_shared<ngraph::op::FakeQuantize>(ieWeights,
                std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{numOutput, 1}, inpLows.data()),
                std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{numOutput, 1}, inpHighs.data()),
                std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{numOutput, 1}, outLows.data()),
                std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{numOutput, 1}, outHighs.data()),
                256  // levels
            );

            matmul = std::make_shared<ngraph::op::MatMul>(input, ieWeights, transA, transB);
        }

        if (blobs.size() > 1) {
            // Undo the input-zero-point offset that was fused into the int32 bias and
            // rescale it to float.
            int32_t* bias = blobs[1].ptr<int32_t>();
            std::vector<float> ovBias(blobs[1].total());
            for (int i = 0; i < ovBias.size(); ++i) {
                ovBias[i] = (bias[i] + input_zp * cv::sum(blobs[0].row(i))[0]) * outputMultiplier.ptr<float>()[i] * output_sc;
            }
            auto bias_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
                                                                    ngraph::Shape{blobs[1].total()}, ovBias.data());
            matmul = std::make_shared<ngraph::op::v1::Add>(matmul, bias_node);
        }

        matmul = ngraphQuantize(matmul, output_sc, output_zp);
        return new InfEngineNgraphNode(matmul);
    }
#endif  // HAVE_DNN_NGRAPH
    Mat weightsMat, biasMat, outputMultiplier, activationLUT;
    Ptr<ActivationLayerInt8> activ;
};

Ptr<InnerProductLayerInt8> InnerProductLayerInt8::create(const LayerParams& params)
{
    return Ptr<InnerProductLayerInt8>(new FullyConnectedLayerInt8Impl(params));
}

}
}