mirror of
https://github.com/opencv/opencv.git
synced 2024-12-17 10:58:00 +08:00
a80fcacd90
Add per_tensor_quantize to int8 quantize * add per_tensor_quantize to dnn int8 module. * change api flag from perTensor to perChannel, and recognize quantize type and onnx importer. * change the default to hpp
1404 lines
67 KiB
C++
1404 lines
67 KiB
C++
// This file is part of OpenCV project.
|
|
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
|
// of this distribution and at http://opencv.org/license.html.
|
|
|
|
#include "../precomp.hpp"
|
|
#include "layers_common.hpp"
|
|
|
|
#include <opencv2/core/utils/logger.hpp>
|
|
|
|
#include "opencv2/core/hal/hal.hpp"
|
|
#include "opencv2/core/hal/intrin.hpp"
|
|
#include "../op_timvx.hpp"
|
|
#include <iostream>
|
|
#include <numeric>
|
|
|
|
namespace cv
|
|
{
|
|
namespace dnn
|
|
{
|
|
|
|
#if CV_SIMD
|
|
static inline void v_expand_mul_add(const v_int8x16& a, const v_int8x16& b,
|
|
v_int32x4& out0, v_int32x4& out1, v_int32x4& out2, v_int32x4& out3)
|
|
{
|
|
v_int16x8 a0, a1, b0, b1;
|
|
v_expand(a, a0, a1);
|
|
v_expand(b, b0, b1);
|
|
|
|
v_int32x4 t0, t1;
|
|
v_mul_expand(a0, b0, t0, t1);
|
|
out0 += t0; out1 += t1;
|
|
|
|
v_mul_expand(a1, b1, t0, t1);
|
|
out2 += t0; out3 += t1;
|
|
}
|
|
#endif
|
|
|
|
class BaseConvolutionLayerInt8Impl : public ConvolutionLayerInt8
|
|
{
|
|
public:
|
|
BaseConvolutionLayerInt8Impl(const LayerParams ¶ms)
|
|
{
|
|
setParamsFrom(params);
|
|
getConvolutionKernelParams(params, kernel_size, pads_begin, pads_end, strides, dilations, padMode, adjust_pads);
|
|
|
|
numOutput = params.get<int>("num_output");
|
|
int ngroups = params.get<int>("group", 1);
|
|
CV_Assert(numOutput % ngroups == 0);
|
|
|
|
input_sc = params.get<float>("input_scale");
|
|
input_zp = params.get<int>("input_zeropoint");
|
|
output_zp = params.get<int>("zeropoints");
|
|
output_sc = params.get<float>("scales");
|
|
per_channel = params.get<bool>("per_channel", true);
|
|
|
|
if (kernel_size.size() == 2) {
|
|
kernel = Size(kernel_size[1], kernel_size[0]);
|
|
stride = Size(strides[1], strides[0]);
|
|
for (int i = 0; i < pads_begin.size(); i++) {
|
|
if (pads_begin[i] != pads_end[i])
|
|
CV_Error(Error::StsNotImplemented, "Unsupported asymmetric padding in convolution layer");
|
|
}
|
|
pad = Size(pads_begin[1], pads_begin[0]);
|
|
dilation = Size(dilations[1], dilations[0]);
|
|
|
|
adjustPad.height = adjust_pads[0];
|
|
adjustPad.width = adjust_pads[1];
|
|
}
|
|
|
|
for (int i = 0; i < adjust_pads.size(); i++) {
|
|
CV_Assert(adjust_pads[i] < strides[i]);
|
|
}
|
|
}
|
|
|
|
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
|
|
{
|
|
std::vector<Mat> inputs, outputs;
|
|
inputs_arr.getMatVector(inputs);
|
|
outputs_arr.getMatVector(outputs);
|
|
|
|
// blobs[0] - Weights (INT8)
|
|
// blobs[1] - Biases (INT32)
|
|
// blobs[2] - Multipliers for convolution output stage (FP32)
|
|
CV_Assert(!inputs.empty() && blobs.size() == 3);
|
|
MatSize weightShape = blobs[0].size;
|
|
|
|
CV_Assert(inputs[0].dims == outputs[0].dims);
|
|
if (weightShape.dims() == 3)
|
|
{
|
|
kernel_size.assign(1, kernel_size[0]);
|
|
strides.assign(1, strides[0]);
|
|
dilations.assign(1, dilations[0]);
|
|
pads_begin.assign(1, pads_begin[0]);
|
|
pads_end.assign(1, pads_end[0]);
|
|
}
|
|
CV_Assert(weightShape.dims() == kernel_size.size() + 2);
|
|
for (int i = 0; i < kernel_size.size(); i++) {
|
|
CV_Assert(weightShape[i + 2] == kernel_size[i]);
|
|
}
|
|
|
|
const Mat &input = inputs[0];
|
|
CV_Assert(((input.dims == 3 && kernel_size.size() == 1) || input.dims == 4 || input.dims == 5) && input.type() == CV_8S);
|
|
for (size_t i = 0; i < outputs.size(); i++)
|
|
{
|
|
CV_Assert(inputs[i].type() == input.type());
|
|
CV_Assert(((input.dims == 3 && kernel_size.size() == 1) || inputs[i].dims == 4 || inputs[i].dims == 5) && inputs[i].size[1] == input.size[1]);
|
|
for (int j = 0; j < inputs[i].dims; j++) {
|
|
CV_Assert(inputs[i].size[j] == input.size[j]);
|
|
}
|
|
}
|
|
|
|
std::vector<int> inpShape;
|
|
std::vector<int> outShape;
|
|
for (int i = 2; i < inputs[0].dims; i++) {
|
|
inpShape.push_back(inputs[0].size[i]);
|
|
outShape.push_back(outputs[0].size[i]);
|
|
}
|
|
getConvPoolPaddings(inpShape, kernel_size, strides, padMode, pads_begin, pads_end);
|
|
if (pads_begin.size() == 2) {
|
|
for (int i = 0; i < pads_begin.size(); i++) {
|
|
if (pads_begin[i] != pads_end[i])
|
|
CV_Error(Error::StsNotImplemented, "Unsupported asymmetric padding in convolution layer");
|
|
}
|
|
pad = Size(pads_begin[1], pads_begin[0]);
|
|
}
|
|
}
|
|
|
|
virtual MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const = 0;
|
|
bool is1x1() const
|
|
{
|
|
return (kernel.height == 1 && kernel.width == 1) &&
|
|
(stride.height == 1 && stride.width == 1) &&
|
|
(dilation.height == 1 && dilation.width == 1);
|
|
}
|
|
|
|
virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
|
|
{
|
|
Mat w, b;
|
|
top->getScaleShift(w, b);
|
|
if (w.empty() && b.empty())
|
|
return false;
|
|
|
|
CV_Assert((w.empty() || w.type() == CV_32F) &&
|
|
(b.empty() || b.type() == CV_32F));
|
|
|
|
float new_sc;
|
|
int new_zp;
|
|
top->getScaleZeropoint(new_sc, new_zp);
|
|
fuseWeights(w, b, new_sc);
|
|
output_sc = new_sc;
|
|
output_zp = new_zp;
|
|
return true;
|
|
}
|
|
|
|
virtual void fuseWeights(const Mat& w_, const Mat& b_, const float& new_sc) = 0;
|
|
};
|
|
|
|
//TODO: simultaneously convolution and bias addition for cache optimization
|
|
class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
|
|
{
|
|
public:
|
|
enum { VEC_ALIGN = 32, DFT_TYPE = CV_8S };
|
|
Mat weightsMat;
|
|
std::vector<int> biasvec;
|
|
std::vector<float> outputMultiplier;
|
|
Mat activationLUT;
|
|
Ptr<ActivationLayerInt8> activ;
|
|
|
|
ConvolutionLayerInt8Impl(const LayerParams ¶ms) : BaseConvolutionLayerInt8Impl(params){}
|
|
|
|
MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const CV_OVERRIDE
|
|
{
|
|
CV_Assert(!blobs.empty());
|
|
int dims = inpShape.size();
|
|
int inpD = dims == 5 ? inpShape[2] : 1;
|
|
int inpH = inpShape[dims - 2];
|
|
int inpW = inpShape.back();
|
|
int inpGroupCn = blobs[0].size[1];
|
|
int ksize = inpGroupCn * std::accumulate(kernel_size.begin(), kernel_size.end(),
|
|
1, std::multiplies<size_t>());
|
|
return shape(inpD * inpH * inpW, ksize);
|
|
}
|
|
|
|
virtual bool supportBackend(int backendId) CV_OVERRIDE
|
|
{
|
|
size_t ksize = kernel_size.size();
|
|
|
|
#ifdef HAVE_TIMVX
|
|
if (backendId == DNN_BACKEND_TIMVX)
|
|
{
|
|
/* only Conv1d and Conv2d supported. */
|
|
if (ksize == 2 || ksize == 1)
|
|
return true;
|
|
return false;
|
|
}
|
|
#endif
|
|
// Only default backend and Conv1D/Conv2D/Conv3D are supported
|
|
return backendId == DNN_BACKEND_OPENCV && ksize >= 1 && ksize <= 3;
|
|
}
|
|
|
|
bool getMemoryShapes(const std::vector<MatShape> &inputs,
|
|
const int requiredOutputs,
|
|
std::vector<MatShape> &outputs,
|
|
std::vector<MatShape> &internals) const CV_OVERRIDE
|
|
{
|
|
CV_Assert(!blobs.empty());
|
|
const int* weightShape = blobs[0].size.p;
|
|
CV_Assert(blobs[1].total() == (size_t)weightShape[0]);
|
|
|
|
internals.clear();
|
|
|
|
CV_Assert(inputs.size() != 0);
|
|
std::vector<int> inpShape(inputs[0].begin() + 2, inputs[0].end());
|
|
|
|
int outCn = weightShape[0];
|
|
std::vector<int> outShape;
|
|
outShape.push_back(inputs[0][0]);
|
|
outShape.push_back(outCn);
|
|
|
|
int inpCn = inputs[0][1];
|
|
if (padMode.empty())
|
|
{
|
|
for (int i = 0; i < inpShape.size(); i++)
|
|
outShape.push_back((inpShape[i] + pads_begin[i] + pads_end[i] - dilations[i] * (kernel_size[i] - 1) - 1) / strides[i] + 1);
|
|
}
|
|
else
|
|
{
|
|
getConvPoolOutParams(inpShape, kernel_size, strides, padMode, dilations, outShape);
|
|
}
|
|
|
|
int ngroups = inpCn / weightShape[1];
|
|
if (ngroups == 0 || ngroups * weightShape[1] != inpCn)
|
|
CV_Error(Error::StsError, format("Number of input channels should "
|
|
"be multiple of %d but got %d", weightShape[1], inpCn));
|
|
CV_Assert(ngroups > 0 && inpCn % ngroups == 0 && outCn % ngroups == 0);
|
|
|
|
outputs.resize(1, outShape);
|
|
|
|
return false;
|
|
}
|
|
|
|
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
|
|
{
|
|
BaseConvolutionLayerInt8Impl::finalize(inputs_arr, outputs_arr);
|
|
|
|
std::vector<Mat> inputs;
|
|
inputs_arr.getMatVector(inputs);
|
|
// prepare weightsMat where each row is aligned and has enough zero padding on the right to
|
|
// use vectorized (i.e. with intrinsics) loops without tail processing
|
|
Mat wm = blobs[0].reshape(1, numOutput);
|
|
if( wm.step1() % VEC_ALIGN != 0 )
|
|
{
|
|
int newcols = (int)alignSize(wm.step1(), VEC_ALIGN);
|
|
Mat wm_buffer = Mat(numOutput, newcols, wm.type());
|
|
Mat wm_padding = wm_buffer.colRange(wm.cols, newcols);
|
|
wm_padding.setTo(Scalar::all(0));
|
|
Mat wm_aligned = wm_buffer.colRange(0, wm.cols);
|
|
wm.copyTo(wm_aligned);
|
|
wm = wm_aligned;
|
|
}
|
|
weightsMat = wm;
|
|
|
|
Mat biasMat = blobs[1];
|
|
biasvec.resize(numOutput+2);
|
|
|
|
Mat outMult = blobs[2];
|
|
outputMultiplier.resize(numOutput+2);
|
|
for(int i = 0; i < numOutput; i++ )
|
|
{
|
|
biasvec[i] = biasMat.at<int>(i);
|
|
outputMultiplier[i] = outMult.at<float>(i);
|
|
}
|
|
}
|
|
|
|
bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
|
|
{
|
|
// TODO! add activation in convolution.
|
|
#ifdef HAVE_TIMVX
|
|
if (preferableTarget == DNN_TARGET_NPU)
|
|
return false;
|
|
#endif
|
|
Ptr<ActivationLayerInt8> activ_int8 = layer.dynamicCast<ActivationLayerInt8>();
|
|
if (!activ_int8.empty())
|
|
{
|
|
activ = activ_int8;
|
|
if (!activ_int8->blobs.empty())
|
|
activ_int8->blobs[0].convertTo(activationLUT, CV_32S);
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
|
|
{
|
|
return BaseConvolutionLayerInt8Impl::tryFuse(top);
|
|
}
|
|
|
|
void fuseWeights(const Mat& w_, const Mat& b_, const float& new_sc) CV_OVERRIDE
|
|
{
|
|
const int outCn = weightsMat.size[0];
|
|
Mat w = w_.total() == 1 ? Mat(1, outCn, CV_32F, Scalar(w_.at<float>(0))) : w_;
|
|
Mat b = b_.total() == 1 ? Mat(1, outCn, CV_32F, Scalar(b_.at<float>(0))) : b_;
|
|
CV_Assert_N(!weightsMat.empty(), biasvec.size() == outCn + 2,
|
|
w.empty() || outCn == w.total(), b.empty() || outCn == b.total());
|
|
|
|
for (int i = 0; i < outCn; ++i)
|
|
{
|
|
float off = outputMultiplier[i] * output_sc;
|
|
if (!w.empty())
|
|
off *= w.at<float>(i);
|
|
|
|
if (!b.empty())
|
|
biasvec[i] += (int)std::round(b.at<float>(i)/off);
|
|
|
|
outputMultiplier[i] = off/new_sc;
|
|
}
|
|
biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1];
|
|
outputMultiplier[outCn] = outputMultiplier[outCn+1] = outputMultiplier[outCn-1];
|
|
}
|
|
|
|
virtual Ptr<BackendNode> initTimVX(void* timVXInfo_,
|
|
const std::vector<Ptr<BackendWrapper> > &inputsWrapper,
|
|
const std::vector<Ptr<BackendWrapper> > &outputsWrapper,
|
|
bool isLast) CV_OVERRIDE
|
|
{
|
|
#ifdef HAVE_TIMVX
|
|
/* TODO :support GroupConv;
|
|
Ref:
|
|
https://github.com/VeriSilicon/TIM-VX/blob/main/docs/Operators.md#conv2d
|
|
Link Reference: https://github.com/VeriSilicon/TIM-VX/blob/main/src/tim/vx/ops/conv1d_test.cc
|
|
*/
|
|
|
|
// tvGraph Initialization.
|
|
auto timVxInfo = reinterpret_cast<TimVXInfo *>(timVXInfo_);
|
|
CV_Assert(timVxInfo);
|
|
Ptr<TimVXGraph> tvGraph = timVxInfo->getGraph();
|
|
CV_Assert(tvGraph);
|
|
Ptr<tim::vx::Graph> graph = tvGraph->graph;
|
|
|
|
Mat tvWeightMat = blobs[0];
|
|
|
|
std::vector<int> tvBiasVec;
|
|
tvBiasVec.assign(biasvec.begin(), biasvec.end() - 2);
|
|
Mat tvBiasMat(tvBiasVec);
|
|
|
|
for (int i = 0; i < numOutput; i++)
|
|
{
|
|
tvBiasVec[i] += input_zp * (cv::sum(blobs[0].row(i))[0]);
|
|
}
|
|
|
|
// Padding Type
|
|
tim::vx::PadType tvPadType;
|
|
|
|
if (padMode.empty())
|
|
{
|
|
tvPadType = tim::vx::PadType::AUTO; // TODO! check the padding type.
|
|
}
|
|
else if(padMode == "VALID")
|
|
{
|
|
tvPadType = tim::vx::PadType::VALID;
|
|
}
|
|
else if (padMode == "SAME")
|
|
{
|
|
tvPadType = tim::vx::PadType::SAME;
|
|
}
|
|
else
|
|
{
|
|
CV_Error(Error::StsError, "Unsupported padding mode in TimVXBackend!");
|
|
}
|
|
|
|
size_t ksize = kernel_size.size();
|
|
|
|
std::vector<int> inputsIndex;
|
|
std::vector<int> outputsIndex;
|
|
|
|
CV_Assert(inputsWrapper.size() == 1);
|
|
CV_Assert(ksize == 2 || ksize == 1);
|
|
|
|
std::vector<float> weight_scs, bias_scs;
|
|
std::vector<int32_t> weight_zps, bias_zps;
|
|
|
|
weight_scs.resize(numOutput);
|
|
bias_scs.resize(numOutput);
|
|
|
|
for (int i = 0; i < numOutput; i++)
|
|
{
|
|
bias_scs[i] = outputMultiplier[i] * output_sc;
|
|
weight_scs[i] = bias_scs[i] / input_sc;
|
|
}
|
|
|
|
weight_zps.assign(numOutput, 0);
|
|
bias_zps.assign(numOutput, 0);
|
|
|
|
bool tvSymmetric;
|
|
tvSymmetric = getQuantType(weight_scs, numOutput);
|
|
|
|
// input Tensor
|
|
auto inputWrapper = inputsWrapper[0].dynamicCast<TimVXBackendWrapper>();
|
|
int input_index = -1, weight_index = -1, bias_index = -1, output_index = -1;
|
|
|
|
if (inputWrapper->isTensor())
|
|
{
|
|
input_index = tvGraph->getTensorIndex(inputWrapper->getTensor());
|
|
if (input_index == -1)
|
|
{
|
|
// Copy To New inputWrapper
|
|
Mat tmp = inputWrapper->getMat();
|
|
inputWrapper = Ptr<TimVXBackendWrapper>(new TimVXBackendWrapper(tmp));
|
|
}
|
|
}
|
|
|
|
if (!inputWrapper->isTensor())
|
|
{
|
|
Ptr<tim::vx::Quantization> tvInputQuant = Ptr<tim::vx::Quantization>(
|
|
new tim::vx::Quantization(tim::vx::QuantType::ASYMMETRIC, input_sc, input_zp));
|
|
inputWrapper->createTensor(graph, tim::vx::TensorAttribute::INPUT, tvInputQuant);
|
|
input_index = tvGraph->addWrapper(inputWrapper);
|
|
}
|
|
inputsIndex.push_back(input_index);
|
|
|
|
// weight Tensor
|
|
auto tvConvWeightShape = shape(tvWeightMat);
|
|
Mat tvInputMat = inputWrapper->getMat();
|
|
// calculate group value.
|
|
int group = tvInputMat.size[1] / tvWeightMat.size[1];
|
|
|
|
// TODO! It will be supported in future.
|
|
if (tvSymmetric && tvWeightMat.total() == tvConvWeightShape[0])
|
|
return Ptr<TimVXBackendNode>();
|
|
// Reverse weight shape From OpenCV NCHW to TimVX WHCN.
|
|
std::reverse(tvConvWeightShape.begin(), tvConvWeightShape.end());
|
|
|
|
Ptr<TimVXBackendWrapper> weightWrapper = Ptr<TimVXBackendWrapper>(new TimVXBackendWrapper(tvWeightMat));
|
|
Ptr<tim::vx::Quantization> weightQuant;
|
|
|
|
if (tvSymmetric)
|
|
{
|
|
int wtChanneldim = tvWeightMat.dims - 1;
|
|
weightQuant = Ptr<tim::vx::Quantization>(
|
|
new tim::vx::Quantization(tim::vx::QuantType::SYMMETRIC_PER_CHANNEL, wtChanneldim,
|
|
weight_scs, weight_zps));
|
|
}
|
|
else
|
|
{
|
|
weightQuant = Ptr<tim::vx::Quantization>(
|
|
new tim::vx::Quantization(tim::vx::QuantType::ASYMMETRIC, weight_scs[0], 0));
|
|
}
|
|
weightWrapper->createTensor(graph,tim::vx::TensorAttribute::CONSTANT, weightQuant);
|
|
|
|
weight_index = tvGraph->addWrapper(weightWrapper);
|
|
inputsIndex.push_back(weight_index);
|
|
|
|
// Bias Tensor
|
|
Ptr<TimVXBackendWrapper> biasWrapper = Ptr<TimVXBackendWrapper>(new TimVXBackendWrapper(tvBiasMat));
|
|
Ptr<tim::vx::Quantization> biasQuant;
|
|
|
|
if (tvSymmetric)
|
|
{
|
|
biasQuant = Ptr<tim::vx::Quantization>(
|
|
new tim::vx::Quantization(tim::vx::QuantType::SYMMETRIC_PER_CHANNEL, 0,
|
|
bias_scs, bias_zps));
|
|
}
|
|
else
|
|
{
|
|
biasQuant = Ptr<tim::vx::Quantization>(
|
|
new tim::vx::Quantization(tim::vx::QuantType::ASYMMETRIC, weight_scs[0] * input_sc, 0));
|
|
}
|
|
|
|
biasWrapper->createTensor(graph, tim::vx::TensorAttribute::CONSTANT, biasQuant);
|
|
bias_index = tvGraph->addWrapper(biasWrapper);
|
|
inputsIndex.push_back(bias_index);
|
|
// Output tensor
|
|
CV_Assert(outputsWrapper.size() == 1);
|
|
auto outputWrapper = outputsWrapper[0].dynamicCast<TimVXBackendWrapper>();
|
|
Ptr<tim::vx::Quantization> outputQuant = Ptr<tim::vx::Quantization>(
|
|
new tim::vx::Quantization(tim::vx::QuantType::ASYMMETRIC, output_sc, output_zp));
|
|
|
|
if (isLast)
|
|
{
|
|
// From OpenCV NCHW, to TimVX WHCN
|
|
auto shapeType = getShapeTypeFromMat(outputWrapper->getMat());
|
|
|
|
// For Graph Output tensor, we need to set tensor shape before createTensor().
|
|
outputWrapper->setTensorShape(shapeType);
|
|
outputWrapper->createTensor(graph, tim::vx::TensorAttribute::OUTPUT, outputQuant);
|
|
}
|
|
else
|
|
{
|
|
outputWrapper->createTensor(graph, tim::vx::TensorAttribute::TRANSIENT, outputQuant);
|
|
}
|
|
|
|
output_index = tvGraph->addWrapper(outputWrapper);
|
|
outputsIndex.push_back(output_index);
|
|
|
|
std::shared_ptr<tim::vx::Operation> tvConv;
|
|
|
|
if (ksize == 2) // for conv2d
|
|
{
|
|
int multiplier = 0;
|
|
if(group == tvConvWeightShape[3] && group != 1)
|
|
multiplier = 1;
|
|
if (group == 1 || (group == tvConvWeightShape[3] && group != 1)) // Conv2D || DeConv2D
|
|
{
|
|
if (tvPadType == tim::vx::PadType::AUTO) {
|
|
tvConv = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
|
tvConvWeightShape[3], tvPadType,
|
|
std::array<uint32_t, 2>({(uint32_t) kernel_size[1], (uint32_t) kernel_size[0]}),
|
|
std::array<uint32_t, 2>({(uint32_t) strides[1], (uint32_t) strides[0]}),
|
|
std::array<uint32_t, 2>({(uint32_t) dilations[1], (uint32_t) dilations[0]}),
|
|
std::array<uint32_t, 4>({(uint32_t) pads_begin[1], (uint32_t) pads_end[1],
|
|
(uint32_t) pads_begin[0], (uint32_t) pads_end[0]}),
|
|
multiplier);
|
|
}
|
|
else
|
|
{
|
|
tvConv = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
|
tvPadType,
|
|
std::array<uint32_t, 2>({(uint32_t) strides[1], (uint32_t) strides[0]}),
|
|
std::array<uint32_t, 2>({(uint32_t) dilations[1], (uint32_t) dilations[0]}),
|
|
multiplier);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// GroupedConv2d
|
|
if (tvPadType == tim::vx::PadType::AUTO)
|
|
{
|
|
tvConv = graph->CreateOperation<tim::vx::ops::GroupedConv2d>(
|
|
std::array<uint32_t, 4>({(uint32_t) pads_begin[1], (uint32_t) pads_end[1],
|
|
(uint32_t) pads_begin[0], (uint32_t) pads_end[0]}),
|
|
std::array<uint32_t, 2>({(uint32_t)strides[1], (uint32_t)strides[0]}),
|
|
std::array<uint32_t, 2>({(uint32_t)dilations[1], (uint32_t)dilations[0]}),
|
|
group);
|
|
}
|
|
else
|
|
{
|
|
tvConv = graph->CreateOperation<tim::vx::ops::GroupedConv2d>(
|
|
tvPadType,
|
|
std::array<uint32_t, 2>({(uint32_t)strides[1], (uint32_t)strides[0]}),
|
|
std::array<uint32_t, 2>({(uint32_t)dilations[1], (uint32_t)dilations[0]}),
|
|
group);
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// for Conv1d
|
|
if (group != 1)
|
|
CV_Error( CV_StsNotImplemented, " Grouped Conv1d or Depth-Wise Conv1d are not supported by "
|
|
"TimVX Backend. Please try OpenCV Backend.");
|
|
tvConv = graph->CreateOperation<tim::vx::ops::Conv1d>(
|
|
tvConvWeightShape[2], tvPadType, (uint32_t)kernel_size[0],
|
|
(uint32_t)strides[0],(uint32_t)dilations[0],
|
|
std::array<uint32_t, 2>({(uint32_t)pads_begin[0], (uint32_t)pads_end[0]}));
|
|
}
|
|
// Create TimVXBackendNode
|
|
Ptr<TimVXBackendNode> tvBackendNode = new TimVXBackendNode(tvGraph, tvConv, inputsIndex, outputsIndex);
|
|
|
|
return tvBackendNode;
|
|
#endif // HAVE_TIMVX
|
|
return Ptr<BackendNode>();
|
|
}
|
|
|
|
class ParallelConv : public cv::ParallelLoopBody
|
|
{
|
|
public:
|
|
enum { BLK_SIZE = 32, BLK_SIZE_CN = 64 };
|
|
|
|
const Mat* input_;
|
|
const Mat* weights_;
|
|
Mat* output_;
|
|
int outShape[4]; // used only for conv2d
|
|
std::vector<size_t> kernel_size, pads_begin, pads_end, strides, dilations;
|
|
int ngroups_, nstripes_;
|
|
std::vector<int> ofstab_;
|
|
const std::vector<int>* biasvec_;
|
|
const Mat* activLUT_;
|
|
const ActivationLayerInt8* activ_;
|
|
bool is1x1_;
|
|
bool useAVX2;
|
|
bool useAVX512;
|
|
int blk_size_cn;
|
|
int inpZp, outZp;
|
|
const std::vector<float>* multiplier;
|
|
|
|
ParallelConv()
|
|
: input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
|
|
biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false)
|
|
, blk_size_cn(0), inpZp(0), outZp(0), multiplier(0)
|
|
{}
|
|
|
|
static void run( const Mat& input, Mat& output, const Mat& weights, const std::vector<float>& multipliers,
|
|
const std::vector<int>& biasvec, const Mat& activLUT,
|
|
const std::vector<size_t>& kernel_size, const std::vector<size_t>& strides,
|
|
const std::vector<size_t>& pads_begin, const std::vector<size_t>& pads_end,
|
|
const std::vector<size_t>& dilations,
|
|
const ActivationLayerInt8* activ, int ngroups, int nstripes, int inp_Zp, int out_Zp)
|
|
{
|
|
size_t karea = std::accumulate(kernel_size.begin(), kernel_size.end(),
|
|
1, std::multiplies<size_t>());
|
|
bool isConv1D = input.dims == 3;
|
|
bool isConv2D = input.dims == 4;
|
|
bool isConv3D = input.dims == 5;
|
|
CV_CheckEQ(static_cast<int>(kernel_size.size()), input.dims - 2, "");
|
|
CV_Assert_N(input.dims == output.dims,
|
|
input.size[0] == output.size[0],
|
|
weights.rows == output.size[1],
|
|
weights.cols == (input.size[1]/ngroups)*karea,
|
|
input.type() == CV_8SC1,
|
|
output.type() == CV_32SC1,
|
|
input.type() == weights.type(),
|
|
input.isContinuous(),
|
|
output.isContinuous(),
|
|
biasvec.size() == (size_t)output.size[1]+2);
|
|
CV_Check(weights.step1(), weights.step1() % VEC_ALIGN == 0, "");
|
|
ParallelConv p;
|
|
|
|
p.input_ = &input;
|
|
p.weights_ = &weights;
|
|
p.output_ = &output;
|
|
int max_ind = isConv1D? 3: 4;
|
|
for( int i = 0; i < max_ind; i++ ) p.outShape[i] = output.size[i];
|
|
p.outShape[1] /= ngroups;
|
|
|
|
p.kernel_size = kernel_size; p.strides = strides; p.dilations = dilations;
|
|
p.pads_begin = pads_begin; p.pads_end = pads_end;
|
|
|
|
p.ngroups_ = ngroups;
|
|
p.nstripes_ = nstripes;
|
|
|
|
int inpCnAll = input.size[1];
|
|
int depth = (input.dims == 5) ? input.size[2] : 1;
|
|
int width = input.size[input.dims - 1];
|
|
int height = isConv1D? 1 : input.size[input.dims - 2];
|
|
int inpCn = inpCnAll / ngroups;
|
|
|
|
p.is1x1_ = (isConv2D && kernel_size[0] == 1 && kernel_size[1] == 1 &&
|
|
pads_begin[0] == 0 && pads_begin[1] == 0) ||
|
|
(isConv1D && pads_begin[0] == 0 && kernel_size[0] == 1);
|
|
|
|
p.useAVX2 = checkHardwareSupport(CPU_AVX2) && isConv2D;
|
|
p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D;
|
|
|
|
int kernel_d = isConv3D? kernel_size[0] : 1;
|
|
int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
|
|
int kernel_w = kernel_size.back();
|
|
|
|
int blk_size_cn0 = cvCeil(1600./(kernel_w*kernel_h));
|
|
int ncn = 32;
|
|
while (ncn*2 < blk_size_cn0 && ncn < inpCn)
|
|
ncn *= 2;
|
|
ncn = std::min(ncn, inpCn);
|
|
p.blk_size_cn = ncn;
|
|
|
|
int dil_d = isConv3D? dilations[0] : 1;
|
|
int dil_h = isConv1D? 1 : dilations[dilations.size() - 2];
|
|
int dil_w = dilations.back();
|
|
|
|
p.inpZp = inp_Zp;
|
|
p.outZp = out_Zp;
|
|
p.multiplier = &multipliers;
|
|
|
|
p.ofstab_.resize(karea * ncn);
|
|
int* ofstab = &p.ofstab_[0];
|
|
|
|
if (isConv1D)
|
|
{
|
|
for( int k = 0; k < ncn; k++ )
|
|
for( int k_c = 0; k_c < kernel_w; k_c++ )
|
|
ofstab[k*kernel_w + k_c] = k*width + k_c*dil_w;
|
|
}
|
|
else if (isConv2D)
|
|
{
|
|
for( int k = 0; k < ncn; k++ )
|
|
for( int k_r = 0; k_r < kernel_h; k_r++ )
|
|
for( int k_c = 0; k_c < kernel_w; k_c++ )
|
|
ofstab[(k*kernel_h + k_r)*kernel_w + k_c] =
|
|
(k*height + k_r*dil_h)*width + k_c*dil_w;
|
|
}
|
|
else
|
|
{
|
|
for( int k = 0; k < ncn; k++ )
|
|
for (int k_d = 0; k_d < kernel_d; k_d++)
|
|
for( int k_r = 0; k_r < kernel_h; k_r++ )
|
|
for( int k_c = 0; k_c < kernel_w; k_c++ )
|
|
ofstab[(k*kernel_d*kernel_h + k_d*kernel_h + k_r)*kernel_w + k_c] =
|
|
(k*depth*height + k_d*dil_d*height + k_r*dil_h)*width + k_c*dil_w;
|
|
}
|
|
|
|
p.biasvec_ = &biasvec;
|
|
p.activLUT_ = &activLUT;
|
|
p.activ_ = !activLUT.empty() ? activ : 0;
|
|
|
|
parallel_for_(Range(0, nstripes), p, nstripes);
|
|
}
|
|
|
|
virtual void operator ()(const Range &r0) const CV_OVERRIDE
|
|
{
|
|
const int valign = ConvolutionLayerInt8Impl::VEC_ALIGN;
|
|
int ngroups = ngroups_, batchSize = input_->size[0]*ngroups;
|
|
bool isConv1D = input_->dims == 3;
|
|
bool isConv2D = input_->dims == 4;
|
|
bool isConv3D = input_->dims == 5;
|
|
|
|
int outW = output_->size[output_->dims - 1];
|
|
int outH = isConv1D? 1 : output_->size[output_->dims - 2];
|
|
int outCn = output_->size[1]/ngroups;
|
|
|
|
int depth = isConv3D? input_->size[2] : 1;
|
|
int height = isConv1D? 1 : input_->size[input_->dims - 2];
|
|
int width = input_->size[input_->dims - 1];
|
|
int inpCn = input_->size[1]/ngroups;
|
|
|
|
const int nstripes = nstripes_;
|
|
|
|
int kernel_d = isConv3D? kernel_size[0] : 1;
|
|
int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
|
|
int kernel_w = kernel_size.back();
|
|
int karea = kernel_w*kernel_h*kernel_d;
|
|
|
|
int pad_d = isConv3D? pads_begin[0] : 0;
|
|
int pad_t = isConv1D? 0 : pads_begin[pads_begin.size() - 2];
|
|
int pad_l = pads_begin.back();
|
|
|
|
int stride_d = isConv3D? strides[0] : 0;
|
|
int stride_h = isConv1D? 0 : strides[strides.size() - 2];
|
|
int stride_w = strides.back();
|
|
|
|
int dilation_d = isConv3D? dilations[0] : 1;
|
|
int dilation_h = isConv1D? 1 : dilations[dilations.size() - 2];
|
|
int dilation_w = dilations.back();
|
|
|
|
int i, j, k, d;
|
|
int inpPlaneSize = (int)input_->total(2);
|
|
int outPlaneSize = (int)output_->total(2);
|
|
bool is1x1 = is1x1_;
|
|
|
|
int stripesPerSample;
|
|
int stripeSize;
|
|
Range r = r0;
|
|
bool depthWiseConvolution = !is1x1 && isConv2D && ngroups > 1 && inpCn == 1 &&
|
|
outCn == 1 && kernel_d == 1 && dilation_d == 1 && stride_d == 0 && pad_d == 0 &&
|
|
width >= 16 + dilation_w*(kernel_w - 1);
|
|
// for now only 3x3 depth-wise convolutions are supported
|
|
depthWiseConvolution = depthWiseConvolution && kernel_w == 3 && kernel_h == 3 &&
|
|
// computing at most 1 pixel from each side can involve padding
|
|
max(stride_w, dilation_w) >= pad_l && max(stride_h, dilation_h) >= pad_t &&
|
|
pad_l <= 1 && pad_t <= 1;
|
|
|
|
if( !depthWiseConvolution && nstripes >= batchSize*2 )
|
|
{
|
|
stripesPerSample = nstripes/batchSize;
|
|
stripeSize = (int)alignSize((outPlaneSize + stripesPerSample - 1)/stripesPerSample, 8);
|
|
stripeSize = std::min(stripeSize, outPlaneSize);
|
|
}
|
|
else
|
|
{
|
|
stripesPerSample = 1;
|
|
int samplesPerStripe = std::max((batchSize + nstripes - 1)/nstripes, 1);
|
|
r.start *= samplesPerStripe;
|
|
r.end *= samplesPerStripe;
|
|
stripeSize = outPlaneSize;
|
|
}
|
|
|
|
const int8_t* data_inp0_ = input_->ptr<int8_t>();
|
|
const int* ofstab = &ofstab_[0];
|
|
const int8_t* wptr_orig_ = weights_->ptr<int8_t>();
|
|
size_t wstep = weights_->step1();
|
|
const int* biasptr_ = &biasvec_->at(0);
|
|
const float* multptr_ = &multiplier->at(0);
|
|
const int* lutptr_ = !activLUT_->empty() ? activLUT_->ptr<int>() : 0;
|
|
int* data_out0_ = output_->ptr<int>();
|
|
AutoBuffer<int8_t> rowbuf0_;
|
|
int8_t* rowbuf0 = 0;
|
|
bool use_rowbuf = !depthWiseConvolution;
|
|
int blk_size = depthWiseConvolution ? outPlaneSize : min((int)BLK_SIZE, stripeSize);
|
|
|
|
// im2row buffer is not used for depth-wise convolution
|
|
if(use_rowbuf)
|
|
{
|
|
size_t rowbufsz = alignSize(karea*blk_size_cn, valign)*min((int)BLK_SIZE, blk_size);
|
|
//printf("karea=%d, blk_size_cn=%d, rowbufsz=%d, stripeSize=%d\n", karea, blk_size_cn, (int)rowbufsz, stripeSize);
|
|
rowbuf0_.allocate(rowbufsz + valign);
|
|
rowbuf0 = alignPtr(rowbuf0_.data(), (int)(valign*sizeof(int8_t)));
|
|
// we clear the buffer once; ultimately, it lets us to avoid
|
|
// tail processing after running the unrolled/vectorized loop.
|
|
// the main idea is to make sure that the tail (a.k.a. padding) of each row
|
|
// (i.e. the elements with indices between vsz=karea*ncn and vsz_a)
|
|
// does not contain NaNs or Infs. Because the padding in the weights
|
|
// matrix is explicitly initialized with 0's, we handle all other
|
|
// cases nicely, i.e. we can skip expliciting re-initialization
|
|
// of the padding - we just retain elements from the previous iteration
|
|
// of the loop over channels (cn0).
|
|
memset(rowbuf0, (int8_t)inpZp, rowbufsz*sizeof(rowbuf0[0]) );
|
|
}
|
|
|
|
for( int stripe = r.start; stripe < r.end; stripe++ )
|
|
{
|
|
int subsampleIdx = stripe/stripesPerSample;
|
|
if( subsampleIdx >= batchSize )
|
|
break;
|
|
int stripeStart = (int)((stripe - subsampleIdx*stripesPerSample)*stripeSize);
|
|
int stripeEnd = (int)std::min(stripeStart + stripeSize, outPlaneSize);
|
|
const int8_t* data_inp0 = data_inp0_ + subsampleIdx*inpPlaneSize*inpCn;
|
|
int* data_out0 = data_out0_ + subsampleIdx*outPlaneSize*outCn;
|
|
int startOutCn = (subsampleIdx % ngroups)*outCn;
|
|
const int8_t* wptr_orig = wptr_orig_ + wstep*startOutCn;
|
|
const int* biasptr = biasptr_ + startOutCn;
|
|
const float* multptr = multptr_ + startOutCn;
|
|
|
|
for( int cn0 = 0; cn0 < inpCn; cn0 += blk_size_cn )
|
|
{
|
|
int cn1 = std::min(cn0 + blk_size_cn, inpCn);
|
|
int ncn = cn1 - cn0, vsz = karea*ncn;
|
|
int vsz_a = (int)alignSize(vsz, valign);
|
|
const int8_t* wptr = wptr_orig + cn0*karea;
|
|
|
|
for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += blk_size )
|
|
{
|
|
int ofs, ofs1 = std::min(ofs0 + blk_size, stripeEnd);
|
|
int bsz = ofs1 - ofs0;
|
|
|
|
int out_d = ofs0 / (outH * outW);
|
|
int out_i = (ofs0 - out_d * outH * outW) / outW;
|
|
int out_j = ofs0 % outW;
|
|
|
|
if (depthWiseConvolution)
|
|
{
|
|
CV_Assert(out_i == 0 && out_j == 0);
|
|
int in_d = out_d * stride_d - pad_d;
|
|
const int8_t* inptr_ = data_inp0 + (cn0*depth*height + in_d*height)*width;
|
|
int* outptr_ = data_out0 + ofs0;
|
|
|
|
#if CV_TRY_AVX2
|
|
if(useAVX2)
|
|
opt_AVX2::fastDepthwiseConv(wptr, kernel_h, kernel_w,
|
|
stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
|
|
biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
|
|
else
|
|
#endif
|
|
{
|
|
const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
|
|
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
|
|
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
|
|
int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
|
|
int bias = biasptr[out_d], biasCopy;
|
|
float mult = multptr[out_d];
|
|
|
|
for (int out_i = 0; out_i < outH; out_i++)
|
|
{
|
|
int in_i = out_i * stride_h - pad_t, out_j = 0;
|
|
const int8_t* imgptr0 = inptr_ + in_i*width;
|
|
const int8_t* imgptr1 = imgptr0 + dilation_h*width;
|
|
const int8_t* imgptr2 = imgptr0 + (dilation_h*2)*width;
|
|
int8_t w00 = w00_, w01 = w01_, w02 = w02_;
|
|
int8_t w20 = w20_, w21 = w21_, w22 = w22_;
|
|
int out, out1;
|
|
// Bias has a fused offset component. bias = bias_quantized - input_zeropoint*sum_of_weights.
|
|
// In some cases below, certain weights are not used for convolution or set to zero.
|
|
// So we create a copy of bias at the start and remove the weight's components as necessary.
|
|
biasCopy = bias;
|
|
|
|
if (in_i < 0)
|
|
{
|
|
biasCopy += inpZp * (w00 + w01 + w02);
|
|
w00 = w01 = w02 = 0;
|
|
imgptr0 = imgptr1;
|
|
}
|
|
else if (in_i + dilation_h*(kernel_h-1) >= height)
|
|
{
|
|
biasCopy += inpZp * (w20 + w21 + w22);
|
|
w20 = w21 = w22 = 0;
|
|
imgptr2 = imgptr1;
|
|
}
|
|
int* outptr = outptr_ + out_i*outW;
|
|
if (pad_l > 0)
|
|
{
|
|
out = (int)imgptr0[0]*w01 + (int)imgptr0[dilation_w]*w02 +
|
|
(int)imgptr1[0]*w11 + (int)imgptr1[dilation_w]*w12 +
|
|
(int)imgptr2[0]*w21 + (int)imgptr2[dilation_w]*w22 +
|
|
biasCopy + inpZp*(w00 + w10 + w20);
|
|
out1 = outZp + (int)std::round(out*mult);
|
|
outptr[0] = std::min(std::max(out1, -128), 127);
|
|
out_j = 1;
|
|
}
|
|
#if CV_SIMD
|
|
if( stride_w == 1 )
|
|
{
|
|
const int out_delta = 16;
|
|
v_int8x16 vw00 = v_setall_s8(w00), vw01 = v_setall_s8(w01), vw02 = v_setall_s8(w02),
|
|
vw10 = v_setall_s8(w10), vw11 = v_setall_s8(w11), vw12 = v_setall_s8(w12),
|
|
vw20 = v_setall_s8(w20), vw21 = v_setall_s8(w21), vw22 = v_setall_s8(w22);
|
|
v_int32x4 vout0, vout1, vout2, vout3, vbias = v_setall_s32(biasCopy), voutzp = v_setall_s32(outZp),
|
|
outmin = v_setall_s32(-128), outmax = v_setall_s32(127);
|
|
v_float32x4 vmult = v_setall_f32(mult);
|
|
for( ; out_j < outW1; out_j += out_delta )
|
|
{
|
|
if (out_j + out_delta > outW1)
|
|
{
|
|
if (out_j <= pad_l)
|
|
break;
|
|
out_j = outW1 - out_delta;
|
|
}
|
|
int in_j = out_j * stride_w - pad_l;
|
|
v_int8x16 v00 = v_load(imgptr0 + in_j),
|
|
v01 = v_load(imgptr0 + in_j + dilation_w),
|
|
v02 = v_load(imgptr0 + in_j + dilation_w*2),
|
|
v10 = v_load(imgptr1 + in_j),
|
|
v11 = v_load(imgptr1 + in_j + dilation_w),
|
|
v12 = v_load(imgptr1 + in_j + dilation_w*2),
|
|
v20 = v_load(imgptr2 + in_j),
|
|
v21 = v_load(imgptr2 + in_j + dilation_w),
|
|
v22 = v_load(imgptr2 + in_j + dilation_w*2);
|
|
|
|
vout0 = vout1 = vout2 = vout3 = vbias;
|
|
v_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3);
|
|
v_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3);
|
|
v_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3);
|
|
v_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3);
|
|
v_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3);
|
|
v_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3);
|
|
v_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3);
|
|
v_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3);
|
|
v_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3);
|
|
|
|
vout0 = voutzp + v_round(v_cvt_f32(vout0)*vmult);
|
|
vout1 = voutzp + v_round(v_cvt_f32(vout1)*vmult);
|
|
vout2 = voutzp + v_round(v_cvt_f32(vout2)*vmult);
|
|
vout3 = voutzp + v_round(v_cvt_f32(vout3)*vmult);
|
|
|
|
vout0 = v_min(v_max(vout0, outmin), outmax);
|
|
vout1 = v_min(v_max(vout1, outmin), outmax);
|
|
vout2 = v_min(v_max(vout2, outmin), outmax);
|
|
vout3 = v_min(v_max(vout3, outmin), outmax);
|
|
|
|
v_store(outptr + out_j, vout0);
|
|
v_store(outptr + out_j + 4, vout1);
|
|
v_store(outptr + out_j + 8, vout2);
|
|
v_store(outptr + out_j + 12, vout3);
|
|
}
|
|
}
|
|
#endif
|
|
for (; out_j < outW1; out_j++)
|
|
{
|
|
int in_j = out_j * stride_w - pad_l;
|
|
out = (int)imgptr0[in_j]*w00 + (int)imgptr0[in_j + dilation_w]*w01 + (int)imgptr0[in_j + dilation_w*2]*w02 +
|
|
(int)imgptr1[in_j]*w10 + (int)imgptr1[in_j + dilation_w]*w11 + (int)imgptr1[in_j + dilation_w*2]*w12 +
|
|
(int)imgptr2[in_j]*w20 + (int)imgptr2[in_j + dilation_w]*w21 + (int)imgptr2[in_j + dilation_w*2]*w22 + biasCopy;
|
|
out1 = outZp + (int)std::round(out*mult);
|
|
outptr[out_j] = std::min(std::max(out1, -128), 127);
|
|
}
|
|
|
|
for (; out_j < outW; out_j++ )
|
|
{
|
|
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
|
|
int s0 = 1, s1 = 1, s2 = 1;
|
|
if (in_j0 >= width)
|
|
{
|
|
in_j0 = 0;
|
|
s0 = 0;
|
|
biasCopy += inpZp*(w00 + w10 + w20);
|
|
}
|
|
if (in_j1 >= width)
|
|
{
|
|
in_j1 = 0;
|
|
s1 = 0;
|
|
biasCopy += inpZp*(w01 + w11 + w21);
|
|
}
|
|
if (in_j2 >= width)
|
|
{
|
|
in_j2 = 0;
|
|
s2 = 0;
|
|
biasCopy += inpZp*(w02 + w12 + w22);
|
|
}
|
|
out = (int)imgptr0[in_j0]*w00*s0 + (int)imgptr0[in_j1]*w01*s1 + (int)imgptr0[in_j2]*w02*s2 +
|
|
(int)imgptr1[in_j0]*w10*s0 + (int)imgptr1[in_j1]*w11*s1 + (int)imgptr1[in_j2]*w12*s2 +
|
|
(int)imgptr2[in_j0]*w20*s0 + (int)imgptr2[in_j1]*w21*s1 + (int)imgptr2[in_j2]*w22*s2 + biasCopy;
|
|
out1 = outZp + (int)std::round(out*mult);
|
|
outptr[out_j] = std::min(std::max(out1, -128), 127);
|
|
}
|
|
}
|
|
}
|
|
continue;
|
|
}
|
|
// do im2row for a part of input tensor
|
|
int8_t* rowbuf = rowbuf0;
|
|
|
|
if (isConv1D)
|
|
{
|
|
for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i )
|
|
{
|
|
int delta = std::min(ofs1 - ofs, outW - out_j);
|
|
int out_j1 = out_j + delta;
|
|
|
|
int in_j = out_j * stride_w - pad_l;
|
|
const int8_t* imgptr = data_inp0 + cn0*width + in_j;
|
|
ofs += delta;
|
|
|
|
// do im2row for a part of input tensor
|
|
if( is1x1 )
|
|
{
|
|
for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w )
|
|
{
|
|
for( k = 0; k < vsz; k++ )
|
|
rowbuf[k] = imgptr[k*inpPlaneSize];
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w )
|
|
{
|
|
// this condition should be true for most of the tensor elements, i.e.
|
|
// most of the time the kernel aperture is inside the tensor X-Y plane.
|
|
if( out_j + 2 <= out_j1 && 0 <= in_j && in_j + stride_w*2 <= width - (kernel_w-1)*dilation_w )
|
|
{
|
|
for( k = 0; k < vsz; k++ )
|
|
{
|
|
int k1 = ofstab[k];
|
|
int8_t v0 = imgptr[k1];
|
|
int8_t v1 = imgptr[k1 + stride_w];
|
|
rowbuf[k] = v0;
|
|
rowbuf[k+vsz_a] = v1;
|
|
}
|
|
out_j++;
|
|
rowbuf += vsz_a;
|
|
imgptr += stride_w;
|
|
in_j += stride_w;
|
|
}
|
|
else
|
|
{
|
|
int i0 = std::max(0, (-in_j + dilation_w-1)/dilation_w);
|
|
int i1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w);
|
|
|
|
// here some non-continuous sub-row of the row will not be
|
|
// filled from the tensor; we need to make sure that the uncovered
|
|
// elements are explicitly set to 0's. the easiest way is to
|
|
// set all the elements to 0's before the loop.
|
|
memset(rowbuf, (int8_t)inpZp, vsz*sizeof(rowbuf[0]));
|
|
for( k = 0; k < ncn; k++ )
|
|
{
|
|
for( i = i0; i < i1; i++ )
|
|
{
|
|
int imgofs = k*width + i*dilation_w;
|
|
rowbuf[k*kernel_w + i] = imgptr[imgofs];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else if (isConv2D)
|
|
{
|
|
if( is1x1 && stride_w == 1 && stride_h == 1 )
|
|
{
|
|
const int8_t* imgptr = data_inp0 + (cn0*height + out_i)*width + out_j;
|
|
for( int j = 0; j < bsz; j++, rowbuf += vsz_a )
|
|
{
|
|
if( j + 4 <= bsz )
|
|
{
|
|
k = 0;
|
|
for( ; k < vsz; k++ )
|
|
{
|
|
const int8_t* inp = imgptr + j + k*inpPlaneSize;
|
|
int8_t v0 = inp[0], v1 = inp[1], v2 = inp[2], v3 = inp[3];
|
|
rowbuf[k] = v0;
|
|
rowbuf[k + vsz_a] = v1;
|
|
rowbuf[k + vsz_a*2] = v2;
|
|
rowbuf[k + vsz_a*3] = v3;
|
|
}
|
|
j += 3;
|
|
rowbuf += vsz_a*3;
|
|
}
|
|
else
|
|
{
|
|
for( k = 0; k < vsz; k++ )
|
|
{
|
|
rowbuf[k] = imgptr[j + k*inpPlaneSize];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else
|
|
for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i )
|
|
{
|
|
int delta = std::min(ofs1 - ofs, outW - out_j);
|
|
int out_j1 = out_j + delta;
|
|
|
|
int in_i = out_i * stride_h - pad_t;
|
|
int in_j = out_j * stride_w - pad_l;
|
|
const int8_t* imgptr = data_inp0 + (cn0*height + in_i)*width + in_j;
|
|
ofs += delta;
|
|
|
|
// do im2row for a part of input tensor
|
|
if( is1x1 )
|
|
{
|
|
for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w )
|
|
{
|
|
for( k = 0; k < vsz; k++ )
|
|
rowbuf[k] = imgptr[k*inpPlaneSize];
|
|
}
|
|
}
|
|
else
|
|
{
|
|
bool ok_i = 0 <= in_i && in_i < height - (kernel_h-1)*dilation_h;
|
|
int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h);
|
|
int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h);
|
|
|
|
for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w )
|
|
{
|
|
// this condition should be true for most of the tensor elements, i.e.
|
|
// most of the time the kernel aperture is inside the tensor X-Y plane.
|
|
if( ok_i && out_j + 2 <= out_j1 && 0 <= in_j && in_j + stride_w*2 <= width - (kernel_w-1)*dilation_w )
|
|
{
|
|
for( k = 0; k < vsz; k++ )
|
|
{
|
|
int k1 = ofstab[k];
|
|
int8_t v0 = imgptr[k1];
|
|
int8_t v1 = imgptr[k1 + stride_w];
|
|
rowbuf[k] = v0;
|
|
rowbuf[k+vsz_a] = v1;
|
|
}
|
|
out_j++;
|
|
rowbuf += vsz_a;
|
|
imgptr += stride_w;
|
|
in_j += stride_w;
|
|
}
|
|
else
|
|
{
|
|
int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w);
|
|
int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w);
|
|
|
|
// here some non-continuous sub-row of the row will not be
|
|
// filled from the tensor; we need to make sure that the uncovered
|
|
// elements are explicitly set to 0's. the easiest way is to
|
|
// set all the elements to 0's before the loop.
|
|
memset(rowbuf, (int8_t)inpZp, vsz*sizeof(rowbuf[0]));
|
|
for( k = 0; k < ncn; k++ )
|
|
{
|
|
for( i = i0; i < i1; i++ )
|
|
{
|
|
for( j = j0; j < j1; j++ )
|
|
{
|
|
int imgofs = k*(width*height) + i*(dilation_h*width) + j*dilation_w;
|
|
rowbuf[(k*kernel_h + i)*kernel_w + j] = imgptr[imgofs];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for( ofs = ofs0; ofs < ofs1; out_d += (out_i + 1) / outH, out_i = (out_i + 1) % outH, out_j = 0 )
|
|
{
|
|
int delta = std::min(ofs1 - ofs, outW - out_j);
|
|
int out_j1 = out_j + delta;
|
|
|
|
int in_d = out_d * stride_d - pad_d;
|
|
int in_i = out_i * stride_h - pad_t;
|
|
int in_j = out_j * stride_w - pad_l;
|
|
const int8_t* imgptr = data_inp0 + (cn0*depth*height + in_d*height + in_i)*width + in_j;
|
|
ofs += delta;
|
|
|
|
int d0 = std::max(0, (-in_d + dilation_d - 1) / dilation_d);
|
|
int d1 = std::min(kernel_d, (depth - in_d + dilation_d - 1) / dilation_d);
|
|
|
|
int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h);
|
|
int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h);
|
|
|
|
for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w )
|
|
{
|
|
int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w);
|
|
int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w);
|
|
|
|
// here some non-continuous sub-row of the row will not be
|
|
// filled from the tensor; we need to make sure that the uncovered
|
|
// elements are explicitly set to 0's. the easiest way is to
|
|
// set all the elements to 0's before the loop.
|
|
memset(rowbuf, (int8_t)inpZp, vsz*sizeof(rowbuf[0]));
|
|
for( k = 0; k < ncn; k++ )
|
|
{
|
|
for ( d = d0; d < d1; d++)
|
|
{
|
|
for( i = i0; i < i1; i++ )
|
|
{
|
|
for( j = j0; j < j1; j++ )
|
|
{
|
|
int imgofs = k*(depth*width*height) + d*dilation_d*width*height + i*(dilation_h*width) + j*dilation_w;
|
|
rowbuf[(k*kernel_d*kernel_h + d*kernel_h + i)*kernel_w + j] = imgptr[imgofs];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// now compute dot product of the weights
|
|
// and im2row-transformed part of the tensor
|
|
#if CV_TRY_AVX512_SKX
|
|
if(useAVX512)
|
|
opt_AVX2::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
|
|
outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
|
|
else
|
|
#endif
|
|
#if CV_TRY_AVX2
|
|
if(useAVX2)
|
|
opt_AVX2::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
|
|
outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
|
|
else
|
|
#endif
|
|
for( int i = 0; i < outCn; i += 2 )
|
|
{
|
|
const int8_t* wptr0 = wptr + i*wstep;
|
|
const int8_t* wptr1 = wptr0 + wstep;
|
|
int* outptr0 = data_out0 + ofs0 + i*outPlaneSize;
|
|
int* outptr1 = outptr0 + outPlaneSize;
|
|
int bias0 = biasptr[i], bias1 = biasptr[i+1];
|
|
float mult0 = multptr[i], mult1 = multptr[i+1];
|
|
|
|
if( i+1 >= outCn )
|
|
{
|
|
wptr1 = wptr0;
|
|
outptr1 = outptr0;
|
|
bias1 = bias0;
|
|
mult1 = mult0;
|
|
}
|
|
int j = 0;
|
|
#if CV_SIMD128
|
|
v_int32x4 voutzp = v_setall_s32(outZp), outmin = v_setall_s32(-128), outmax = v_setall_s32(127);
|
|
v_float32x4 vmult0 = v_setall_f32(mult0), vmult1 = v_setall_f32(mult1);
|
|
for( ; j <= bsz - 4; j += 4 )
|
|
{
|
|
const int8_t* rptr = rowbuf0 + j*vsz_a;
|
|
v_int32x4 s0, s1;
|
|
|
|
if( cn0 == 0 )
|
|
{
|
|
s0 = v_setall_s32(bias0);
|
|
s1 = v_setall_s32(bias1);
|
|
}
|
|
else
|
|
{
|
|
s0 = v_load(outptr0 + j);
|
|
s1 = v_load(outptr1 + j);
|
|
}
|
|
|
|
v_int32x4 vs00 = v_setzero_s32(), vs01 = v_setzero_s32(),
|
|
vs02 = v_setzero_s32(), vs03 = v_setzero_s32(),
|
|
vs10 = v_setzero_s32(), vs11 = v_setzero_s32(),
|
|
vs12 = v_setzero_s32(), vs13 = v_setzero_s32();
|
|
for( k = 0; k < vsz; k += 16, rptr += 16 )
|
|
{
|
|
v_int8x16 w0 = v_load_aligned(wptr0 + k);
|
|
v_int8x16 w1 = v_load_aligned(wptr1 + k);
|
|
v_int8x16 r0 = v_load_aligned(rptr);
|
|
v_int8x16 r1 = v_load_aligned(rptr + vsz_a);
|
|
v_int8x16 r2 = v_load_aligned(rptr + vsz_a*2);
|
|
v_int8x16 r3 = v_load_aligned(rptr + vsz_a*3);
|
|
|
|
vs00 = v_dotprod_expand_fast(w0, r0, vs00);
|
|
vs01 = v_dotprod_expand_fast(w0, r1, vs01);
|
|
vs02 = v_dotprod_expand_fast(w0, r2, vs02);
|
|
vs03 = v_dotprod_expand_fast(w0, r3, vs03);
|
|
|
|
vs10 = v_dotprod_expand_fast(w1, r0, vs10);
|
|
vs11 = v_dotprod_expand_fast(w1, r1, vs11);
|
|
vs12 = v_dotprod_expand_fast(w1, r2, vs12);
|
|
vs13 = v_dotprod_expand_fast(w1, r3, vs13);
|
|
}
|
|
s0 += v_int32x4(v_reduce_sum(vs00), v_reduce_sum(vs01), v_reduce_sum(vs02), v_reduce_sum(vs03));
|
|
s1 += v_int32x4(v_reduce_sum(vs10), v_reduce_sum(vs11), v_reduce_sum(vs12), v_reduce_sum(vs13));
|
|
if( cn1 == inpCn )
|
|
{
|
|
s0 = voutzp + v_round(v_cvt_f32(s0)*vmult0);
|
|
s1 = voutzp + v_round(v_cvt_f32(s1)*vmult1);
|
|
|
|
s0 = v_min(v_max(s0, outmin), outmax);
|
|
s1 = v_min(v_max(s1, outmin), outmax);
|
|
}
|
|
v_store(outptr0 + j, s0);
|
|
v_store(outptr1 + j, s1);
|
|
}
|
|
#endif
|
|
for( ; j < bsz; j++ )
|
|
{
|
|
const int8_t* rptr = rowbuf0 + j*vsz_a;
|
|
int s00, s10;
|
|
|
|
if( cn0 == 0 )
|
|
{
|
|
s00 = bias0;
|
|
s10 = bias1;
|
|
}
|
|
else
|
|
{
|
|
s00 = outptr0[j];
|
|
s10 = outptr1[j];
|
|
}
|
|
|
|
for( k = 0; k < vsz; k++ )
|
|
{
|
|
int8_t r0 = rptr[k];
|
|
s00 += (int)wptr0[k] * r0;
|
|
s10 += (int)wptr1[k] * r0;
|
|
}
|
|
if( cn1 == inpCn )
|
|
{
|
|
int out0 = outZp + (int)std::round(s00*mult0);
|
|
int out1 = outZp + (int)std::round(s10*mult1);
|
|
|
|
s00 = std::min(std::max(out0, -128), 127);
|
|
s10 = std::min(std::max(out1, -128), 127);
|
|
}
|
|
|
|
outptr0[j] = s00;
|
|
outptr1[j] = s10;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if( activ_ )
|
|
activ_->forwardSlice(data_out0 + stripeStart, lutptr_,
|
|
data_out0 + stripeStart, (int)(stripeEnd - stripeStart),
|
|
outPlaneSize, startOutCn, startOutCn + outCn);
|
|
}
|
|
}
|
|
};
|
|
|
|
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
|
|
{
|
|
CV_TRACE_FUNCTION();
|
|
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
|
|
|
|
#if CV_SSE3
|
|
uint32_t ftzMode = _MM_GET_FLUSH_ZERO_MODE();
|
|
uint32_t dazMode = _MM_GET_DENORMALS_ZERO_MODE();
|
|
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
|
|
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
|
|
#endif
|
|
|
|
std::vector<Mat> inputs, outputs;
|
|
inputs_arr.getMatVector(inputs);
|
|
outputs_arr.getMatVector(outputs);
|
|
|
|
/*if (inputs[0].dims > 3) {
|
|
printf("conv %s: input (%d x %d x %d x %d), kernel (%d x %d), pad (%d x %d), stride (%d x %d), dilation (%d x %d)\n",
|
|
name.c_str(), inputs[0].size[0], inputs[0].size[1], inputs[0].size[2], inputs[0].size[3],
|
|
kernel.width, kernel.height, pad.width, pad.height,
|
|
stride.width, stride.height, dilation.width, dilation.height);
|
|
}
|
|
else {
|
|
printf("conv %s: input (%d x %d x %d), kernel (%d x %d), pad (%d x %d), stride (%d x %d), dilation (%d x %d)\n",
|
|
name.c_str(), inputs[0].size[0], inputs[0].size[1], inputs[0].size[2],
|
|
kernel.width, kernel.height, pad.width, pad.height,
|
|
stride.width, stride.height, dilation.width, dilation.height);
|
|
}*/
|
|
|
|
int inpGroupCn = blobs[0].size[1];
|
|
CV_Assert_N(inputs.size() == (size_t)1, inputs[0].size[1] % inpGroupCn == 0,
|
|
outputs.size() == 1, inputs[0].data != outputs[0].data);
|
|
|
|
int ngroups = inputs[0].size[1] / inpGroupCn;
|
|
CV_Assert(outputs[0].size[1] % ngroups == 0);
|
|
|
|
int nstripes = std::max(getNumThreads(), 1);
|
|
Mat outputInt32 = Mat(shape(outputs[0]), CV_32S);
|
|
|
|
ParallelConv::run(inputs[0], outputInt32, weightsMat, outputMultiplier, biasvec, activationLUT, kernel_size, strides,
|
|
pads_begin, pads_end, dilations, activ.get(), ngroups, nstripes, input_zp, output_zp);
|
|
|
|
outputInt32.convertTo(outputs[0], CV_8S);
|
|
|
|
#if CV_SSE3
|
|
_MM_SET_FLUSH_ZERO_MODE(ftzMode);
|
|
_MM_SET_DENORMALS_ZERO_MODE(dazMode);
|
|
#endif
|
|
}
|
|
|
|
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
|
|
const std::vector<MatShape> &outputs) const CV_OVERRIDE
|
|
{
|
|
CV_Assert(inputs.size() == outputs.size());
|
|
|
|
int64 flops = 0;
|
|
int karea = std::accumulate(kernel_size.begin(), kernel_size.end(), 1, std::multiplies<size_t>());
|
|
for (int i = 0; i < outputs.size(); i++)
|
|
{
|
|
flops += total(outputs[i])*(CV_BIG_INT(2)*karea*inputs[i][1] + 1);
|
|
}
|
|
return flops;
|
|
}
|
|
};
|
|
|
|
Ptr<BaseConvolutionLayer> ConvolutionLayerInt8::create(const LayerParams ¶ms)
|
|
{
|
|
return Ptr<BaseConvolutionLayer>(new ConvolutionLayerInt8Impl(params));
|
|
}
|
|
|
|
}
|
|
}
|