// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_timvx.hpp"
#include "../ie_ngraph.hpp"

#include <opencv2/dnn/shape_utils.hpp>

namespace cv
{
namespace dnn
{

class EltwiseLayerInt8Impl CV_FINAL : public EltwiseLayerInt8
{
public:
    enum EltwiseOp
    {
        PROD = 0,
        SUM = 1,
        MAX = 2
    } op;
    std::vector<float> coeffs;
    std::vector<int> zeropoints;
    std::vector<float> scales;

    int output_zp;
    float output_sc;

    enum OutputChannelsMode
    {
        ELTWISE_CHANNNELS_SAME = 0,              //!< number of channels from inputs must be the same and equal to output's number of channels
        ELTWISE_CHANNNELS_INPUT_0,               //!< number of channels from inputs may be different,
                                                 //!< output's number of channels is equal to number of channels of first input
                                                 //!< number of channels of other inputs should not be greater than number of channels of first input
        ELTWISE_CHANNNELS_INPUT_0_TRUNCATE,      //!< number of channels from inputs may be different,
                                                 //!< output's number of channels is equal to number of channels of first input
                                                 //!< there is no restriction on number of channels of other inputs,
                                                 //!< extra channels of other inputs are ignored
        ELTWISE_CHANNNELS_USE_MAX,               //!< number of channels from inputs may be different,
                                                 //!< output's number of channels is equal to maximal number of input channels
                                                 //!< @note supported operation: `SUM`
    } channelsModeInput;

    mutable OutputChannelsMode channelsMode;     //!< "optimized" channels mode (switch to ELTWISE_CHANNNELS_SAME if number of input channels are equal)
    mutable /*size_t*/int outputChannels;

    EltwiseLayerInt8Impl(const LayerParams& params)
        : outputChannels(0)
    {
        setParamsFrom(params);
        offset = params.get<float>("offset", 0.f);
        hasVecInput = false;
        op = SUM;
        if (params.has("operation"))
        {
            String operation = toLowerCase(params.get<String>("operation"));
            if (operation == "prod")
                op = PROD;
            else if (operation == "sum")
                op = SUM;
            else if (operation == "max")
                op = MAX;
            else
                CV_Error(cv::Error::StsBadArg, "Unknown operation type \"" + operation + "\"");
        }

        if (params.has("coeff"))
        {
            DictValue paramCoeff = params.get("coeff");
            int i, n = paramCoeff.size();
            coeffs.resize(n);
            for (i = 0; i < n; i++)
            {
                coeffs[i] = paramCoeff.get<float>(i);
            }
        }

        if (params.has("input_zeropoints"))
        {
            DictValue zp = params.get("input_zeropoints");
            int i, n = zp.size();
            zeropoints.resize(n);
            for (i = 0; i < n; i++)
            {
                zeropoints[i] = zp.get<int>(i);
            }
        }

        if (params.has("input_scales"))
        {
            DictValue sc = params.get("input_scales");
            int i, n = sc.size();
            scales.resize(n);
            for (i = 0; i < n; i++)
            {
                scales[i] = sc.get<float>(i);
            }
        }

        output_zp = params.get<int>("zeropoints");
        output_sc = params.get<float>("scales");

        channelsModeInput = ELTWISE_CHANNNELS_SAME;
        if (params.has("output_channels_mode"))
        {
            String v = toLowerCase(params.get<String>("output_channels_mode"));
            if (v == "same")
            {
                channelsModeInput = ELTWISE_CHANNNELS_SAME;
            }
            else if (v == "input_0")
            {
                channelsModeInput = ELTWISE_CHANNNELS_INPUT_0;
            }
            else if (v == "input_0_truncate")
            {
                channelsModeInput = ELTWISE_CHANNNELS_INPUT_0_TRUNCATE;
            }
            else if (v == "max_input_channels")
            {
                channelsModeInput = ELTWISE_CHANNNELS_USE_MAX;
                if (op != SUM)
                    CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") 'max' channels mode is limited to SUM operation only");
            }
            else
                CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") unknown channels mode: \"" + v + "\"");
        }
        channelsMode = channelsModeInput;

        // TODO Must have checks for other unknown options
    }
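    // Note on the quantized arithmetic below (illustrative summary, not extra API):
    // inputs and output are int8 tensors with per-tensor (scale, zero point) pairs.
    // For SUM the importer is expected to fold the input/output rescaling into
    // 'coeffs' and 'offset', so the CPU kernel only has to compute
    //     dst = saturate_cast<int8_t>(round(sum_i coeffs[i] * src_i + offset)).
    // For PROD the input zero points are subtracted explicitly before multiplying,
    // and MAX is computed directly on the int8 values without rescaling.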
    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        // For TimVX Backend, only ELTWISE_CHANNNELS_SAME is supported.
        if (backendId == DNN_BACKEND_TIMVX && haveTimVX())
            return channelsModeInput == ELTWISE_CHANNNELS_SAME;

        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() >= 2);
        CV_Assert(inputs[0].size() >= 2);
        CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
        CV_Assert(op == SUM || op == PROD || coeffs.size() == 0);

        int dims = inputs[0].size();
        // Number of channels in output shape is determined by the first input tensor.
        bool variableChannels = false;
        int numChannels = inputs[0][1];
        for (size_t i = 1; i < inputs.size(); i++)
        {
            CV_Assert(inputs[0][0] == inputs[i][0]);  // batch sizes are equal

            int input_channels = inputs[i][1];
            if (numChannels != input_channels)
                variableChannels = true;

            if (channelsModeInput == ELTWISE_CHANNNELS_SAME)
            {
                CV_Assert(numChannels == input_channels);
            }
            else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0)
            {
                CV_Assert(numChannels >= input_channels);
            }
            else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE)
            {
                // nothing to check
            }
            else if (channelsModeInput == ELTWISE_CHANNNELS_USE_MAX)
            {
                numChannels = std::max(numChannels, input_channels);
            }
            else
            {
                CV_Assert(0 && "Internal error");
            }
        }

        channelsMode = variableChannels ? channelsModeInput : ELTWISE_CHANNNELS_SAME;
        outputChannels = numChannels;

        outputs.assign(1, inputs[0]);
        outputs[0][1] = numChannels;

        if (dims > 2)
        {
            size_t vecIdx = 0;
            bool isVecFound = false;
            for (size_t i = 0; i < inputs.size(); i++)
            {
                bool allOnes = isAllOnes(inputs[i], 2, dims);
                if (!allOnes && !isVecFound)
                {
                    vecIdx = i;
                    isVecFound = true;
                }

                if (!allOnes && i != vecIdx)
                {
                    for (size_t j = 2; j < dims; j++)
                    {
                        CV_Assert(inputs[vecIdx][j] == inputs[i][j]);
                    }
                }
            }

            if (channelsModeInput == ELTWISE_CHANNNELS_SAME && isVecFound)
            {
                for (size_t j = 2; j < dims; j++)
                {
                    outputs[0][j] = inputs[vecIdx][j];
                }
            }
        }

        return false;
    }

    void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
    {
        std::vector<Mat> inputs;
        inputs_arr.getMatVector(inputs);

        for (size_t i = 0; i < inputs.size(); i++)
        {
            MatShape inpShape = shape(inputs[i].size);
            if (isAllOnes(inpShape, 2, inputs[i].dims))
            {
                hasVecInput = true;
                return;
            }
        }
    }
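    // TimVX path (sketch of the mapping implemented below): only the two-input,
    // same-channels case is exported. SUM whose rescaled coefficients are close to
    // {1, 1} becomes tim::vx::ops::Add, coefficients close to {1, -1} become
    // tim::vx::ops::Sub; PROD maps to Multiply and MAX to Maximum. Anything else
    // falls back to the OpenCV CPU implementation.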
    virtual Ptr<BackendNode> initTimVX(void* timVXInfo_,
                                       const std::vector<Ptr<BackendWrapper> > &inputsWrapper,
                                       const std::vector<Ptr<BackendWrapper> > &outputsWrapper,
                                       bool isLast) CV_OVERRIDE
    {
#ifdef HAVE_TIMVX
        // tvGraph Initialization.
        if (inputsWrapper.size() != 2)
            return Ptr<BackendNode>();

        auto timVxInfo = reinterpret_cast<TimVXInfo *>(timVXInfo_);
        CV_Assert(timVxInfo);
        Ptr<TimVXGraph> tvGraph = timVxInfo->getGraph();
        CV_Assert(tvGraph);
        Ptr<tim::vx::Graph> graph = tvGraph->graph;

        bool isSub = false;
        // TODO: support variable coeffs.
        if (op == SUM)
        {
            CV_Assert(coeffs.size() == scales.size());
            std::vector<float> originalCoeffs;

            for (int i = 0; i < coeffs.size(); i++)
            {
                originalCoeffs.push_back(coeffs[i] * output_sc / scales[i]);
            }

            float eps = std::numeric_limits<float>::epsilon();
            if (std::fabs(originalCoeffs[0] - 1.0f) <= eps * std::fabs(originalCoeffs[0] + 1.0f) &&
                std::fabs(originalCoeffs[1] + 1.0f) <= eps * std::fabs(originalCoeffs[1] - 1.0f))
            {
                // Sub, if coeffs = {1., -1.}, isSub = true.
                isSub = true;
            }
            else if (std::fabs(originalCoeffs[0] - 1.0f) <= eps * std::fabs(originalCoeffs[0] + 1.0f) &&
                     std::abs(originalCoeffs[1] - 1.0f) <= eps * std::abs(originalCoeffs[1] + 1.0f))
            {
                // Sum, if coeffs = {1., 1.}, isSub = false.
                isSub = false;
            }
            else
            {
                return Ptr<BackendNode>();
            }
        }

        std::vector<int> inputsIndex, outputsIndex;
        int input_index = -1, output_index = -1;
        CV_Assert(channelsModeInput == ELTWISE_CHANNNELS_SAME);

        // Input
        Ptr<TimVXBackendWrapper> inputWrapper;
        CV_Assert(!scales.empty() && !zeropoints.empty());

        for (int i = 0; i < inputsWrapper.size(); i++)
        {
            inputWrapper = inputsWrapper[i].dynamicCast<TimVXBackendWrapper>();

            if (inputWrapper->isTensor())
            {
                input_index = tvGraph->getTensorIndex(inputWrapper->getTensor());
                if (input_index == -1)
                {
                    // Copy To New inputWrapper
                    Mat tmp = inputWrapper->getMat();
                    inputWrapper = Ptr<TimVXBackendWrapper>(new TimVXBackendWrapper(tmp));
                }
            }

            if (!inputWrapper->isTensor())
            {
                Ptr<tim::vx::Quantization> tvInputQuant = Ptr<tim::vx::Quantization>(
                        new tim::vx::Quantization(tim::vx::QuantType::ASYMMETRIC, scales[i], zeropoints[i]));
                inputWrapper->createTensor(graph, tim::vx::TensorAttribute::INPUT, tvInputQuant);
                input_index = tvGraph->addWrapper(inputWrapper);
            }
            inputsIndex.push_back(input_index);
        }

        // Output
        CV_Assert(outputsWrapper.size() == 1);
        Ptr<TimVXBackendWrapper> outputWrapper = outputsWrapper[0].dynamicCast<TimVXBackendWrapper>();
        Ptr<tim::vx::Quantization> outputQuant = Ptr<tim::vx::Quantization>(
                new tim::vx::Quantization(tim::vx::QuantType::ASYMMETRIC, output_sc, output_zp));

        if (isLast)
        {
            auto shapeType = getShapeTypeFromMat(outputWrapper->getMat());

            // For Graph Output tensor, we need to set tensor shape before createTensor().
            outputWrapper->setTensorShape(shapeType);
            outputWrapper->createTensor(graph, tim::vx::TensorAttribute::OUTPUT, outputQuant);
        }
        else
        {
            outputWrapper->createTensor(graph, tim::vx::TensorAttribute::TRANSIENT, outputQuant);
        }
        output_index = tvGraph->addWrapper(outputWrapper);
        outputsIndex.push_back(output_index);

        std::shared_ptr<tim::vx::Operation> tvEltwise;

        switch (op) {
            case SUM:
                if (isSub)
                    tvEltwise = graph->CreateOperation<tim::vx::ops::Sub>();
                else
                    tvEltwise = graph->CreateOperation<tim::vx::ops::Add>();
                break;
            case PROD:
                tvEltwise = graph->CreateOperation<tim::vx::ops::Multiply>();
                break;
            case MAX:
                tvEltwise = graph->CreateOperation<tim::vx::ops::Maximum>();
                break;
            default:
                CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");
        }

        Ptr<TimVXBackendNode> tvBackendNode = new TimVXBackendNode(tvGraph, tvEltwise, inputsIndex, outputsIndex);

        return tvBackendNode;
#endif  // HAVE_TIMVX
        return Ptr<BackendNode>();
    }
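    // OpenVINO (nGraph) path: a sketch of the approach is to dequantize each int8
    // input node (the folded 'coeffs' play the role of the per-input scales, and
    // zero points are only applied for PROD), build the floating-point
    // Add/Multiply/Maximum on the dequantized nodes, and requantize the result
    // through ngraphQuantize using the layer's 'offset'.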
#ifdef HAVE_DNN_NGRAPH
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        CV_Assert(nodes.size() >= 2);
        std::vector<ov::Output<ov::Node>> ieInpNodes(nodes.size());
        for (size_t i = 0; i < nodes.size(); i++)
        {
            ieInpNodes[i] = nodes[i].dynamicCast<InfEngineNgraphNode>()->node;

            float input_sc = !coeffs.empty() ? coeffs[i] : 1.0f;
            float input_zp = op == PROD ? zeropoints[i] : 0.0f;
            ieInpNodes[i] = ngraphDequantize(ieInpNodes[i], input_sc, input_zp);
        }

        auto res = ieInpNodes[0];
        for (size_t i = 1; i < ieInpNodes.size(); i++)
        {
            switch (op)
            {
                case SUM:  res = std::make_shared<ov::op::v1::Add>(res, ieInpNodes[i]); break;
                case PROD: res = std::make_shared<ov::op::v1::Multiply>(res, ieInpNodes[i]); break;
                case MAX:  res = std::make_shared<ov::op::v1::Maximum>(res, ieInpNodes[i]); break;
                default: CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");
            }
        }

        res = ngraphQuantize(res, 1.0f, offset);

        return new InfEngineNgraphNode(res);
    }
#endif  // HAVE_DNN_NGRAPH
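    // CPU reference kernel. The work is split into stripes over batch * spatial
    // plane for parallel_for_; SUM and PROD accumulate into a float buffer and are
    // rounded back to int8 with 'offset' at the end, while MAX writes int8 directly.
    // When channel counts differ, inputs are pre-sorted by channel count so that
    // channels missing from an input are simply skipped.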
    class EltwiseInvoker : public ParallelLoopBody
    {
        EltwiseLayerInt8Impl& self;
        std::vector<const Mat*> srcs;
        std::vector<int> srcNumChannels;
        int nsrcs;
        Mat* dst;
        Mat* buf;
        std::vector<float> coeffs;
        std::vector<int> zeropoints;
        int nstripes;
        const Mat* activLUT;
        const ActivationLayerInt8* activ;
        int channels;
        size_t planeSize;
        float offset;

        EltwiseInvoker(EltwiseLayerInt8Impl& self_)
            : self(self_)
            , nsrcs(0), dst(0), buf(0), nstripes(0), activLUT(0), activ(0), channels(0)
            , planeSize(0), offset(0)
        {}

    public:
        static void run(EltwiseLayerInt8Impl& self,
                        const Mat* srcs, int nsrcs, Mat& buf, Mat& dst,
                        int nstripes, float offset)
        {
            const EltwiseOp op = self.op;
            CV_Check(dst.dims, 1 < dst.dims && dst.dims <= 5, "");
            CV_CheckTypeEQ(dst.type(), CV_8SC1, "");
            CV_Assert(dst.isContinuous());
            CV_Assert(self.coeffs.empty() || self.coeffs.size() == (size_t)nsrcs);
            CV_CheckGE(nsrcs, 2, "");

            CV_Assert(self.outputChannels == dst.size[1]);

            EltwiseInvoker p(self);
            p.srcs.resize(nsrcs);
            p.srcNumChannels.resize(nsrcs);
            p.coeffs = self.coeffs;  // can be sorted
            p.zeropoints = self.zeropoints;

            bool sortInputs = false;
            for( int i = 0; i < nsrcs; i++ )
            {
                p.srcs[i] = &srcs[i];
                CV_CheckEQ(srcs[i].dims, dst.dims, "");
                CV_Assert(srcs[i].isContinuous());
                CV_Assert(srcs[i].type() == dst.type());
                p.srcNumChannels[i] = (srcs[i].dims >= 4) ? srcs[i].size[1] : 1;

                if (self.channelsMode == ELTWISE_CHANNNELS_SAME)
                {
                    CV_Assert(srcs[i].size == dst.size);
                }
                else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0)
                {
                    if (i == 0)
                        CV_Assert(srcs[0].size == dst.size);
                    CV_Assert(self.outputChannels >= p.srcNumChannels[i]);
                    sortInputs = true;
                }
                else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE)
                {
                    if (i == 0)
                        CV_Assert(srcs[0].size == dst.size);
                    sortInputs = true;
                }
                else if (self.channelsMode == ELTWISE_CHANNNELS_USE_MAX)
                {
                    CV_Assert(op == SUM);
                    CV_Assert(self.outputChannels >= p.srcNumChannels[i]);
                    sortInputs = true;
                }
                else
                {
                    CV_Assert(0 && "Internal error");
                }

                if (sortInputs)
                {
                    // Sort srcs and coefficients in the desc order by number of channels
                    for (int j = i; j >= 1; j--)
                    {
                        if (std::min(self.outputChannels, p.srcs[j - 1]->size[1]) < std::min(self.outputChannels, p.srcs[j]->size[1]))
                        {
                            std::swap(p.srcs[j - 1], p.srcs[j]);
                            std::swap(p.srcNumChannels[j - 1], p.srcNumChannels[j]);
                            if (!p.coeffs.empty())
                                std::swap(p.coeffs[j - 1], p.coeffs[j]);
                            if (!p.zeropoints.empty())
                                std::swap(p.zeropoints[j - 1], p.zeropoints[j]);
                        }
                        else
                            break;
                    }
                }
            }

            p.nsrcs = nsrcs;
            p.dst = &dst;
            p.buf = &buf;
            p.nstripes = nstripes;
            p.offset = offset;
            p.channels = (dst.dims >= 4 ? dst.size[1] : 1);

            p.planeSize = dst.total(dst.dims >= 4 ? 2 : 1);
            CV_CheckEQ(dst.total(), dst.size[0] * p.channels * p.planeSize, "");
            p.activLUT = &self.activationLUT;
            p.activ = !self.activationLUT.empty() ? self.activ.get() : 0;

            parallel_for_(Range(0, nstripes), p, nstripes);
        }

        void operator()(const Range& r) const CV_OVERRIDE
        {
            const EltwiseOp op = self.op;
            size_t total = dst->size[0]*planeSize;
            size_t stripeSize = (total + nstripes - 1)/nstripes;
            size_t stripeStart = r.start*stripeSize;
            size_t stripeEnd = std::min(r.end*stripeSize, total);
            const float* coeffsptr = !coeffs.empty() ? &coeffs[0] : 0;
            const int* zeropointsptr = !zeropoints.empty() ? &zeropoints[0] : 0;
            const int8_t* lutptr = !activLUT->empty() ? activLUT->ptr<int8_t>() : 0;
            int8_t* dstptr0 = dst->ptr<int8_t>();
            float* bufptr0 = buf->ptr<float>();
            int blockSize0 = 1 << 12;

            CV_Assert(op != PROD || zeropointsptr);
            CV_Assert((op != PROD && op != SUM) || coeffsptr);
            for (size_t ofs = stripeStart; ofs < stripeEnd; )
            {
                int sampleIdx = (int)(ofs / planeSize);
                int delta = (int)ofs - sampleIdx * planeSize;
                int blockSize = std::min(blockSize0, std::min((int)(stripeEnd - ofs), (int)planeSize - delta));
                if( blockSize <= 0 )
                    break;
                ofs += blockSize;

                for (int c = 0; c < channels; c++)
                {
                    size_t dstIdx = delta + (sampleIdx*channels + c)*planeSize;
                    int8_t* dstptr = dstptr0 + dstIdx;
                    float* bufptr = bufptr0 + dstIdx;

                    // process first two inputs
                    {
                        const int8_t* srcptr0 = srcs[0]->ptr<int8_t>() + dstIdx;

                        const int inputIdx = 1;
                        int src1_channels = srcNumChannels[inputIdx];
                        if (c >= src1_channels)
                        {
                            // no data from second input
                            if (!coeffsptr)
                            {
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = srcptr0[j];
                                }
                            }
                            else
                            {
                                float c0 = coeffsptr[0];
                                int z0 = op == PROD ? zeropointsptr[0] : 0;
                                for (int j = 0; j < blockSize; j++)
                                {
                                    bufptr[j] = c0 * (srcptr0[j] - z0);
                                }
                            }
                        }
                        else
                        {
                            size_t srcIdx = delta + (sampleIdx * src1_channels + c) * planeSize;
                            const int8_t* srcptrI = srcs[inputIdx]->ptr<int8_t>() + srcIdx;

                            if (op == PROD)
                            {
                                float c0 = coeffsptr[0];
                                float c1 = coeffsptr[1];
                                int z0 = zeropointsptr[0];
                                int z1 = zeropointsptr[1];
                                for (int j = 0; j < blockSize; j++)
                                {
                                    bufptr[j] = (c0*(srcptr0[j] - z0)) * (c1*(srcptrI[j] - z1));
                                }
                            }
                            else if (op == MAX)
                            {
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = std::max(srcptr0[j], srcptrI[j]);
                                }
                            }
                            else if (op == SUM)
                            {
                                float c0 = coeffsptr[0];
                                float c1 = coeffsptr[1];
                                for (int j = 0; j < blockSize; j++)
                                {
                                    bufptr[j] = c0*srcptr0[j] + c1*srcptrI[j];
                                }
                            }
                            else
                                CV_Error(Error::StsInternal, "");
                        }
                    }

                    // aggregate other inputs (3+)
                    for (size_t inputIdx = 2; inputIdx < nsrcs; inputIdx++)
                    {
                        int srcI_channels = srcNumChannels[inputIdx];
                        if (c >= srcI_channels)
                            continue;  // no data from this input
                        size_t srcIdx = delta + (sampleIdx * srcI_channels + c) * planeSize;
                        const int8_t* srcptrI = srcs[inputIdx]->ptr<int8_t>() + srcIdx;

                        if (op == PROD)
                        {
                            float cI = coeffsptr[inputIdx];
                            int zI = zeropointsptr[inputIdx];
                            for (int j = 0; j < blockSize; j++)
                            {
                                bufptr[j] *= cI*(srcptrI[j] - zI);
                            }
                        }
                        else if (op == MAX)
                        {
                            for (int j = 0; j < blockSize; j++)
                            {
                                dstptr[j] = std::max(dstptr[j], srcptrI[j]);
                            }
                        }
                        else if (op == SUM)
                        {
                            float cI = coeffsptr[inputIdx];
                            for (int j = 0; j < blockSize; j++)
                            {
                                bufptr[j] += cI * srcptrI[j];
                            }
                        }
                        else
                            CV_Error(Error::StsInternal, "");
                    }

                    // add offset and saturate cast to int8
                    if (op == SUM || op == PROD)
                    {
                        for (int j = 0; j < blockSize; j++)
                        {
                            dstptr[j] = saturate_cast<int8_t>(std::round(bufptr[j] + offset));
                        }
                    }
                }
                if( activ )
                {
                    int8_t* ptr = dstptr0 + delta + sampleIdx*channels*planeSize;
                    activ->forwardSlice(ptr, lutptr, ptr, blockSize, planeSize, 0, channels);
                }
            }
        }
    };
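    // forward(): inputs whose spatial dimensions are all 1 (per-channel "vector"
    // inputs) are broadcast to the full output shape before the kernel is invoked;
    // a float buffer of the output shape holds the intermediate SUM/PROD values.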
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        CV_Assert(outputs.size() == 1);
        const int nstripes = getNumThreads();

        if (channelsModeInput == ELTWISE_CHANNNELS_SAME && inputs[0].dims > 2)
        {
            for (size_t i = 0; i < inputs.size(); i++)
            {
                MatShape inpShape = shape(inputs[i].size);
                bool allOnes = isAllOnes(inpShape, 2, inputs[i].dims);

                if (allOnes)
                {
                    Mat tmpInput = inputs[i];
                    MatShape outShape = shape(outputs[0].size);
                    size_t xSize = outShape[2];
                    for (size_t j = 3; j < outShape.size(); j++)
                        xSize *= outShape[j];

                    int dimVec[3] = {outShape[0], outShape[1], (int) xSize};
                    std::vector<int> matSizesVec(&dimVec[0], &dimVec[0] + 3);
                    inputs[i] = Mat(matSizesVec, tmpInput.type());

                    std::vector<int> idx(outShape.size(), 0);
                    std::vector<int> outIdx(inpShape.size(), 0);

                    for (size_t j = 0; j < outShape[0]; j++)
                    {
                        outIdx[0] = idx[0] = j;
                        for(size_t k = 0; k < outShape[1]; k++)
                        {
                            outIdx[1] = idx[1] = k;
                            for (size_t x = 0; x < xSize; x++)
                            {
                                outIdx[2] = x;
                                inputs[i].at<int8_t>(outIdx.data()) = tmpInput.at<int8_t>(idx.data());
                            }
                        }
                    }
                    inputs[i] = inputs[i].reshape(0, outShape);
                }
            }
        }

        Mat buf = Mat(shape(outputs[0]), CV_32F); // to store intermediate results
        EltwiseInvoker::run(*this, &inputs[0], (int)inputs.size(), buf, outputs[0], nstripes, offset);
    }

    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
        CV_UNUSED(outputs); // suppress unused variable warning
        CV_Assert(inputs.size());

        // FIXIT: handle inputs with different number of channels
        long flops = inputs.size() * total(inputs[0]);
        return flops;
    }

    bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
    {
        Ptr<ActivationLayerInt8> activ_int8 = layer.dynamicCast<ActivationLayerInt8>();
        if (!activ_int8.empty())
        {
            activ = activ_int8;
            if (!activ_int8->blobs.empty())
                activationLUT = activ_int8->blobs[0];
            return true;
        }
        return false;
    }

    Mat activationLUT;
    Ptr<ActivationLayerInt8> activ;

private:
    bool hasVecInput;
    float offset;
};

Ptr<EltwiseLayerInt8> EltwiseLayerInt8::create(const LayerParams& params)
{
    return Ptr<EltwiseLayerInt8>(new EltwiseLayerInt8Impl(params));
}

}
}