// This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. #include "../precomp.hpp" #include "layers_common.hpp" #include "../op_timvx.hpp" #include "../ie_ngraph.hpp" #include #include namespace cv { namespace dnn { class SoftMaxLayerInt8Impl CV_FINAL : public SoftmaxLayerInt8 { public: float input_sc; int input_zp; SoftMaxLayerInt8Impl(const LayerParams& params) { setParamsFrom(params); axis = params.get("axis", 1); logSoftMax = params.get("log_softmax", false); coerced_2d = params.get("coerced_2d", false); input_sc = params.get("input_scale"); input_zp = params.get("input_zeropoint"); output_sc = params.get("scales"); output_zp = params.get("zeropoints"); if (blobs.empty()) // if no lookUpTable is found { Mat lookUpTable(1, 256, CV_32F); float* table = lookUpTable.ptr(); for (int i = -128; i < 128; i++) { float x = input_sc * (i - 127); // ensures exp(x) is always between (0, 1) table[i + 128] = std::exp(x); } blobs.push_back(lookUpTable); } } bool getMemoryShapes(const std::vector &inputs, const int requiredOutputs, std::vector &outputs, std::vector &internals) const CV_OVERRIDE { bool inplace = Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals); MatShape shape = inputs[0]; internals.assign(1, shape); return inplace; } virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { std::vector inputs; inputs_arr.getMatVector(inputs); auto src = inputs[0]; auto dims_src = src.dims; auto shape_src = shape(src); axis = normalize_axis(axis, dims_src); if (!coerced_2d) { is_transpose_needed = (axis == dims_src - 1) ? false : true; if (is_transpose_needed) { permutation.resize(dims_src); std::iota(permutation.begin(), permutation.end(), 0); permutation[axis] = dims_src - 1; permutation[dims_src - 1] = axis; transposed_shape.resize(dims_src); std::transform(permutation.begin(), permutation.end(), transposed_shape.begin(), [&shape_src](int axis) { return shape_src[axis]; }); N = std::accumulate(transposed_shape.begin(), transposed_shape.end() - 1, 1, std::multiplies()); D = transposed_shape.back(); return; } } N = src.total(0, axis); D = src.total(axis); } virtual bool supportBackend(int backendId) CV_OVERRIDE { return backendId == DNN_BACKEND_OPENCV || (backendId == DNN_BACKEND_TIMVX && haveTimVX()) || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH; } virtual bool tryFuse(Ptr& top) CV_OVERRIDE { Ptr dequantize_layer = top.dynamicCast(); return !dequantize_layer.empty() && preferableTarget != DNN_TARGET_OPENCL_FP16; } virtual Ptr initTimVX(void* timVXInfo_, const std::vector > &inputsWrapper, const std::vector > &outputsWrapper, bool isLast) CV_OVERRIDE { #ifdef HAVE_TIMVX // tvGraph Initialization. auto timVxInfo = reinterpret_cast(timVXInfo_); CV_Assert(timVxInfo); Ptr tvGraph = timVxInfo->getGraph(); CV_Assert(tvGraph); Ptr graph = tvGraph->graph; std::vector inputsIndex, outputsIndex; int input_index, output_index; // input Tensor CV_Assert(inputsWrapper.size() == 1); Ptr inputWrapper = inputsWrapper[0].dynamicCast(); const Mat &src = inputWrapper->getMat(); // convert axis from OpenCV NCHW toTimVX WHCN. int tvAxis = src.dims - 1 - normalize_axis(axis, src.dims); if(tvAxis < 0) tvAxis = 0; // default value is 0. if (inputWrapper->isTensor()) { input_index = tvGraph->getTensorIndex(inputWrapper->getTensor()); if (input_index == -1) { // Copy To New inputWrapper Mat tmp = inputWrapper->getMat(); inputWrapper = Ptr(new TimVXBackendWrapper(tmp)); } } if (!inputWrapper->isTensor()) { Ptr tvInputQuant = Ptr( new tim::vx::Quantization(tim::vx::QuantType::ASYMMETRIC, input_sc, input_zp)); inputWrapper->createTensor(graph,tim::vx::TensorAttribute::INPUT, tvInputQuant); input_index = tvGraph->addWrapper(inputWrapper); } inputsIndex.push_back(input_index); // output tensor CV_Assert(outputsWrapper.size() == 1); Ptr outputWrapper = outputsWrapper[0].dynamicCast(); Mat dstMat = outputWrapper->getMat(); Ptr outputQuant = Ptr( new tim::vx::Quantization(tim::vx::QuantType::ASYMMETRIC, output_sc, output_zp)); Ptr outputTensor; if (isLast) { auto shapeType = getShapeTypeFromMat(outputWrapper->getMat()); // For Graph Output tensor, we need to set tensor shape before createTensor(). outputWrapper->setTensorShape(shapeType); if (dstMat.type() == CV_32F) outputWrapper->createTensor(graph, tim::vx::TensorAttribute::OUTPUT); else outputWrapper->createTensor(graph, tim::vx::TensorAttribute::OUTPUT, outputQuant); } else { if (dstMat.type() == CV_32F) outputWrapper->createTensor(graph, tim::vx::TensorAttribute::TRANSIENT); else outputWrapper->createTensor(graph, tim::vx::TensorAttribute::TRANSIENT, outputQuant); } output_index = tvGraph->addWrapper(outputWrapper); outputsIndex.push_back(output_index); std::shared_ptr tvSoftmax; if (logSoftMax) { tvSoftmax = graph->CreateOperation(tvAxis); } else { tvSoftmax = graph->CreateOperation(1.0f, tvAxis); } Ptr tvBackendNode = new TimVXBackendNode(tvGraph, tvSoftmax, inputsIndex, outputsIndex); return tvBackendNode; #endif // HAVE_TIMVX return Ptr(); } #ifdef HAVE_DNN_NGRAPH virtual Ptr initNgraph(const std::vector > &inputs, const std::vector >& nodes) CV_OVERRIDE { auto input = nodes[0].dynamicCast()->node; input = ngraphDequantize(input, input_sc, input_zp); ngraph::Output res; if (logSoftMax) { res = std::make_shared(input, axis); } else { res = std::make_shared(input, axis); } res = ngraphQuantize(res, output_sc, output_zp); return new InfEngineNgraphNode(res); } #endif // HAVE_DNN_NGRAPH template class SoftmaxInt8Invoker : public ParallelLoopBody { public: const Mat& src_; Mat& dst_; const Mat& lookup_table_; int N_; int D_; float y_scale_; int y_zero_point_; int threads; int cost_per_thread; SoftmaxInt8Invoker(const Mat& src, Mat& dst, const Mat& lookup_table, int N, int D, float y_scale, int y_zero_point) : src_(src), dst_(dst), lookup_table_(lookup_table), N_(N), D_(D), y_scale_(1.f / y_scale), y_zero_point_(y_zero_point) { threads = N_; cost_per_thread = D_; } static void run(const Mat& src, Mat& dst, const Mat& lookup_table, int N, int D, float y_scale, int y_zero_point) { CV_Assert(src.isContinuous()); CV_Assert(dst.isContinuous()); CV_CheckTypeEQ(src.type(), CV_8S, "DNN/SoftmaxInt8: type of input must be int8"); CV_CheckTypeEQ(dst.type(), CV_8S, "DNN/SoftmaxInt8: type of output must be int8"); SoftmaxInt8Invoker p(src, dst, lookup_table, N, D, y_scale, y_zero_point); double nstripes = ((size_t)p.threads * p.cost_per_thread) * (1 / 1024.0); parallel_for_(Range(0, p.threads), p, nstripes); } void operator()(const Range& r) const CV_OVERRIDE { int start = r.start; int end = r.end; const int8_t* p_src = src_.ptr(); int8_t* p_dst = dst_.ptr(); const float* table = lookup_table_.ptr(); for (int i = start; i < end; ++i) { const int8_t* x = p_src + i * D_; int8_t* y = p_dst + i * D_; float vsum = 0; for (int j = 0; j < D_; ++j) { const uint8_t idx = uint8_t((*x++) + 128); vsum += table[idx]; } // FIXME: avoid divide by vsum==0 x = p_src + i * D_; if (with_log) { for (int j = 0; j < D_; ++j) { const uint8_t idx = uint8_t((*x++) + 128); const float v = table[idx]; *y++ = saturate_cast(std::nearbyintf(y_scale_ * std::log(v / vsum)) + y_zero_point_); } } else { for (int j = 0; j < D_; ++j) { const uint8_t idx = uint8_t((*x++) + 128); const float v = table[idx]; *y++ = saturate_cast(std::nearbyintf(y_scale_ * v / vsum) + y_zero_point_); } } } } }; template class SoftmaxInt8OutputFloatInvoker : public ParallelLoopBody { public: const Mat& src_; Mat& dst_; const Mat& lookup_table_; int N_; int D_; int threads; int cost_per_thread; SoftmaxInt8OutputFloatInvoker(const Mat& src, Mat& dst, const Mat& lookup_table, int N, int D) : src_(src), dst_(dst), lookup_table_(lookup_table), N_(N), D_(D) { threads = N_; cost_per_thread = D_; } static void run(const Mat& src, Mat& dst, const Mat& lookup_table, int N, int D) { CV_Assert(src.isContinuous()); CV_Assert(dst.isContinuous()); CV_CheckTypeEQ(src.type(), CV_8S, "DNN/SoftmaxInt8: type of input must be int8"); CV_CheckTypeEQ(dst.type(), CV_32F, "DNN/SoftmaxInt8: type of input must be float32 since Dequantization is fused"); SoftmaxInt8OutputFloatInvoker p(src, dst, lookup_table, N, D); double nstripes = ((size_t)p.threads * p.cost_per_thread) * (1 / 1024.0); parallel_for_(Range(0, p.threads), p, nstripes); } void operator()(const Range& r) const CV_OVERRIDE { int start = r.start; int end = r.end; const int8_t* p_src = src_.ptr(); float* p_dst = dst_.ptr(); const float* table = lookup_table_.ptr(); for (int i = start; i < end; ++i) { const int8_t* x = p_src + i * D_; float* y = p_dst + i * D_; float vsum = 0; for (int j = 0; j < D_; ++j) { const uint8_t idx = uint8_t((*x++) + 128); vsum += table[idx]; } // FIXME: avoid divide by vsum==0 x = p_src + i * D_; if (with_log) { for (int j = 0; j < D_; ++j) { const uint8_t idx = uint8_t((*x++) + 128); const float v = table[idx]; *y++ = std::log(v / vsum); } } else { for (int j = 0; j < D_; ++j) { const uint8_t idx = uint8_t((*x++) + 128); const float v = table[idx]; *y++ = v / vsum; } } } } }; void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); std::vector inputs, outputs; inputs_arr.getMatVector(inputs); outputs_arr.getMatVector(outputs); Mat src, dst; if (!coerced_2d && is_transpose_needed) { transposeND(inputs[0], permutation, src); dst = Mat::zeros(transposed_shape.size(), transposed_shape.data(), outputs[0].type()); } else { src = inputs[0]; dst = outputs[0]; } switch (dst.type()) { case CV_8S: { if (logSoftMax) { SoftmaxInt8Invoker::run(src, dst, blobs[0], N, D, output_sc, output_zp); } else { SoftmaxInt8Invoker::run(src, dst, blobs[0], N, D, output_sc, output_zp); } } break; case CV_32F: { if (logSoftMax) { SoftmaxInt8OutputFloatInvoker::run(src, dst, blobs[0], N, D); } else { SoftmaxInt8OutputFloatInvoker::run(src, dst, blobs[0], N, D); } } break; default: CV_Error(cv::Error::BadDepth, "DNN/SoftmaxInt8: Unsupported output type"); } if (!coerced_2d && is_transpose_needed) { transposeND(dst, permutation, outputs[0]); } } int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE { CV_UNUSED(outputs); // suppress unused variable warning int64 flops = 0; for (int i = 0; i < inputs.size(); i++) { flops += 4*total(inputs[i]); } return flops; } private: int axis; int N; int D; bool coerced_2d; bool is_transpose_needed; std::vector permutation; std::vector transposed_shape; }; Ptr SoftmaxLayerInt8::create(const LayerParams& params) { return Ptr(new SoftMaxLayerInt8Impl(params)); } } }