diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp index d4c962e741..6357e9a7c7 100644 --- a/modules/dnn/perf/perf_net.cpp +++ b/modules/dnn/perf/perf_net.cpp @@ -235,6 +235,17 @@ PERF_TEST_P_(DNNTestNetwork, Inception_v2_Faster_RCNN) Mat(cv::Size(800, 600), CV_32FC3)); } +PERF_TEST_P_(DNNTestNetwork, EfficientDet) +{ + if (backend == DNN_BACKEND_HALIDE || target != DNN_TARGET_CPU) + throw SkipTestException(""); + Mat sample = imread(findDataFile("dnn/dog416.png")); + resize(sample, sample, Size(512, 512)); + Mat inp; + sample.convertTo(inp, CV_32FC3, 1.0/255); + processNet("dnn/efficientdet-d0.pb", "dnn/efficientdet-d0.pbtxt", "", inp); +} + INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, dnnBackendsAndTargets()); } // namespace diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp index e684b94e46..c005c99b58 100644 --- a/modules/dnn/src/tensorflow/tf_importer.cpp +++ b/modules/dnn/src/tensorflow/tf_importer.cpp @@ -1542,22 +1542,32 @@ void TFImporter::populateNet(Net dstNet) connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0); } - else if (type == "Mul") + else if (type == "Mul" || type == "RealDiv") { - bool haveConst = false; - for(int ii = 0; !haveConst && ii < layer.input_size(); ++ii) + int constId = -1; + for(int ii = 0; ii < layer.input_size(); ++ii) { Pin input = parsePin(layer.input(ii)); - haveConst = value_id.find(input.name) != value_id.end(); + if (value_id.find(input.name) != value_id.end()) + { + constId = ii; + break; + } } - CV_Assert(!haveConst || layer.input_size() == 2); + CV_Assert((constId != -1) || (layer.input_size() == 2)); - if (haveConst) + if (constId != -1) { // Multiplication by constant. 
CV_Assert(layer.input_size() == 2); Mat scaleMat = getTensorContent(getConstBlob(layer, value_id)); CV_Assert(scaleMat.type() == CV_32FC1); + if (type == "RealDiv") + { + if (constId == 0) + CV_Error(Error::StsNotImplemented, "Division of constant over variable"); + scaleMat = 1.0f / scaleMat; + } int id; if (scaleMat.total() == 1) // is a scalar. @@ -1659,11 +1669,15 @@ void TFImporter::populateNet(Net dstNet) int id; if (equalInpShapes || netInputShapes.empty()) { - layerParams.set("operation", "prod"); + layerParams.set("operation", type == "RealDiv" ? "div" : "prod"); id = dstNet.addLayer(name, "Eltwise", layerParams); } else + { + if (type == "RealDiv") + CV_Error(Error::StsNotImplemented, "Division of non equal tensors"); id = dstNet.addLayer(name, "Scale", layerParams); + } layer_id[name] = id; diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index b20b2a58ff..d10e847e00 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -1123,4 +1123,37 @@ TEST_P(Test_TensorFlow_nets, Mask_RCNN) expectNoFallbacks(net); } +TEST_P(Test_TensorFlow_nets, EfficientDet) +{ + if (target != DNN_TARGET_CPU) + { + if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); + if (target == DNN_TARGET_OPENCL) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL); + if (target == DNN_TARGET_MYRIAD) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD); + } + checkBackend(); + std::string proto = findDataFile("dnn/efficientdet-d0.pbtxt"); + std::string model = findDataFile("dnn/efficientdet-d0.pb"); + + Net net = readNetFromTensorflow(model, proto); + Mat img = imread(findDataFile("dnn/dog416.png")); + Mat blob = blobFromImage(img, 1.0/255, Size(512, 512), Scalar(123.675, 116.28, 103.53)); + + net.setPreferableBackend(backend); + net.setPreferableTarget(target); + net.setInput(blob); + // Output has shape 1x1xNx7 where N - number of detections. 
+ // Every detection is a vector of values [id, classId, confidence, left, top, right, bottom] + Mat out = net.forward(); + + // References are from the test for the original TensorFlow model. + Mat ref = (Mat_<float>(3, 7) << 0, 1, 0.8437444, 0.153996080160141, 0.20534580945968628, 0.7463544607162476, 0.7414066195487976, + 0, 17, 0.8245924, 0.16657517850399017, 0.3996818959712982, 0.4111558794975281, 0.9306337833404541, + 0, 7, 0.8039304, 0.6118435263633728, 0.13175517320632935, 0.9065558314323425, 0.2943994700908661); + double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 4e-3 : 1e-5; + double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 2e-3 : 1e-4; + normAssertDetections(ref, out, "", 0.5, scoreDiff, iouDiff); + expectNoFallbacksFromIE(net); +} + } diff --git a/samples/dnn/tf_text_graph_common.py b/samples/dnn/tf_text_graph_common.py index 5aa1d30e39..ea24898873 100644 --- a/samples/dnn/tf_text_graph_common.py +++ b/samples/dnn/tf_text_graph_common.py @@ -269,7 +269,7 @@ def parseTextGraph(filePath): def removeIdentity(graph_def): identities = {} for node in graph_def.node: - if node.op == 'Identity': + if node.op == 'Identity' or node.op == 'IdentityN': identities[node.name] = node.input[0] graph_def.node.remove(node) diff --git a/samples/dnn/tf_text_graph_efficientdet.py b/samples/dnn/tf_text_graph_efficientdet.py new file mode 100644 index 0000000000..855691b2be --- /dev/null +++ b/samples/dnn/tf_text_graph_efficientdet.py @@ -0,0 +1,236 @@ +# This file is a part of OpenCV project. +# It is a subject to the license terms in the LICENSE file found in the top-level directory +# of this distribution and at http://opencv.org/license.html. +# +# Copyright (C) 2020, Intel Corporation, all rights reserved. +# Third party copyrights are property of their respective owners. 
+# +# Use this script to get the text graph representation (.pbtxt) of EfficientDet +# deep learning network trained in https://github.com/google/automl. +# Then you can import it with a binary frozen graph (.pb) using readNetFromTensorflow() function. +# See details and examples on the following wiki page: https://github.com/opencv/opencv/wiki/TensorFlow-Object-Detection-API +import argparse +import re +from math import sqrt +from tf_text_graph_common import * + + +class AnchorGenerator: + def __init__(self, min_level, aspect_ratios, num_scales, anchor_scale): + self.min_level = min_level + self.aspect_ratios = aspect_ratios + self.anchor_scale = anchor_scale + self.scales = [2**(float(s) / num_scales) for s in range(num_scales)] + + def get(self, layer_id): + widths = [] + heights = [] + for s in self.scales: + for a in self.aspect_ratios: + base_anchor_size = 2**(self.min_level + layer_id) * self.anchor_scale + heights.append(base_anchor_size * s * a[1]) + widths.append(base_anchor_size * s * a[0]) + return widths, heights + + +def createGraph(modelPath, outputPath, min_level, aspect_ratios, num_scales, + anchor_scale, num_classes, image_width, image_height): + print('Min level: %d' % min_level) + print('Anchor scale: %f' % anchor_scale) + print('Num scales: %d' % num_scales) + print('Aspect ratios: %s' % str(aspect_ratios)) + print('Number of classes: %d' % num_classes) + print('Input image size: %dx%d' % (image_width, image_height)) + + # Read the graph. 
+ _inpNames = ['image_arrays'] + outNames = ['detections'] + + writeTextGraph(modelPath, outputPath, outNames) + graph_def = parseTextGraph(outputPath) + + def getUnconnectedNodes(): + unconnected = [] + for node in graph_def.node: + if node.op == 'Const': + continue + unconnected.append(node.name) + for inp in node.input: + if inp in unconnected: + unconnected.remove(inp) + return unconnected + + + nodesToKeep = ['truediv'] # Keep preprocessing nodes + + removeIdentity(graph_def) + + scopesToKeep = ('image_arrays', 'efficientnet', 'resample_p6', 'resample_p7', + 'fpn_cells', 'class_net', 'box_net', 'Reshape', 'concat') + + addConstNode('scale_w', [2.0], graph_def) + addConstNode('scale_h', [2.0], graph_def) + nodesToKeep += ['scale_w', 'scale_h'] + + for node in graph_def.node: + if re.match('efficientnet-(.*)/blocks_\d+/se/mul_1', node.name): + node.input[0], node.input[1] = node.input[1], node.input[0] + + if re.match('fpn_cells/cell_\d+/fnode\d+/resample(.*)/nearest_upsampling/Reshape_1$', node.name): + node.op = 'ResizeNearestNeighbor' + node.input[1] = 'scale_w' + node.input.append('scale_h') + + for inpNode in graph_def.node: + if inpNode.name == node.name[:node.name.rfind('_')]: + node.input[0] = inpNode.input[0] + + if re.match('box_net/box-predict(_\d)*/separable_conv2d$', node.name): + node.addAttr('loc_pred_transposed', True) + + # Replace RealDiv to Mul with inversed scale for compatibility + if node.op == 'RealDiv': + for inpNode in graph_def.node: + if inpNode.name != node.input[1] or not 'value' in inpNode.attr: + continue + + tensor = inpNode.attr['value']['tensor'][0] + if not 'float_val' in tensor: + continue + scale = float(inpNode.attr['value']['tensor'][0]['float_val'][0]) + + addConstNode(inpNode.name + '/inv', [1.0 / scale], graph_def) + nodesToKeep.append(inpNode.name + '/inv') + node.input[1] = inpNode.name + '/inv' + node.op = 'Mul' + break + + + def to_remove(name, op): + if name in nodesToKeep: + return False + return op == 'Const' or 
not name.startswith(scopesToKeep) + + removeUnusedNodesAndAttrs(to_remove, graph_def) + + # Attach unconnected preprocessing + assert(graph_def.node[1].name == 'truediv' and graph_def.node[1].op == 'RealDiv') + graph_def.node[1].input.insert(0, 'image_arrays') + graph_def.node[2].input.insert(0, 'truediv') + + priors_generator = AnchorGenerator(min_level, aspect_ratios, num_scales, anchor_scale) + priorBoxes = [] + for i in range(5): + inpName = '' + for node in graph_def.node: + if node.name == 'Reshape_%d' % (i * 2 + 1): + inpName = node.input[0] + break + + priorBox = NodeDef() + priorBox.name = 'PriorBox_%d' % i + priorBox.op = 'PriorBox' + priorBox.input.append(inpName) + priorBox.input.append(graph_def.node[0].name) # image_tensor + + priorBox.addAttr('flip', False) + priorBox.addAttr('clip', False) + + widths, heights = priors_generator.get(i) + + priorBox.addAttr('width', widths) + priorBox.addAttr('height', heights) + priorBox.addAttr('variance', [1.0, 1.0, 1.0, 1.0]) + + graph_def.node.extend([priorBox]) + priorBoxes.append(priorBox.name) + + addConstNode('concat/axis_flatten', [-1], graph_def) + + def addConcatNode(name, inputs, axisNodeName): + concat = NodeDef() + concat.name = name + concat.op = 'ConcatV2' + for inp in inputs: + concat.input.append(inp) + concat.input.append(axisNodeName) + graph_def.node.extend([concat]) + + addConcatNode('PriorBox/concat', priorBoxes, 'concat/axis_flatten') + + sigmoid = NodeDef() + sigmoid.name = 'concat/sigmoid' + sigmoid.op = 'Sigmoid' + sigmoid.input.append('concat') + graph_def.node.extend([sigmoid]) + + addFlatten(sigmoid.name, sigmoid.name + '/Flatten', graph_def) + addFlatten('concat_1', 'concat_1/Flatten', graph_def) + + detectionOut = NodeDef() + detectionOut.name = 'detection_out' + detectionOut.op = 'DetectionOutput' + + detectionOut.input.append('concat_1/Flatten') + detectionOut.input.append(sigmoid.name + '/Flatten') + detectionOut.input.append('PriorBox/concat') + + 
detectionOut.addAttr('num_classes', num_classes) + detectionOut.addAttr('share_location', True) + detectionOut.addAttr('background_label_id', num_classes + 1) + detectionOut.addAttr('nms_threshold', 0.6) + detectionOut.addAttr('confidence_threshold', 0.2) + detectionOut.addAttr('top_k', 100) + detectionOut.addAttr('keep_top_k', 100) + detectionOut.addAttr('code_type', "CENTER_SIZE") + graph_def.node.extend([detectionOut]) + + graph_def.node[0].attr['shape'] = { + 'shape': { + 'dim': [ + {'size': -1}, + {'size': image_height}, + {'size': image_width}, + {'size': 3} + ] + } + } + + while True: + unconnectedNodes = getUnconnectedNodes() + unconnectedNodes.remove(detectionOut.name) + if not unconnectedNodes: + break + + for name in unconnectedNodes: + for i in range(len(graph_def.node)): + if graph_def.node[i].name == name: + del graph_def.node[i] + break + + # Save as text + graph_def.save(outputPath) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Run this script to get a text graph of ' + 'SSD model from TensorFlow Object Detection API. 
' + 'Then pass it with .pb file to cv::dnn::readNetFromTensorflow function.') + parser.add_argument('--input', required=True, help='Path to frozen TensorFlow graph.') + parser.add_argument('--output', required=True, help='Path to output text graph.') + parser.add_argument('--min_level', default=3, type=int, help='Parameter from training config') + parser.add_argument('--num_scales', default=3, type=int, help='Parameter from training config') + parser.add_argument('--anchor_scale', default=4.0, type=float, help='Parameter from training config') + parser.add_argument('--aspect_ratios', default=[1.0, 1.0, 1.4, 0.7, 0.7, 1.4], + nargs='+', type=float, help='Parameter from training config') + parser.add_argument('--num_classes', default=90, type=int, help='Number of classes to detect') + parser.add_argument('--width', default=512, type=int, help='Network input width') + parser.add_argument('--height', default=512, type=int, help='Network input height') + args = parser.parse_args() + + ar = args.aspect_ratios + assert(len(ar) % 2 == 0) + ar = list(zip(ar[::2], ar[1::2])) + + createGraph(args.input, args.output, args.min_level, ar, args.num_scales, + args.anchor_scale, args.num_classes, args.width, args.height)