Set output layer names and types for models in DLDT's intermediate representation
commit 346871e27f
parent e4b51fa8ad
@@ -1993,11 +1993,17 @@ Net Net::readFromModelOptimizer(const String& xml, const String& bin)
     backendNode->net = Ptr<InfEngineBackendNet>(new InfEngineBackendNet(ieNet));
     for (auto& it : ieNet.getOutputsInfo())
     {
+        Ptr<Layer> cvLayer(new InfEngineBackendLayer(it.second));
+        InferenceEngine::CNNLayerPtr ieLayer = ieNet.getLayerByName(it.first.c_str());
+        CV_Assert(ieLayer);
+
         LayerParams lp;
         int lid = cvNet.addLayer(it.first, "", lp);

         LayerData& ld = cvNet.impl->layers[lid];
-        ld.layerInstance = Ptr<Layer>(new InfEngineBackendLayer(it.second));
+        cvLayer->name = it.first;
+        cvLayer->type = ieLayer->type;
+        ld.layerInstance = cvLayer;
         ld.backendNodes[DNN_BACKEND_INFERENCE_ENGINE] = backendNode;
     }
     for (int i = 0; i < inputsNames.size(); ++i)
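For orientation, here is a minimal Python sketch (assuming an OpenCV build with the Inference Engine backend and hypothetical IR files model.xml / model.bin) of how this change surfaces through the public API: the unconnected output layers of a network read from DLDT's intermediate representation now report the layer names and types taken from the IR, which is what the test below asserts.

import cv2 as cv

# Hypothetical file names produced by the Model Optimizer; substitute your own IR.
net = cv.dnn.readNetFromModelOptimizer('model.xml', 'model.bin')

# With this commit the output layers carry real names and types instead of placeholders.
for layerId in net.getUnconnectedOutLayers():
    layer = net.getLayer(int(layerId))
    print(layer.name, layer.type)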
@@ -925,6 +925,10 @@ TEST(Layer_Test_Convolution_DLDT, Accuracy)
     Mat out = net.forward();

     normAssert(outDefault, out);
+
+    std::vector<int> outLayers = net.getUnconnectedOutLayers();
+    ASSERT_EQ(net.getLayer(outLayers[0])->name, "output_merge");
+    ASSERT_EQ(net.getLayer(outLayers[0])->type, "Concat");
 }

 // 1. Create a .prototxt file with the following network:
@@ -22,6 +22,7 @@ const char* keys =
     "{ height  | -1 | Preprocess input image by resizing to a specific height. }"
     "{ rgb     |    | Indicate that model works with RGB input images instead BGR ones. }"
     "{ thr     | .5 | Confidence threshold. }"
+    "{ nms     | .4 | Non-maximum suppression threshold. }"
     "{ backend |  0 | Choose one of computation backends: "
                      "0: automatically (by default), "
                      "1: Halide language (http://halide-lang.org/), "
@@ -37,7 +38,7 @@ const char* keys =
 using namespace cv;
 using namespace dnn;

-float confThreshold;
+float confThreshold, nmsThreshold;
 std::vector<std::string> classes;

 void postprocess(Mat& frame, const std::vector<Mat>& out, Net& net);
@@ -59,6 +60,7 @@ int main(int argc, char** argv)
     }

     confThreshold = parser.get<float>("thr");
+    nmsThreshold = parser.get<float>("nms");
     float scale = parser.get<float>("scale");
     Scalar mean = parser.get<Scalar>("mean");
     bool swapRB = parser.get<bool>("rgb");
@@ -144,6 +146,9 @@ void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net)
     static std::vector<int> outLayers = net.getUnconnectedOutLayers();
     static std::string outLayerType = net.getLayer(outLayers[0])->type;

+    std::vector<int> classIds;
+    std::vector<float> confidences;
+    std::vector<Rect> boxes;
     if (net.getLayer(0)->outputNameToIndex("im_info") != -1)  // Faster-RCNN or R-FCN
     {
         // Network produces output blob with a shape 1x1xNx7 where N is a number of
@@ -160,8 +165,11 @@ void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net)
                 int top = (int)data[i + 4];
                 int right = (int)data[i + 5];
                 int bottom = (int)data[i + 6];
-                int classId = (int)(data[i + 1]) - 1;  // Skip 0th background class id.
-                drawPred(classId, confidence, left, top, right, bottom, frame);
+                int width = right - left + 1;
+                int height = bottom - top + 1;
+                classIds.push_back((int)(data[i + 1]) - 1);  // Skip 0th background class id.
+                boxes.push_back(Rect(left, top, width, height));
+                confidences.push_back(confidence);
             }
         }
     }
@@ -181,16 +189,16 @@ void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net)
                 int top = (int)(data[i + 4] * frame.rows);
                 int right = (int)(data[i + 5] * frame.cols);
                 int bottom = (int)(data[i + 6] * frame.rows);
-                int classId = (int)(data[i + 1]) - 1;  // Skip 0th background class id.
-                drawPred(classId, confidence, left, top, right, bottom, frame);
+                int width = right - left + 1;
+                int height = bottom - top + 1;
+                classIds.push_back((int)(data[i + 1]) - 1);  // Skip 0th background class id.
+                boxes.push_back(Rect(left, top, width, height));
+                confidences.push_back(confidence);
             }
         }
     }
     else if (outLayerType == "Region")
     {
-        std::vector<int> classIds;
-        std::vector<float> confidences;
-        std::vector<Rect> boxes;
         for (size_t i = 0; i < outs.size(); ++i)
         {
             // Network produces output blob with a shape NxC where N is a number of
@@ -218,8 +226,12 @@ void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net)
                 }
             }
         }
+    }
+    else
+        CV_Error(Error::StsNotImplemented, "Unknown output layer type: " + outLayerType);
+
     std::vector<int> indices;
-    NMSBoxes(boxes, confidences, confThreshold, 0.4f, indices);
+    NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices);
     for (size_t i = 0; i < indices.size(); ++i)
     {
         int idx = indices[i];
@@ -228,9 +240,6 @@ void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net)
                  box.x + box.width, box.y + box.height, frame);
     }
-    }
-    else
-        CV_Error(Error::StsNotImplemented, "Unknown output layer type: " + outLayerType);
 }

 void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame)
 {
@@ -31,6 +31,7 @@ parser.add_argument('--height', type=int,
 parser.add_argument('--rgb', action='store_true',
                     help='Indicate that model works with RGB input images instead BGR ones.')
 parser.add_argument('--thr', type=float, default=0.5, help='Confidence threshold')
+parser.add_argument('--nms', type=float, default=0.4, help='Non-maximum suppression threshold')
 parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
                     help="Choose one of computation backends: "
                          "%d: automatically (by default), "
@@ -57,6 +58,7 @@ net.setPreferableBackend(args.backend)
 net.setPreferableTarget(args.target)

 confThreshold = args.thr
+nmsThreshold = args.nms

 def getOutputsNames(net):
     layersNames = net.getLayerNames()
@@ -86,12 +88,14 @@ def postprocess(frame, outs):
     lastLayerId = net.getLayerId(layerNames[-1])
     lastLayer = net.getLayer(lastLayerId)

+    classIds = []
+    confidences = []
+    boxes = []
     if net.getLayer(0).outputNameToIndex('im_info') != -1:  # Faster-RCNN or R-FCN
         # Network produces output blob with a shape 1x1xNx7 where N is a number of
         # detections and an every detection is a vector of values
         # [batchId, classId, confidence, left, top, right, bottom]
-        assert(len(outs) == 1)
-        out = outs[0]
+        for out in outs:
             for detection in out[0, 0]:
                 confidence = detection[2]
                 if confidence > confThreshold:
@@ -99,14 +103,16 @@ def postprocess(frame, outs):
                     top = int(detection[4])
                     right = int(detection[5])
                     bottom = int(detection[6])
-                    classId = int(detection[1]) - 1  # Skip background label
-                    drawPred(classId, confidence, left, top, right, bottom)
+                    width = right - left + 1
+                    height = bottom - top + 1
+                    classIds.append(int(detection[1]) - 1)  # Skip background label
+                    confidences.append(float(confidence))
+                    boxes.append([left, top, width, height])
     elif lastLayer.type == 'DetectionOutput':
         # Network produces output blob with a shape 1x1xNx7 where N is a number of
         # detections and an every detection is a vector of values
         # [batchId, classId, confidence, left, top, right, bottom]
-        assert(len(outs) == 1)
-        out = outs[0]
+        for out in outs:
             for detection in out[0, 0]:
                 confidence = detection[2]
                 if confidence > confThreshold:
@@ -114,8 +120,11 @@ def postprocess(frame, outs):
                     top = int(detection[4] * frameHeight)
                     right = int(detection[5] * frameWidth)
                     bottom = int(detection[6] * frameHeight)
-                    classId = int(detection[1]) - 1  # Skip background label
-                    drawPred(classId, confidence, left, top, right, bottom)
+                    width = right - left + 1
+                    height = bottom - top + 1
+                    classIds.append(int(detection[1]) - 1)  # Skip background label
+                    confidences.append(float(confidence))
+                    boxes.append([left, top, width, height])
     elif lastLayer.type == 'Region':
         # Network produces output blob with a shape NxC where N is a number of
         # detected objects and C is a number of classes + 4 where the first 4
@@ -138,7 +147,11 @@ def postprocess(frame, outs):
                     classIds.append(classId)
                     confidences.append(float(confidence))
                     boxes.append([left, top, width, height])
-        indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, 0.4)
+    else:
+        print('Unknown output layer type: ' + lastLayer.type)
+        exit()
+
+    indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, nmsThreshold)
     for i in indices:
         i = i[0]
         box = boxes[i]
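The Python sample ends up with the same structure as the C++ one: each branch only fills classIds, confidences, and boxes, and a single NMS pass decides what gets drawn. A condensed sketch of that final step (the helper name suppress_and_draw and the drawPred callback are illustrative, not part of the sample):

import cv2 as cv
import numpy as np

def suppress_and_draw(classIds, confidences, boxes, confThreshold, nmsThreshold, drawPred):
    # One NMS pass over everything collected by the detector-specific branches.
    indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, nmsThreshold)
    for i in np.array(indices).reshape(-1):  # indices may come back flat or as Nx1
        left, top, width, height = boxes[i]
        drawPred(classIds[i], confidences[i], left, top, left + width, top + height)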