Unite deep learning object detection samples

2025-07-31 18:07:08 +08:00 · 2018-03-02 12:04:39 +03:00 · 2018-03-02 12:04:39 +03:00 · e8d94ea87c
commit e8d94ea87c
parent 667f5b655a
14 changed files with 555 additions and 995 deletions
--- a/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown
+++ b/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown
@ -18,40 +18,26 @@ VIDEO DEMO:
 Source Code
 -----------
-The latest version of sample source code can be downloaded [here](https://github.com/opencv/opencv/blob/master/samples/dnn/yolo_object_detection.cpp).
+Use a universal sample for object detection models written
 [in C++](https://github.com/opencv/opencv/blob/master/samples/dnn/object_detection.cpp) and
 [in Python](https://github.com/opencv/opencv/blob/master/samples/dnn/object_detection.py) languages
-@include dnn/yolo_object_detection.cpp
+Usage examples
-
+--------------
 How to compile in command line with pkg-config
 ----------------------------------------------
@code{.bash}
 # g++ `pkg-config --cflags opencv` `pkg-config --libs opencv` yolo_object_detection.cpp -o yolo_object_detection
@endcode
 Execute in webcam:
@code{.bash}
-$ yolo_object_detection -camera_device=0  -cfg=[PATH-TO-DARKNET]/cfg/yolo.cfg -model=[PATH-TO-DARKNET]/yolo.weights   -class_names=[PATH-TO-DARKNET]/data/coco.names
+$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392
@endcode
-Execute with image:
+Execute with image or video file:
@code{.bash}
-$ yolo_object_detection -source=[PATH-IMAGE]  -cfg=[PATH-TO-DARKNET]/cfg/yolo.cfg -model=[PATH-TO-DARKNET]/yolo.weights   -class_names=[PATH-TO-DARKNET]/data/coco.names
+$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392 --input[PATH-TO-IMAGE-OR-VIDEO-FILE]
@endcode
 Execute in video file:
@code{.bash}
 $ yolo_object_detection -source=[PATH-TO-VIDEO] -cfg=[PATH-TO-DARKNET]/cfg/yolo.cfg -model=[PATH-TO-DARKNET]/yolo.weights   -class_names=[PATH-TO-DARKNET]/data/coco.names
@endcode
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@ -222,7 +222,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
        /** @brief Returns index of output blob in output array.
         *  @see inputNameToIndex()
         */
-        virtual int outputNameToIndex(String outputName);
+        CV_WRAP virtual int outputNameToIndex(String outputName);
        /**
         * @brief Ask layer if it support specific backend for doing computations.
--- a/samples/dnn/README.md
+++ b/samples/dnn/README.md
@ -0,0 +1,20 @@
 # OpenCV deep learning module samples
 ## Model Zoo
 ### Object detection
 |    Model | Scale |   Size WxH|   Mean subtraction | Channels order |
 |---------------|-------|-----------|--------------------|-------|
 | [MobileNet-SSD, Caffe](https://github.com/chuanqi305/MobileNet-SSD/) | `0.00784 (2/255)` | `300x300` | `127.5 127.5 127.5` | BGR |
 | [OpenCV face detector](https://github.com/opencv/opencv/tree/master/samples/dnn/face_detector) | `1.0` | `300x300` | `104 177 123` | BGR |
 | [SSDs from TensorFlow](https://github.com/tensorflow/models/tree/master/research/object_detection/) | `0.00784 (2/255)` | `300x300` | `127.5 127.5 127.5` | RGB |
 | [YOLO](https://pjreddie.com/darknet/yolo/) | `0.00392 (1/255)` | `416x416` | `0 0 0` | RGB |
 | [VGG16-SSD](https://github.com/weiliu89/caffe/tree/ssd) | `1.0` | `300x300` | `104 117 123` | BGR |
 | [Faster-RCNN](https://github.com/rbgirshick/py-faster-rcnn) | `1.0` | `800x600` | `102.9801, 115.9465, 122.7717` | BGR |
 | [R-FCN](https://github.com/YuwenXiong/py-R-FCN) | `1.0` | `800x600` | `102.9801 115.9465 122.7717` | BGR |
 ## References
 * [Models downloading script](https://github.com/opencv/opencv_extra/blob/master/testdata/dnn/download_models.py)
 * [Configuration files adopted for OpenCV](https://github.com/opencv/opencv_extra/tree/master/testdata/dnn)
 * [How to import models from TensorFlow Object Detection API](https://github.com/opencv/opencv/wiki/TensorFlow-Object-Detection-API)
--- a/samples/dnn/faster_rcnn.cpp
+++ b/samples/dnn/faster_rcnn.cpp
@ -1,93 +0,0 @@
 #include <opencv2/dnn.hpp>
 #include <opencv2/dnn/all_layers.hpp>
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>
 using namespace cv;
 using namespace dnn;
 const char* keys =
    "{ help  h |     | print help message  }"
    "{ proto p |     | path to .prototxt   }"
    "{ model m |     | path to .caffemodel }"
    "{ image i |     | path to input image }"
    "{ conf  c | 0.8 | minimal confidence  }";
 const char* classNames[] = {
    "__background__",
    "aeroplane", "bicycle", "bird", "boat",
    "bottle", "bus", "car", "cat", "chair",
    "cow", "diningtable", "dog", "horse",
    "motorbike", "person", "pottedplant",
    "sheep", "sofa", "train", "tvmonitor"
 };
 static const int kInpWidth = 800;
 static const int kInpHeight = 600;
 int main(int argc, char** argv)
 {
    // Parse command line arguments.
    CommandLineParser parser(argc, argv, keys);
    parser.about("This sample is used to run Faster-RCNN and R-FCN object detection "
                 "models with OpenCV. You can get required models from "
                 "https://github.com/rbgirshick/py-faster-rcnn (Faster-RCNN) and from "
                 "https://github.com/YuwenXiong/py-R-FCN (R-FCN). Corresponding .prototxt "
                 "files may be found at https://github.com/opencv/opencv_extra/tree/master/testdata/dnn.");
    if (argc == 1 || parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }
    String protoPath = parser.get<String>("proto");
    String modelPath = parser.get<String>("model");
    String imagePath = parser.get<String>("image");
    float confThreshold = parser.get<float>("conf");
    CV_Assert(!protoPath.empty(), !modelPath.empty(), !imagePath.empty());
    // Load a model.
    Net net = readNetFromCaffe(protoPath, modelPath);
    Mat img = imread(imagePath);
    resize(img, img, Size(kInpWidth, kInpHeight));
    Mat blob = blobFromImage(img, 1.0, Size(), Scalar(102.9801, 115.9465, 122.7717), false, false);
    Mat imInfo = (Mat_<float>(1, 3) << img.rows, img.cols, 1.6f);
    net.setInput(blob, "data");
    net.setInput(imInfo, "im_info");
    // Draw detections.
    Mat detections = net.forward();
    const float* data = (float*)detections.data;
    for (size_t i = 0; i < detections.total(); i += 7)
    {
        // An every detection is a vector [id, classId, confidence, left, top, right, bottom]
        float confidence = data[i + 2];
        if (confidence > confThreshold)
        {
            int classId = (int)data[i + 1];
            int left = max(0, min((int)data[i + 3], img.cols - 1));
            int top = max(0, min((int)data[i + 4], img.rows - 1));
            int right = max(0, min((int)data[i + 5], img.cols - 1));
            int bottom = max(0, min((int)data[i + 6], img.rows - 1));
            // Draw a bounding box.
            rectangle(img, Point(left, top), Point(right, bottom), Scalar(0, 255, 0));
            // Put a label with a class name and confidence.
            String label = cv::format("%s, %.3f", classNames[classId], confidence);
            int baseLine;
            Size labelSize = cv::getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
            top = max(top, labelSize.height);
            rectangle(img, Point(left, top - labelSize.height),
                      Point(left + labelSize.width, top + baseLine),
                      Scalar(255, 255, 255), FILLED);
            putText(img, label, Point(left, top), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 0, 0));
        }
    }
    imshow("frame", img);
    waitKey();
    return 0;
 }
--- a/samples/dnn/mobilenet_ssd_python.py
+++ b/samples/dnn/mobilenet_ssd_python.py
@ -1,132 +0,0 @@
 # This script is used to demonstrate MobileNet-SSD network using OpenCV deep learning module.
 #
 # It works with model taken from https://github.com/chuanqi305/MobileNet-SSD/ that
 # was trained in Caffe-SSD framework, https://github.com/weiliu89/caffe/tree/ssd.
 # Model detects objects from 20 classes.
 #
 # Also TensorFlow model from TensorFlow object detection model zoo may be used to
 # detect objects from 90 classes:
 # http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_coco_11_06_2017.tar.gz
 # Text graph definition must be taken from opencv_extra:
 # https://github.com/opencv/opencv_extra/tree/master/testdata/dnn/ssd_mobilenet_v1_coco.pbtxt
 import numpy as np
 import argparse
 try:
    import cv2 as cv
 except ImportError:
    raise ImportError('Can\'t find OpenCV Python module. If you\'ve built it from sources without installation, '
                      'configure environment variable PYTHONPATH to "opencv_build_dir/lib" directory (with "python3" subdirectory if required)')
 inWidth = 300
 inHeight = 300
 WHRatio = inWidth / float(inHeight)
 inScaleFactor = 0.007843
 meanVal = 127.5
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Script to run MobileNet-SSD object detection network '
                    'trained either in Caffe or TensorFlow frameworks.')
    parser.add_argument("--video", help="path to video file. If empty, camera's stream will be used")
    parser.add_argument("--prototxt", default="MobileNetSSD_deploy.prototxt",
                                      help='Path to text network file: '
                                           'MobileNetSSD_deploy.prototxt for Caffe model or '
                                           'ssd_mobilenet_v1_coco.pbtxt from opencv_extra for TensorFlow model')
    parser.add_argument("--weights", default="MobileNetSSD_deploy.caffemodel",
                                     help='Path to weights: '
                                          'MobileNetSSD_deploy.caffemodel for Caffe model or '
                                          'frozen_inference_graph.pb from TensorFlow.')
    parser.add_argument("--num_classes", default=20, type=int,
                        help="Number of classes. It's 20 for Caffe model from "
                             "https://github.com/chuanqi305/MobileNet-SSD/ and 90 for "
                             "TensorFlow model from http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_coco_11_06_2017.tar.gz")
    parser.add_argument("--thr", default=0.2, type=float, help="confidence threshold to filter out weak detections")
    args = parser.parse_args()
    if args.num_classes == 20:
        net = cv.dnn.readNetFromCaffe(args.prototxt, args.weights)
        swapRB = False
        classNames = { 0: 'background',
            1: 'aeroplane', 2: 'bicycle', 3: 'bird', 4: 'boat',
            5: 'bottle', 6: 'bus', 7: 'car', 8: 'cat', 9: 'chair',
            10: 'cow', 11: 'diningtable', 12: 'dog', 13: 'horse',
            14: 'motorbike', 15: 'person', 16: 'pottedplant',
            17: 'sheep', 18: 'sofa', 19: 'train', 20: 'tvmonitor' }
    else:
        assert(args.num_classes == 90)
        net = cv.dnn.readNetFromTensorflow(args.weights, args.prototxt)
        swapRB = True
        classNames = { 0: 'background',
            1: 'person', 2: 'bicycle', 3: 'car', 4: 'motorcycle', 5: 'airplane', 6: 'bus',
            7: 'train', 8: 'truck', 9: 'boat', 10: 'traffic light', 11: 'fire hydrant',
            13: 'stop sign', 14: 'parking meter', 15: 'bench', 16: 'bird', 17: 'cat',
            18: 'dog', 19: 'horse', 20: 'sheep', 21: 'cow', 22: 'elephant', 23: 'bear',
            24: 'zebra', 25: 'giraffe', 27: 'backpack', 28: 'umbrella', 31: 'handbag',
            32: 'tie', 33: 'suitcase', 34: 'frisbee', 35: 'skis', 36: 'snowboard',
            37: 'sports ball', 38: 'kite', 39: 'baseball bat', 40: 'baseball glove',
            41: 'skateboard', 42: 'surfboard', 43: 'tennis racket', 44: 'bottle',
            46: 'wine glass', 47: 'cup', 48: 'fork', 49: 'knife', 50: 'spoon',
            51: 'bowl', 52: 'banana', 53: 'apple', 54: 'sandwich', 55: 'orange',
            56: 'broccoli', 57: 'carrot', 58: 'hot dog', 59: 'pizza', 60: 'donut',
            61: 'cake', 62: 'chair', 63: 'couch', 64: 'potted plant', 65: 'bed',
            67: 'dining table', 70: 'toilet', 72: 'tv', 73: 'laptop', 74: 'mouse',
            75: 'remote', 76: 'keyboard', 77: 'cell phone', 78: 'microwave', 79: 'oven',
            80: 'toaster', 81: 'sink', 82: 'refrigerator', 84: 'book', 85: 'clock',
            86: 'vase', 87: 'scissors', 88: 'teddy bear', 89: 'hair drier', 90: 'toothbrush' }
    if args.video:
        cap = cv.VideoCapture(args.video)
    else:
        cap = cv.VideoCapture(0)
    while True:
        # Capture frame-by-frame
        ret, frame = cap.read()
        blob = cv.dnn.blobFromImage(frame, inScaleFactor, (inWidth, inHeight), (meanVal, meanVal, meanVal), swapRB)
        net.setInput(blob)
        detections = net.forward()
        cols = frame.shape[1]
        rows = frame.shape[0]
        if cols / float(rows) > WHRatio:
            cropSize = (int(rows * WHRatio), rows)
        else:
            cropSize = (cols, int(cols / WHRatio))
        y1 = int((rows - cropSize[1]) / 2)
        y2 = y1 + cropSize[1]
        x1 = int((cols - cropSize[0]) / 2)
        x2 = x1 + cropSize[0]
        frame = frame[y1:y2, x1:x2]
        cols = frame.shape[1]
        rows = frame.shape[0]
        for i in range(detections.shape[2]):
            confidence = detections[0, 0, i, 2]
            if confidence > args.thr:
                class_id = int(detections[0, 0, i, 1])
                xLeftBottom = int(detections[0, 0, i, 3] * cols)
                yLeftBottom = int(detections[0, 0, i, 4] * rows)
                xRightTop   = int(detections[0, 0, i, 5] * cols)
                yRightTop   = int(detections[0, 0, i, 6] * rows)
                cv.rectangle(frame, (xLeftBottom, yLeftBottom), (xRightTop, yRightTop),
                              (0, 255, 0))
                if class_id in classNames:
                    label = classNames[class_id] + ": " + str(confidence)
                    labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
                    yLeftBottom = max(yLeftBottom, labelSize[1])
                    cv.rectangle(frame, (xLeftBottom, yLeftBottom - labelSize[1]),
                                         (xLeftBottom + labelSize[0], yLeftBottom + baseLine),
                                         (255, 255, 255), cv.FILLED)
                    cv.putText(frame, label, (xLeftBottom, yLeftBottom),
                                cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
        cv.imshow("detections", frame)
        if cv.waitKey(1) >= 0:
            break
--- a/samples/dnn/object_detection.cpp
+++ b/samples/dnn/object_detection.cpp
@ -0,0 +1,255 @@
 #include <opencv2/opencv.hpp>
 #include <fstream>
 #include <iostream>
 #include <sstream>
 const char* keys =
    "{ help  h     | | Print help message. }"
    "{ input i     | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
    "{ model m     | | Path to a binary file of model contains trained weights. "
                      "It could be a file with extensions .caffemodel (Caffe), "
                      ".pb (TensorFlow), .t7 or .net (Torch), .weights (Darknet) }"
    "{ config c    | | Path to a text file of model contains network configuration. "
                      "It could be a file with extensions .prototxt (Caffe), .pbtxt (TensorFlow), .cfg (Darknet) }"
    "{ framework f | | Optional name of an origin framework of the model. Detect it automatically if it does not set. }"
    "{ classes     | | Optional path to a text file with names of classes to label detected objects. }"
    "{ mean        | | Preprocess input image by subtracting mean values. Mean values should be in BGR order and delimited by spaces. }"
    "{ scale       |  1 | Preprocess input image by multiplying on a scale factor. }"
    "{ width       | -1 | Preprocess input image by resizing to a specific width. }"
    "{ height      | -1 | Preprocess input image by resizing to a specific height. }"
    "{ rgb         |    | Indicate that model works with RGB input images instead BGR ones. }"
    "{ thr         | .5 | Confidence threshold. }"
    "{ opencl      |    | Enable OpenCL }";
 using namespace cv;
 using namespace dnn;
 float confThreshold;
 std::vector<std::string> classes;
 void loadClasses(const std::string& file);
 Net readNet(const std::string& model, const std::string& config = "", const std::string& framework = "");
 void postprocess(Mat& frame, const Mat& out, Net& net);
 void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame);
 void callback(int pos, void* userdata);
 int main(int argc, char** argv)
 {
    CommandLineParser parser(argc, argv, keys);
    parser.about("Use this script to run object detection deep learning networks using OpenCV.");
    if (argc == 1 || parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }
    confThreshold = parser.get<float>("thr");
    float scale = parser.get<float>("scale");
    bool swapRB = parser.get<bool>("rgb");
    int inpWidth = parser.get<int>("width");
    int inpHeight = parser.get<int>("height");
    // Parse mean values.
    Scalar mean;
    if (parser.has("mean"))
    {
        std::istringstream meanStr(parser.get<String>("mean"));
        std::vector<float> meanValues;
        float val;
        while (meanStr >> val)
            meanValues.push_back(val);
        CV_Assert(meanValues.size() == 3);
        mean = Scalar(meanValues[0], meanValues[1], meanValues[2]);
    }
    // Open file with classes names.
    if (parser.has("classes"))
    {
        std::string file = parser.get<String>("classes");
        std::ifstream ifs(file.c_str());
        if (!ifs.is_open())
            CV_Error(Error::StsError, "File " + file + " not found");
        std::string line;
        while (ifs >> line)
        {
            classes.push_back(line);
        }
    }
    // Load a model.
    CV_Assert(parser.has("model"));
    Net net = readNet(parser.get<String>("model"), parser.get<String>("config"), parser.get<String>("framework"));
    if (parser.get<bool>("opencl"))
    {
        net.setPreferableTarget(DNN_TARGET_OPENCL);
    }
    // Create a window
    static const std::string kWinName = "Deep learning object detection in OpenCV";
    namedWindow(kWinName, WINDOW_NORMAL);
    int initialConf = confThreshold * 100;
    createTrackbar("Confidence threshold", kWinName, &initialConf, 99, callback);
    // Open a video file or an image file or a camera stream.
    VideoCapture cap;
    if (parser.has("input"))
        cap.open(parser.get<String>("input"));
    else
        cap.open(0);
    // Process frames.
    Mat frame, blob;
    while (waitKey(1) < 0)
    {
        cap >> frame;
        if (frame.empty())
        {
            waitKey();
            break;
        }
        // Create a 4D blob from a frame.
        Size inpSize(inpWidth > 0 ? inpWidth : frame.cols,
                     inpHeight > 0 ? inpHeight : frame.rows);
        blobFromImage(frame, blob, scale, inpSize, mean, swapRB, false);
        // Run a model.
        net.setInput(blob);
        if (net.getLayer(0)->outputNameToIndex("im_info") != -1)  // Faster-RCNN or R-FCN
        {
            resize(frame, frame, inpSize);
            Mat imInfo = (Mat_<float>(1, 3) << inpSize.height, inpSize.width, 1.6f);
            net.setInput(imInfo, "im_info");
        }
        Mat out = net.forward();
        postprocess(frame, out, net);
        // Put efficiency information.
        std::vector<double> layersTimes;
        double t = net.getPerfProfile(layersTimes);
        std::string label = format("Inference time: %.2f", t * 1000 / getTickFrequency());
        putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar());
        imshow(kWinName, frame);
    }
    return 0;
 }
 void postprocess(Mat& frame, const Mat& out, Net& net)
 {
    static std::vector<int> outLayers = net.getUnconnectedOutLayers();
    static std::string outLayerType = net.getLayer(outLayers[0])->type;
    float* data = (float*)out.data;
    if (net.getLayer(0)->outputNameToIndex("im_info") != -1)  // Faster-RCNN or R-FCN
    {
        // Network produces output blob with a shape 1x1xNx7 where N is a number of
        // detections and an every detection is a vector of values
        // [batchId, classId, confidence, left, top, right, bottom]
        for (size_t i = 0; i < out.total(); i += 7)
        {
            float confidence = data[i + 2];
            if (confidence > confThreshold)
            {
                int left = data[i + 3];
                int top = data[i + 4];
                int right = data[i + 5];
                int bottom = data[i + 6];
                int classId = (int)(data[i + 1]) - 1;  // Skip 0th background class id.
                drawPred(classId, confidence, left, top, right, bottom, frame);
            }
        }
    }
    else if (outLayerType == "DetectionOutput")
    {
        // Network produces output blob with a shape 1x1xNx7 where N is a number of
        // detections and an every detection is a vector of values
        // [batchId, classId, confidence, left, top, right, bottom]
        for (size_t i = 0; i < out.total(); i += 7)
        {
            float confidence = data[i + 2];
            if (confidence > confThreshold)
            {
                int left = (int)(data[i + 3] * frame.cols);
                int top = (int)(data[i + 4] * frame.rows);
                int right = (int)(data[i + 5] * frame.cols);
                int bottom = (int)(data[i + 6] * frame.rows);
                int classId = (int)(data[i + 1]) - 1;  // Skip 0th background class id.
                drawPred(classId, confidence, left, top, right, bottom, frame);
            }
        }
    }
    else if (outLayerType == "Region")
    {
        // Network produces output blob with a shape NxC where N is a number of
        // detected objects and C is a number of classes + 4 where the first 4
        // numbers are [center_x, center_y, width, height]
        for (int i = 0; i < out.rows; ++i, data += out.cols)
        {
            Mat confidences = out.row(i).colRange(5, out.cols);
            Point classIdPoint;
            double confidence;
            minMaxLoc(confidences, 0, &confidence, 0, &classIdPoint);
            if (confidence > confThreshold)
            {
                int classId = classIdPoint.x;
                int centerX = (int)(data[0] * frame.cols);
                int centerY = (int)(data[1] * frame.rows);
                int width = (int)(data[2] * frame.cols);
                int height = (int)(data[3] * frame.rows);
                int left = centerX - width / 2;
                int top = centerY - height / 2;
                drawPred(classId, confidence, left, top, left + width, top + height, frame);
            }
        }
    }
    else
        CV_Error(Error::StsNotImplemented, "Unknown output layer type: " + outLayerType);
 }
 void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame)
 {
    rectangle(frame, Point(left, top), Point(right, bottom), Scalar(0, 255, 0));
    std::string label = format("%.2f", conf);
    if (!classes.empty())
    {
        CV_Assert(classId < (int)classes.size());
        label = classes[classId] + ": " + label;
    }
    int baseLine;
    Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
    top = max(top, labelSize.height);
    rectangle(frame, Point(left, top - labelSize.height),
              Point(left + labelSize.width, top + baseLine), Scalar::all(255), FILLED);
    putText(frame, label, Point(left, top), FONT_HERSHEY_SIMPLEX, 0.5, Scalar());
 }
 void callback(int pos, void*)
 {
    confThreshold = pos * 0.01;
 }
 Net readNet(const std::string& model, const std::string& config, const std::string& framework)
 {
    std::string modelExt = model.substr(model.find('.'));
    if (framework == "caffe" || modelExt == ".caffemodel")
        return readNetFromCaffe(config, model);
    else if (framework == "tensorflow" || modelExt == ".pb")
        return readNetFromTensorflow(model, config);
    else if (framework == "torch" || modelExt == ".t7" || modelExt == ".net")
        return readNetFromTorch(model);
    else if (framework == "darknet" || modelExt == ".weights")
        return readNetFromDarknet(config, model);
    else
        CV_Error(Error::StsError, "Cannot determine an origin framework of model from file " + model);
    return Net();
 }
--- a/samples/dnn/object_detection.py
+++ b/samples/dnn/object_detection.py
@ -0,0 +1,161 @@
 import cv2 as cv
 import argparse
 import sys
 import numpy as np
 parser = argparse.ArgumentParser(description='Use this script to run object detection deep learning networks using OpenCV.')
 parser.add_argument('--input', help='Path to input image or video file. Skip this argument to capture frames from a camera.')
 parser.add_argument('--model', required=True,
                    help='Path to a binary file of model contains trained weights. '
                         'It could be a file with extensions .caffemodel (Caffe), '
                         '.pb (TensorFlow), .t7 or .net (Torch), .weights (Darknet)')
 parser.add_argument('--config',
                    help='Path to a text file of model contains network configuration. '
                         'It could be a file with extensions .prototxt (Caffe), .pbtxt (TensorFlow), .cfg (Darknet)')
 parser.add_argument('--framework', choices=['caffe', 'tensorflow', 'torch', 'darknet'],
                    help='Optional name of an origin framework of the model. '
                         'Detect it automatically if it does not set.')
 parser.add_argument('--classes', help='Optional path to a text file with names of classes to label detected objects.')
 parser.add_argument('--mean', nargs='+', type=float, default=[0, 0, 0],
                    help='Preprocess input image by subtracting mean values. '
                         'Mean values should be in BGR order.')
 parser.add_argument('--scale', type=float, default=1.0,
                    help='Preprocess input image by multiplying on a scale factor.')
 parser.add_argument('--width', type=int,
                    help='Preprocess input image by resizing to a specific width.')
 parser.add_argument('--height', type=int,
                    help='Preprocess input image by resizing to a specific height.')
 parser.add_argument('--rgb', action='store_true',
                    help='Indicate that model works with RGB input images instead BGR ones.')
 parser.add_argument('--thr', type=float, default=0.5, help='Confidence threshold')
 args = parser.parse_args()
 # Load names of classes
 classes = None
 if args.classes:
    with open(args.classes, 'rt') as f:
        classes = f.read().rstrip('\n').split('\n')
 # Load a network
 modelExt = args.model[args.model.find('.'):]
 if args.framework == 'caffe' or modelExt == '.caffemodel':
    net = cv.dnn.readNetFromCaffe(args.config, args.model)
 elif args.framework == 'tensorflow' or modelExt == '.pb':
    net = cv.dnn.readNetFromTensorflow(args.model, args.config)
 elif args.framework == 'torch' or modelExt in ['.t7', '.net']:
    net = cv.dnn.readNetFromTorch(args.model)
 elif args.framework == 'darknet' or modelExt == '.weights':
    net = cv.dnn.readNetFromDarknet(args.config, args.model)
 else:
    print('Cannot determine an origin framework of model from file %s' % args.model)
    sys.exit(0)
 confThreshold = args.thr
 def postprocess(frame, out):
    frameHeight = frame.shape[0]
    frameWidth = frame.shape[1]
    def drawPred(classId, conf, left, top, right, bottom):
        # Draw a bounding box.
        cv.rectangle(frame, (left, top), (right, bottom), (0, 255, 0))
        label = '%.2f' % confidence
        # Print a label of class.
        if classes:
            assert(classId < len(classes))
            label = '%s: %s' % (classes[classId], label)
        labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        top = max(top, labelSize[1])
        cv.rectangle(frame, (left, top - labelSize[1]), (left + labelSize[0], top + baseLine), (255, 255, 255), cv.FILLED)
        cv.putText(frame, label, (left, top), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
    layerNames = net.getLayerNames()
    lastLayerId = net.getLayerId(layerNames[-1])
    lastLayer = net.getLayer(lastLayerId)
    if net.getLayer(0).outputNameToIndex('im_info') != -1:  # Faster-RCNN or R-FCN
        # Network produces output blob with a shape 1x1xNx7 where N is a number of
        # detections and an every detection is a vector of values
        # [batchId, classId, confidence, left, top, right, bottom]
        for detection in out[0, 0]:
            confidence = detection[2]
            if confidence > confThreshold:
                left = int(detection[3])
                top = int(detection[4])
                right = int(detection[5])
                bottom = int(detection[6])
                classId = int(detection[1]) - 1  # Skip background label
                drawPred(classId, confidence, left, top, right, bottom)
    elif lastLayer.type == 'DetectionOutput':
        # Network produces output blob with a shape 1x1xNx7 where N is a number of
        # detections and an every detection is a vector of values
        # [batchId, classId, confidence, left, top, right, bottom]
        for detection in out[0, 0]:
            confidence = detection[2]
            if confidence > confThreshold:
                left = int(detection[3] * frameWidth)
                top = int(detection[4] * frameHeight)
                right = int(detection[5] * frameWidth)
                bottom = int(detection[6] * frameHeight)
                classId = int(detection[1]) - 1  # Skip background label
                drawPred(classId, confidence, left, top, right, bottom)
    elif lastLayer.type == 'Region':
        # Network produces output blob with a shape NxC where N is a number of
        # detected objects and C is a number of classes + 4 where the first 4
        # numbers are [center_x, center_y, width, height]
        for detection in out:
            confidences = detection[5:]
            classId = np.argmax(confidences)
            confidence = confidences[classId]
            if confidence > confThreshold:
                center_x = int(detection[0] * frameWidth)
                center_y = int(detection[1] * frameHeight)
                width = int(detection[2] * frameWidth)
                height = int(detection[3] * frameHeight)
                left = center_x - width / 2
                top = center_y - height / 2
                drawPred(classId, confidence, left, top, left + width, top + height)
 # Process inputs
 winName = 'Deep learning object detection in OpenCV'
 cv.namedWindow(winName, cv.WINDOW_NORMAL)
 def callback(pos):
    global confThreshold
    confThreshold = pos / 100.0
 cv.createTrackbar('Confidence threshold, %', winName, int(confThreshold * 100), 99, callback)
 cap = cv.VideoCapture(args.input if args.input else 0)
 while cv.waitKey(1) < 0:
    hasFrame, frame = cap.read()
    if not hasFrame:
        cv.waitKey()
        break
    frameHeight = frame.shape[0]
    frameWidth = frame.shape[1]
    # Create a 4D blob from a frame.
    inpWidth = args.width if args.width else frameWidth
    inpHeight = args.height if args.height else frameHeight
    blob = cv.dnn.blobFromImage(frame, args.scale, (inpWidth, inpHeight), args.mean, args.rgb, crop=False)
    # Run a model
    net.setInput(blob)
    if net.getLayer(0).outputNameToIndex('im_info') != -1:  # Faster-RCNN or R-FCN
        frame = cv.resize(frame, (inpWidth, inpHeight))
        net.setInput(np.array([inpHeight, inpWidth, 1.6], dtype=np.float32), 'im_info');
    out = net.forward()
    postprocess(frame, out)
    # Put efficiency information.
    t, _ = net.getPerfProfile()
    label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency())
    cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
    cv.imshow(winName, frame)
--- a/samples/dnn/object_detection_classes_coco.txt
+++ b/samples/dnn/object_detection_classes_coco.txt
@ -0,0 +1,90 @@
 person
 bicycle
 car
 motorcycle
 airplane
 bus
 train
 truck
 boat
 traffic light
 fire hydrant
 stop sign
 parking meter
 bench
 bird
 cat
 dog
 horse
 sheep
 cow
 elephant
 bear
 zebra
 giraffe
 backpack
 umbrella
 handbag
 tie
 suitcase
 frisbee
 skis
 snowboard
 sports ball
 kite
 baseball bat
 baseball glove
 skateboard
 surfboard
 tennis racket
 bottle
 wine glass
 cup
 fork
 knife
 spoon
 bowl
 banana
 apple
 sandwich
 orange
 broccoli
 carrot
 hot dog
 pizza
 donut
 cake
 chair
 couch
 potted plant
 bed
 dining table
 toilet
 tv
 laptop
 mouse
 remote
 keyboard
 cell phone
 microwave
 oven
 toaster
 sink
 refrigerator
 book
 clock
 vase
 scissors
 teddy bear
 hair drier
 toothbrush
--- a/samples/dnn/object_detection_classes_pascal_voc.txt
+++ b/samples/dnn/object_detection_classes_pascal_voc.txt
@ -0,0 +1,20 @@
 aeroplane
 bicycle
 bird
 boat
 bottle
 bus
 car
 cat
 chair
 cow
 diningtable
 dog
 horse
 motorbike
 person
 pottedplant
 sheep
 sofa
 train
 tvmonitor
--- a/samples/dnn/resnet_ssd_face.cpp
+++ b/samples/dnn/resnet_ssd_face.cpp
@ -1,164 +0,0 @@
 #include <opencv2/dnn.hpp>
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>
 #include <iostream>
 using namespace cv;
 using namespace std;
 using namespace cv::dnn;
 const size_t inWidth = 300;
 const size_t inHeight = 300;
 const double inScaleFactor = 1.0;
 const Scalar meanVal(104.0, 177.0, 123.0);
 const char* about = "This sample uses Single-Shot Detector "
                    "(https://arxiv.org/abs/1512.02325) "
                    "with ResNet-10 architecture to detect faces on camera/video/image.\n"
                    "More information about the training is available here: "
                    "<OPENCV_SRC_DIR>/samples/dnn/face_detector/how_to_train_face_detector.txt\n"
                    ".caffemodel model's file is available here: "
                    "<OPENCV_SRC_DIR>/samples/dnn/face_detector/res10_300x300_ssd_iter_140000.caffemodel\n"
                    ".prototxt file is available here: "
                    "<OPENCV_SRC_DIR>/samples/dnn/face_detector/deploy.prototxt\n";
 const char* params
    = "{ help           | false | print usage          }"
      "{ proto          |       | model configuration (deploy.prototxt) }"
      "{ model          |       | model weights (res10_300x300_ssd_iter_140000.caffemodel) }"
      "{ camera_device  | 0     | camera device number }"
      "{ video          |       | video or image for detection }"
      "{ opencl         | false | enable OpenCL }"
      "{ min_confidence | 0.5   | min confidence       }";
 int main(int argc, char** argv)
 {
    CommandLineParser parser(argc, argv, params);
    if (parser.get<bool>("help"))
    {
        cout << about << endl;
        parser.printMessage();
        return 0;
    }
    String modelConfiguration = parser.get<string>("proto");
    String modelBinary = parser.get<string>("model");
    //! [Initialize network]
    dnn::Net net = readNetFromCaffe(modelConfiguration, modelBinary);
    //! [Initialize network]
    if (net.empty())
    {
        cerr << "Can't load network by using the following files: " << endl;
        cerr << "prototxt:   " << modelConfiguration << endl;
        cerr << "caffemodel: " << modelBinary << endl;
        cerr << "Models are available here:" << endl;
        cerr << "<OPENCV_SRC_DIR>/samples/dnn/face_detector" << endl;
        cerr << "or here:" << endl;
        cerr << "https://github.com/opencv/opencv/tree/master/samples/dnn/face_detector" << endl;
        exit(-1);
    }
    if (parser.get<bool>("opencl"))
    {
        net.setPreferableTarget(DNN_TARGET_OPENCL);
    }
    VideoCapture cap;
    if (parser.get<String>("video").empty())
    {
        int cameraDevice = parser.get<int>("camera_device");
        cap = VideoCapture(cameraDevice);
        if(!cap.isOpened())
        {
            cout << "Couldn't find camera: " << cameraDevice << endl;
            return -1;
        }
    }
    else
    {
        cap.open(parser.get<String>("video"));
        if(!cap.isOpened())
        {
            cout << "Couldn't open image or video: " << parser.get<String>("video") << endl;
            return -1;
        }
    }
    for(;;)
    {
        Mat frame;
        cap >> frame; // get a new frame from camera/video or read image
        if (frame.empty())
        {
            waitKey();
            break;
        }
        if (frame.channels() == 4)
            cvtColor(frame, frame, COLOR_BGRA2BGR);
        //! [Prepare blob]
        Mat inputBlob = blobFromImage(frame, inScaleFactor,
                                      Size(inWidth, inHeight), meanVal, false, false); //Convert Mat to batch of images
        //! [Prepare blob]
        //! [Set input blob]
        net.setInput(inputBlob, "data"); //set the network input
        //! [Set input blob]
        //! [Make forward pass]
        Mat detection = net.forward("detection_out"); //compute output
        //! [Make forward pass]
        vector<double> layersTimings;
        double freq = getTickFrequency() / 1000;
        double time = net.getPerfProfile(layersTimings) / freq;
        Mat detectionMat(detection.size[2], detection.size[3], CV_32F, detection.ptr<float>());
        ostringstream ss;
        ss << "FPS: " << 1000/time << " ; time: " << time << " ms";
        putText(frame, ss.str(), Point(20,20), 0, 0.5, Scalar(0,0,255));
        float confidenceThreshold = parser.get<float>("min_confidence");
        for(int i = 0; i < detectionMat.rows; i++)
        {
            float confidence = detectionMat.at<float>(i, 2);
            if(confidence > confidenceThreshold)
            {
                int xLeftBottom = static_cast<int>(detectionMat.at<float>(i, 3) * frame.cols);
                int yLeftBottom = static_cast<int>(detectionMat.at<float>(i, 4) * frame.rows);
                int xRightTop = static_cast<int>(detectionMat.at<float>(i, 5) * frame.cols);
                int yRightTop = static_cast<int>(detectionMat.at<float>(i, 6) * frame.rows);
                Rect object((int)xLeftBottom, (int)yLeftBottom,
                            (int)(xRightTop - xLeftBottom),
                            (int)(yRightTop - yLeftBottom));
                rectangle(frame, object, Scalar(0, 255, 0));
                ss.str("");
                ss << confidence;
                String conf(ss.str());
                String label = "Face: " + conf;
                int baseLine = 0;
                Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
                rectangle(frame, Rect(Point(xLeftBottom, yLeftBottom - labelSize.height),
                                      Size(labelSize.width, labelSize.height + baseLine)),
                          Scalar(255, 255, 255), FILLED);
                putText(frame, label, Point(xLeftBottom, yLeftBottom),
                        FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0,0,0));
            }
        }
        imshow("detections", frame);
        if (waitKey(1) >= 0) break;
    }
    return 0;
 } // main
--- a/samples/dnn/resnet_ssd_face_python.py
+++ b/samples/dnn/resnet_ssd_face_python.py
@ -1,55 +0,0 @@
 import numpy as np
 import argparse
 import cv2 as cv
 try:
    import cv2 as cv
 except ImportError:
    raise ImportError('Can\'t find OpenCV Python module. If you\'ve built it from sources without installation, '
                      'configure environment variable PYTHONPATH to "opencv_build_dir/lib" directory (with "python3" subdirectory if required)')
 from cv2 import dnn
 inWidth = 300
 inHeight = 300
 confThreshold = 0.5
 prototxt = 'face_detector/deploy.prototxt'
 caffemodel = 'face_detector/res10_300x300_ssd_iter_140000.caffemodel'
 if __name__ == '__main__':
    net = dnn.readNetFromCaffe(prototxt, caffemodel)
    cap = cv.VideoCapture(0)
    while True:
        ret, frame = cap.read()
        cols = frame.shape[1]
        rows = frame.shape[0]
        net.setInput(dnn.blobFromImage(frame, 1.0, (inWidth, inHeight), (104.0, 177.0, 123.0), False, False))
        detections = net.forward()
        perf_stats = net.getPerfProfile()
        print('Inference time, ms: %.2f' % (perf_stats[0] / cv.getTickFrequency() * 1000))
        for i in range(detections.shape[2]):
            confidence = detections[0, 0, i, 2]
            if confidence > confThreshold:
                xLeftBottom = int(detections[0, 0, i, 3] * cols)
                yLeftBottom = int(detections[0, 0, i, 4] * rows)
                xRightTop = int(detections[0, 0, i, 5] * cols)
                yRightTop = int(detections[0, 0, i, 6] * rows)
                cv.rectangle(frame, (xLeftBottom, yLeftBottom), (xRightTop, yRightTop),
                             (0, 255, 0))
                label = "face: %.4f" % confidence
                labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
                cv.rectangle(frame, (xLeftBottom, yLeftBottom - labelSize[1]),
                                    (xLeftBottom + labelSize[0], yLeftBottom + baseLine),
                                    (255, 255, 255), cv.FILLED)
                cv.putText(frame, label, (xLeftBottom, yLeftBottom),
                           cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
        cv.imshow("detections", frame)
        if cv.waitKey(1) != -1:
            break
--- a/samples/dnn/ssd_mobilenet_object_detection.cpp
+++ b/samples/dnn/ssd_mobilenet_object_detection.cpp
@ -1,187 +0,0 @@
 #include <opencv2/dnn.hpp>
 #include <opencv2/dnn/shape_utils.hpp>
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>
 #include <iostream>
 using namespace cv;
 using namespace std;
 using namespace cv::dnn;
 const size_t inWidth = 300;
 const size_t inHeight = 300;
 const float inScaleFactor = 0.007843f;
 const float meanVal = 127.5;
 const char* classNames[] = {"background",
                            "aeroplane", "bicycle", "bird", "boat",
                            "bottle", "bus", "car", "cat", "chair",
                            "cow", "diningtable", "dog", "horse",
                            "motorbike", "person", "pottedplant",
                            "sheep", "sofa", "train", "tvmonitor"};
 const String keys
    = "{ help           | false | print usage         }"
      "{ proto          | MobileNetSSD_deploy.prototxt   | model configuration }"
      "{ model          | MobileNetSSD_deploy.caffemodel | model weights }"
      "{ camera_device  | 0     | camera device number }"
      "{ camera_width   | 640   | camera device width  }"
      "{ camera_height  | 480   | camera device height }"
      "{ video          |       | video or image for detection}"
      "{ out            |       | path to output video file}"
      "{ min_confidence | 0.2   | min confidence      }"
      "{ opencl         | false | enable OpenCL }"
 ;
 int main(int argc, char** argv)
 {
    CommandLineParser parser(argc, argv, keys);
    parser.about("This sample uses MobileNet Single-Shot Detector "
                 "(https://arxiv.org/abs/1704.04861) "
                 "to detect objects on camera/video/image.\n"
                 ".caffemodel model's file is available here: "
                 "https://github.com/chuanqi305/MobileNet-SSD\n"
                 "Default network is 300x300 and 20-classes VOC.\n");
    if (parser.get<bool>("help"))
    {
        parser.printMessage();
        return 0;
    }
    String modelConfiguration = parser.get<String>("proto");
    String modelBinary = parser.get<String>("model");
    CV_Assert(!modelConfiguration.empty() && !modelBinary.empty());
    //! [Initialize network]
    dnn::Net net = readNetFromCaffe(modelConfiguration, modelBinary);
    //! [Initialize network]
    if (parser.get<bool>("opencl"))
    {
        net.setPreferableTarget(DNN_TARGET_OPENCL);
    }
    if (net.empty())
    {
        cerr << "Can't load network by using the following files: " << endl;
        cerr << "prototxt:   " << modelConfiguration << endl;
        cerr << "caffemodel: " << modelBinary << endl;
        cerr << "Models can be downloaded here:" << endl;
        cerr << "https://github.com/chuanqi305/MobileNet-SSD" << endl;
        exit(-1);
    }
    VideoCapture cap;
    if (!parser.has("video"))
    {
        int cameraDevice = parser.get<int>("camera_device");
        cap = VideoCapture(cameraDevice);
        if(!cap.isOpened())
        {
            cout << "Couldn't find camera: " << cameraDevice << endl;
            return -1;
        }
        cap.set(CAP_PROP_FRAME_WIDTH, parser.get<int>("camera_width"));
        cap.set(CAP_PROP_FRAME_HEIGHT, parser.get<int>("camera_height"));
    }
    else
    {
        cap.open(parser.get<String>("video"));
        if(!cap.isOpened())
        {
            cout << "Couldn't open image or video: " << parser.get<String>("video") << endl;
            return -1;
        }
    }
    //Acquire input size
    Size inVideoSize((int) cap.get(CAP_PROP_FRAME_WIDTH),
                     (int) cap.get(CAP_PROP_FRAME_HEIGHT));
    double fps = cap.get(CAP_PROP_FPS);
    int fourcc = static_cast<int>(cap.get(CAP_PROP_FOURCC));
    VideoWriter outputVideo;
    outputVideo.open(parser.get<String>("out") ,
                     (fourcc != 0 ? fourcc : VideoWriter::fourcc('M','J','P','G')),
                     (fps != 0 ? fps : 10.0), inVideoSize, true);
    for(;;)
    {
        Mat frame;
        cap >> frame; // get a new frame from camera/video or read image
        if (frame.empty())
        {
            waitKey();
            break;
        }
        if (frame.channels() == 4)
            cvtColor(frame, frame, COLOR_BGRA2BGR);
        //! [Prepare blob]
        Mat inputBlob = blobFromImage(frame, inScaleFactor,
                                      Size(inWidth, inHeight),
                                      Scalar(meanVal, meanVal, meanVal),
                                      false, false); //Convert Mat to batch of images
        //! [Prepare blob]
        //! [Set input blob]
        net.setInput(inputBlob); //set the network input
        //! [Set input blob]
        //! [Make forward pass]
        Mat detection = net.forward(); //compute output
        //! [Make forward pass]
        vector<double> layersTimings;
        double freq = getTickFrequency() / 1000;
        double time = net.getPerfProfile(layersTimings) / freq;
        Mat detectionMat(detection.size[2], detection.size[3], CV_32F, detection.ptr<float>());
        if (!outputVideo.isOpened())
        {
            putText(frame, format("FPS: %.2f ; time: %.2f ms", 1000.f/time, time),
                    Point(20,20), 0, 0.5, Scalar(0,0,255));
        }
        else
            cout << "Inference time, ms: " << time << endl;
        float confidenceThreshold = parser.get<float>("min_confidence");
        for(int i = 0; i < detectionMat.rows; i++)
        {
            float confidence = detectionMat.at<float>(i, 2);
            if(confidence > confidenceThreshold)
            {
                size_t objectClass = (size_t)(detectionMat.at<float>(i, 1));
                int left = static_cast<int>(detectionMat.at<float>(i, 3) * frame.cols);
                int top = static_cast<int>(detectionMat.at<float>(i, 4) * frame.rows);
                int right = static_cast<int>(detectionMat.at<float>(i, 5) * frame.cols);
                int bottom = static_cast<int>(detectionMat.at<float>(i, 6) * frame.rows);
                rectangle(frame, Point(left, top), Point(right, bottom), Scalar(0, 255, 0));
                String label = format("%s: %.2f", classNames[objectClass], confidence);
                int baseLine = 0;
                Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
                top = max(top, labelSize.height);
                rectangle(frame, Point(left, top - labelSize.height),
                          Point(left + labelSize.width, top + baseLine),
                          Scalar(255, 255, 255), FILLED);
                putText(frame, label, Point(left, top),
                        FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0,0,0));
            }
        }
        if (outputVideo.isOpened())
            outputVideo << frame;
        imshow("detections", frame);
        if (waitKey(1) >= 0) break;
    }
    return 0;
 } // main
--- a/samples/dnn/ssd_object_detection.cpp
+++ b/samples/dnn/ssd_object_detection.cpp
@ -1,156 +0,0 @@
 #include <opencv2/dnn.hpp>
 #include <opencv2/dnn/shape_utils.hpp>
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>
 #include <iostream>
 using namespace cv;
 using namespace std;
 using namespace cv::dnn;
 const char* classNames[] = {"background",
                            "aeroplane", "bicycle", "bird", "boat",
                            "bottle", "bus", "car", "cat", "chair",
                            "cow", "diningtable", "dog", "horse",
                            "motorbike", "person", "pottedplant",
                            "sheep", "sofa", "train", "tvmonitor"};
 const char* about = "This sample uses Single-Shot Detector "
                    "(https://arxiv.org/abs/1512.02325) "
                    "to detect objects on camera/video/image.\n"
                    ".caffemodel model's file is available here: "
                    "https://github.com/weiliu89/caffe/tree/ssd#models\n"
                    "Default network is 300x300 and 20-classes VOC.\n";
 const char* params
    = "{ help           | false | print usage         }"
      "{ proto          |       | model configuration }"
      "{ model          |       | model weights       }"
      "{ camera_device  | 0     | camera device number}"
      "{ video          |       | video or image for detection}"
      "{ min_confidence | 0.5   | min confidence      }";
 int main(int argc, char** argv)
 {
    cv::CommandLineParser parser(argc, argv, params);
    if (parser.get<bool>("help"))
    {
        cout << about << endl;
        parser.printMessage();
        return 0;
    }
    String modelConfiguration = parser.get<string>("proto");
    String modelBinary = parser.get<string>("model");
    //! [Initialize network]
    dnn::Net net = readNetFromCaffe(modelConfiguration, modelBinary);
    //! [Initialize network]
    if (net.empty())
    {
        cerr << "Can't load network by using the following files: " << endl;
        cerr << "prototxt:   " << modelConfiguration << endl;
        cerr << "caffemodel: " << modelBinary << endl;
        cerr << "Models can be downloaded here:" << endl;
        cerr << "https://github.com/weiliu89/caffe/tree/ssd#models" << endl;
        exit(-1);
    }
    VideoCapture cap;
    if (parser.get<String>("video").empty())
    {
        int cameraDevice = parser.get<int>("camera_device");
        cap = VideoCapture(cameraDevice);
        if(!cap.isOpened())
        {
            cout << "Couldn't find camera: " << cameraDevice << endl;
            return -1;
        }
    }
    else
    {
        cap.open(parser.get<String>("video"));
        if(!cap.isOpened())
        {
            cout << "Couldn't open image or video: " << parser.get<String>("video") << endl;
            return -1;
        }
    }
    for (;;)
    {
        cv::Mat frame;
        cap >> frame; // get a new frame from camera/video or read image
        if (frame.empty())
        {
            waitKey();
            break;
        }
        if (frame.channels() == 4)
            cvtColor(frame, frame, COLOR_BGRA2BGR);
        //! [Prepare blob]
        Mat inputBlob = blobFromImage(frame, 1.0f, Size(300, 300), Scalar(104, 117, 123), false, false); //Convert Mat to batch of images
        //! [Prepare blob]
        //! [Set input blob]
        net.setInput(inputBlob, "data"); //set the network input
        //! [Set input blob]
        //! [Make forward pass]
        Mat detection = net.forward("detection_out"); //compute output
        //! [Make forward pass]
        vector<double> layersTimings;
        double freq = getTickFrequency() / 1000;
        double time = net.getPerfProfile(layersTimings) / freq;
        ostringstream ss;
        ss << "FPS: " << 1000/time << " ; time: " << time << " ms";
        putText(frame, ss.str(), Point(20,20), 0, 0.5, Scalar(0,0,255));
        Mat detectionMat(detection.size[2], detection.size[3], CV_32F, detection.ptr<float>());
        float confidenceThreshold = parser.get<float>("min_confidence");
        for(int i = 0; i < detectionMat.rows; i++)
        {
            float confidence = detectionMat.at<float>(i, 2);
            if(confidence > confidenceThreshold)
            {
                size_t objectClass = (size_t)(detectionMat.at<float>(i, 1));
                int xLeftBottom = static_cast<int>(detectionMat.at<float>(i, 3) * frame.cols);
                int yLeftBottom = static_cast<int>(detectionMat.at<float>(i, 4) * frame.rows);
                int xRightTop = static_cast<int>(detectionMat.at<float>(i, 5) * frame.cols);
                int yRightTop = static_cast<int>(detectionMat.at<float>(i, 6) * frame.rows);
                ss.str("");
                ss << confidence;
                String conf(ss.str());
                Rect object(xLeftBottom, yLeftBottom,
                            xRightTop - xLeftBottom,
                            yRightTop - yLeftBottom);
                rectangle(frame, object, Scalar(0, 255, 0));
                String label = String(classNames[objectClass]) + ": " + conf;
                int baseLine = 0;
                Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
                rectangle(frame, Rect(Point(xLeftBottom, yLeftBottom - labelSize.height),
                                      Size(labelSize.width, labelSize.height + baseLine)),
                          Scalar(255, 255, 255), FILLED);
                putText(frame, label, Point(xLeftBottom, yLeftBottom),
                        FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0,0,0));
            }
        }
        imshow("detections", frame);
        if (waitKey(1) >= 0) break;
    }
    return 0;
 } // main
--- a/samples/dnn/yolo_object_detection.cpp
+++ b/samples/dnn/yolo_object_detection.cpp
@ -1,185 +0,0 @@
 // Brief Sample of using OpenCV dnn module in real time with device capture, video and image.
 // VIDEO DEMO: https://www.youtube.com/watch?v=NHtRlndE2cg
 #include <opencv2/dnn.hpp>
 #include <opencv2/dnn/shape_utils.hpp>
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>
 #include <fstream>
 #include <iostream>
 using namespace std;
 using namespace cv;
 using namespace cv::dnn;
 static const char* about =
 "This sample uses You only look once (YOLO)-Detector (https://arxiv.org/abs/1612.08242) to detect objects on camera/video/image.\n"
 "Models can be downloaded here: https://pjreddie.com/darknet/yolo/\n"
 "Default network is 416x416.\n"
 "Class names can be downloaded here: https://github.com/pjreddie/darknet/tree/master/data\n";
 static const char* params =
 "{ help           | false | print usage         }"
 "{ cfg            |       | model configuration }"
 "{ model          |       | model weights       }"
 "{ camera_device  | 0     | camera device number}"
 "{ source         |       | video or image for detection}"
 "{ out            |       | path to output video file}"
 "{ fps            | 3     | frame per second }"
 "{ style          | box   | box or line style draw }"
 "{ min_confidence | 0.24  | min confidence      }"
 "{ class_names    |       | File with class names, [PATH-TO-DARKNET]/data/coco.names }";
 int main(int argc, char** argv)
 {
    CommandLineParser parser(argc, argv, params);
    if (parser.get<bool>("help"))
    {
        cout << about << endl;
        parser.printMessage();
        return 0;
    }
    String modelConfiguration = parser.get<String>("cfg");
    String modelBinary = parser.get<String>("model");
    //! [Initialize network]
    dnn::Net net = readNetFromDarknet(modelConfiguration, modelBinary);
    //! [Initialize network]
    if (net.empty())
    {
        cerr << "Can't load network by using the following files: " << endl;
        cerr << "cfg-file:     " << modelConfiguration << endl;
        cerr << "weights-file: " << modelBinary << endl;
        cerr << "Models can be downloaded here:" << endl;
        cerr << "https://pjreddie.com/darknet/yolo/" << endl;
        exit(-1);
    }
    VideoCapture cap;
    VideoWriter writer;
    int codec = CV_FOURCC('M', 'J', 'P', 'G');
    double fps = parser.get<float>("fps");
    if (parser.get<String>("source").empty())
    {
        int cameraDevice = parser.get<int>("camera_device");
        cap = VideoCapture(cameraDevice);
        if(!cap.isOpened())
        {
            cout << "Couldn't find camera: " << cameraDevice << endl;
            return -1;
        }
    }
    else
    {
        cap.open(parser.get<String>("source"));
        if(!cap.isOpened())
        {
            cout << "Couldn't open image or video: " << parser.get<String>("video") << endl;
            return -1;
        }
    }
    if(!parser.get<String>("out").empty())
    {
        writer.open(parser.get<String>("out"), codec, fps, Size((int)cap.get(CAP_PROP_FRAME_WIDTH),(int)cap.get(CAP_PROP_FRAME_HEIGHT)), 1);
    }
    vector<String> classNamesVec;
    ifstream classNamesFile(parser.get<String>("class_names").c_str());
    if (classNamesFile.is_open())
    {
        string className = "";
        while (std::getline(classNamesFile, className))
            classNamesVec.push_back(className);
    }
    String object_roi_style = parser.get<String>("style");
    for(;;)
    {
        Mat frame;
        cap >> frame; // get a new frame from camera/video or read image
        if (frame.empty())
        {
            waitKey();
            break;
        }
        if (frame.channels() == 4)
            cvtColor(frame, frame, COLOR_BGRA2BGR);
        //! [Prepare blob]
        Mat inputBlob = blobFromImage(frame, 1 / 255.F, Size(416, 416), Scalar(), true, false); //Convert Mat to batch of images
        //! [Prepare blob]
        //! [Set input blob]
        net.setInput(inputBlob, "data");                   //set the network input
        //! [Set input blob]
        //! [Make forward pass]
        Mat detectionMat = net.forward("detection_out");   //compute output
        //! [Make forward pass]
        vector<double> layersTimings;
        double tick_freq = getTickFrequency();
        double time_ms = net.getPerfProfile(layersTimings) / tick_freq * 1000;
        putText(frame, format("FPS: %.2f ; time: %.2f ms", 1000.f / time_ms, time_ms),
                Point(20, 20), 0, 0.5, Scalar(0, 0, 255));
        float confidenceThreshold = parser.get<float>("min_confidence");
        for (int i = 0; i < detectionMat.rows; i++)
        {
            const int probability_index = 5;
            const int probability_size = detectionMat.cols - probability_index;
            float *prob_array_ptr = &detectionMat.at<float>(i, probability_index);
            size_t objectClass = max_element(prob_array_ptr, prob_array_ptr + probability_size) - prob_array_ptr;
            float confidence = detectionMat.at<float>(i, (int)objectClass + probability_index);
            if (confidence > confidenceThreshold)
            {
                float x_center = detectionMat.at<float>(i, 0) * frame.cols;
                float y_center = detectionMat.at<float>(i, 1) * frame.rows;
                float width = detectionMat.at<float>(i, 2) * frame.cols;
                float height = detectionMat.at<float>(i, 3) * frame.rows;
                Point p1(cvRound(x_center - width / 2), cvRound(y_center - height / 2));
                Point p2(cvRound(x_center + width / 2), cvRound(y_center + height / 2));
                Rect object(p1, p2);
                Scalar object_roi_color(0, 255, 0);
                if (object_roi_style == "box")
                {
                    rectangle(frame, object, object_roi_color);
                }
                else
                {
                    Point p_center(cvRound(x_center), cvRound(y_center));
                    line(frame, object.tl(), p_center, object_roi_color, 1);
                }
                String className = objectClass < classNamesVec.size() ? classNamesVec[objectClass] : cv::format("unknown(%d)", objectClass);
                String label = format("%s: %.2f", className.c_str(), confidence);
                int baseLine = 0;
                Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
                rectangle(frame, Rect(p1, Size(labelSize.width, labelSize.height + baseLine)),
                          object_roi_color, FILLED);
                putText(frame, label, p1 + Point(0, labelSize.height),
                        FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0,0,0));
            }
        }
        if(writer.isOpened())
        {
            writer.write(frame);
        }
        imshow("YOLO: Detections", frame);
        if (waitKey(1) >= 0) break;
    }
    return 0;
 } // main