From 48e07437f0ffd9f03ae58530e3aec57b03ee0e6b Mon Sep 17 00:00:00 2001 From: catree Date: Wed, 25 Oct 2017 22:02:35 +0200 Subject: [PATCH] Add camera/video/image input for C++ DNN object detection samples. Add nice display and computation time. --- .../dnn/ssd_mobilenet_object_detection.cpp | 76 +++++-- samples/dnn/ssd_object_detection.cpp | 156 +++++++++----- samples/dnn/yolo_object_detection.cpp | 198 ++++++++++++------ 3 files changed, 298 insertions(+), 132 deletions(-) diff --git a/samples/dnn/ssd_mobilenet_object_detection.cpp b/samples/dnn/ssd_mobilenet_object_detection.cpp index 283c08a36c..895e548bf8 100644 --- a/samples/dnn/ssd_mobilenet_object_detection.cpp +++ b/samples/dnn/ssd_mobilenet_object_detection.cpp @@ -23,23 +23,25 @@ const char* classNames[] = {"background", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"}; -const char* about = "This sample uses Single-Shot Detector " - "(https://arxiv.org/abs/1512.02325)" - "to detect objects on image.\n" - ".caffemodel model's file is avaliable here: " - "https://github.com/chuanqi305/MobileNet-SSD\n"; +const char* about = "This sample uses MobileNet Single-Shot Detector " + "(https://arxiv.org/abs/1704.04861) " + "to detect objects on camera/video/image.\n" + ".caffemodel model's file is available here: " + "https://github.com/chuanqi305/MobileNet-SSD\n" + "Default network is 300x300 and 20-classes VOC.\n"; const char* params = "{ help | false | print usage }" "{ proto | MobileNetSSD_deploy.prototxt | model configuration }" "{ model | MobileNetSSD_deploy.caffemodel | model weights }" - "{ video | | video for detection }" + "{ camera_device | 0 | camera device number }" + "{ video | | video or image for detection}" "{ out | | path to output video file}" "{ min_confidence | 0.2 | min confidence }"; int main(int argc, char** argv) { - cv::CommandLineParser parser(argc, argv, params); + CommandLineParser parser(argc, argv, params); if (parser.get("help")) { @@ -55,19 +57,40 @@ int main(int argc, char** argv) dnn::Net net = readNetFromCaffe(modelConfiguration, modelBinary); //! [Initialize network] - VideoCapture cap(parser.get("video")); - if(!cap.isOpened()) // check if we succeeded + if (net.empty()) { - cap = VideoCapture(0); + cerr << "Can't load network by using the following files: " << endl; + cerr << "prototxt: " << modelConfiguration << endl; + cerr << "caffemodel: " << modelBinary << endl; + cerr << "Models can be downloaded here:" << endl; + cerr << "https://github.com/chuanqi305/MobileNet-SSD" << endl; + exit(-1); + } + + VideoCapture cap; + if (parser.get("video").empty()) + { + int cameraDevice = parser.get("camera_device"); + cap = VideoCapture(cameraDevice); if(!cap.isOpened()) { - cout << "Couldn't find camera" << endl; + cout << "Couldn't find camera: " << cameraDevice << endl; + return -1; + } + } + else + { + cap.open(parser.get("video")); + if(!cap.isOpened()) + { + cout << "Couldn't open image or video: " << parser.get("video") << endl; return -1; } } - Size inVideoSize = Size((int) cap.get(CV_CAP_PROP_FRAME_WIDTH), //Acquire input size - (int) cap.get(CV_CAP_PROP_FRAME_HEIGHT)); + Size inVideoSize; + inVideoSize = Size((int) cap.get(CV_CAP_PROP_FRAME_WIDTH), //Acquire input size + (int) cap.get(CV_CAP_PROP_FRAME_HEIGHT)); Size cropSize; if (inVideoSize.width / (float)inVideoSize.height > WHRatio) @@ -93,9 +116,18 @@ int main(int argc, char** argv) for(;;) { Mat frame; - cap >> frame; // get a new frame from camera - //! [Prepare blob] + cap >> frame; // get a new frame from camera/video or read image + if (frame.empty()) + { + waitKey(); + break; + } + + if (frame.channels() == 4) + cvtColor(frame, frame, COLOR_BGRA2BGR); + + //! [Prepare blob] Mat inputBlob = blobFromImage(frame, inScaleFactor, Size(inWidth, inHeight), meanVal, false); //Convert Mat to batch of images //! [Prepare blob] @@ -108,15 +140,23 @@ int main(int argc, char** argv) Mat detection = net.forward("detection_out"); //compute output //! [Make forward pass] - std::vector layersTimings; + vector layersTimings; double freq = getTickFrequency() / 1000; double time = net.getPerfProfile(layersTimings) / freq; - cout << "Inference time, ms: " << time << endl; Mat detectionMat(detection.size[2], detection.size[3], CV_32F, detection.ptr()); frame = frame(crop); + ostringstream ss; + if (!outputVideo.isOpened()) + { + ss << "FPS: " << 1000/time << " ; time: " << time << " ms"; + putText(frame, ss.str(), Point(20,20), 0, 0.5, Scalar(0,0,255)); + } + else + cout << "Inference time, ms: " << time << endl; + float confidenceThreshold = parser.get("min_confidence"); for(int i = 0; i < detectionMat.rows; i++) { @@ -131,7 +171,7 @@ int main(int argc, char** argv) int xRightTop = static_cast(detectionMat.at(i, 5) * frame.cols); int yRightTop = static_cast(detectionMat.at(i, 6) * frame.rows); - ostringstream ss; + ss.str(""); ss << confidence; String conf(ss.str()); diff --git a/samples/dnn/ssd_object_detection.cpp b/samples/dnn/ssd_object_detection.cpp index 09e983f2b3..3f89510f9f 100644 --- a/samples/dnn/ssd_object_detection.cpp +++ b/samples/dnn/ssd_object_detection.cpp @@ -40,15 +40,26 @@ static Mat preprocess(const Mat& frame) return preprocessed; } +const char* classNames[] = {"background", + "aeroplane", "bicycle", "bird", "boat", + "bottle", "bus", "car", "cat", "chair", + "cow", "diningtable", "dog", "horse", + "motorbike", "person", "pottedplant", + "sheep", "sofa", "train", "tvmonitor"}; + const char* about = "This sample uses Single-Shot Detector " - "(https://arxiv.org/abs/1512.02325)" - "to detect objects on image\n"; // TODO: link + "(https://arxiv.org/abs/1512.02325) " + "to detect objects on camera/video/image.\n" + ".caffemodel model's file is available here: " + "https://github.com/weiliu89/caffe/tree/ssd#models\n" + "Default network is 300x300 and 20-classes VOC.\n"; const char* params = "{ help | false | print usage }" "{ proto | | model configuration }" "{ model | | model weights }" - "{ image | | image for detection }" + "{ camera_device | 0 | camera device number}" + "{ video | | video or image for detection}" "{ min_confidence | 0.5 | min confidence }"; int main(int argc, char** argv) @@ -57,7 +68,7 @@ int main(int argc, char** argv) if (parser.get("help")) { - std::cout << about << std::endl; + cout << about << endl; parser.printMessage(); return 0; } @@ -79,58 +90,101 @@ int main(int argc, char** argv) exit(-1); } - cv::Mat frame = cv::imread(parser.get("image"), -1); - - if (frame.channels() == 4) - cvtColor(frame, frame, COLOR_BGRA2BGR); - //! [Prepare blob] - Mat preprocessedFrame = preprocess(frame); - - Mat inputBlob = blobFromImage(preprocessedFrame, 1.0f, Size(), Scalar(), false); //Convert Mat to batch of images - //! [Prepare blob] - - //! [Set input blob] - net.setInput(inputBlob, "data"); //set the network input - //! [Set input blob] - - //! [Make forward pass] - Mat detection = net.forward("detection_out"); //compute output - //! [Make forward pass] - - Mat detectionMat(detection.size[2], detection.size[3], CV_32F, detection.ptr()); - - float confidenceThreshold = parser.get("min_confidence"); - for(int i = 0; i < detectionMat.rows; i++) + VideoCapture cap; + if (parser.get("video").empty()) { - float confidence = detectionMat.at(i, 2); - - if(confidence > confidenceThreshold) + int cameraDevice = parser.get("camera_device"); + cap = VideoCapture(cameraDevice); + if(!cap.isOpened()) { - size_t objectClass = (size_t)(detectionMat.at(i, 1)); - - float xLeftBottom = detectionMat.at(i, 3) * frame.cols; - float yLeftBottom = detectionMat.at(i, 4) * frame.rows; - float xRightTop = detectionMat.at(i, 5) * frame.cols; - float yRightTop = detectionMat.at(i, 6) * frame.rows; - - std::cout << "Class: " << objectClass << std::endl; - std::cout << "Confidence: " << confidence << std::endl; - - std::cout << " " << xLeftBottom - << " " << yLeftBottom - << " " << xRightTop - << " " << yRightTop << std::endl; - - Rect object((int)xLeftBottom, (int)yLeftBottom, - (int)(xRightTop - xLeftBottom), - (int)(yRightTop - yLeftBottom)); - - rectangle(frame, object, Scalar(0, 255, 0)); + cout << "Couldn't find camera: " << cameraDevice << endl; + return -1; + } + } + else + { + cap.open(parser.get("video")); + if(!cap.isOpened()) + { + cout << "Couldn't open image or video: " << parser.get("video") << endl; + return -1; } } - imshow("detections", frame); - waitKey(); + for (;;) + { + cv::Mat frame; + cap >> frame; // get a new frame from camera/video or read image + + if (frame.empty()) + { + waitKey(); + break; + } + + if (frame.channels() == 4) + cvtColor(frame, frame, COLOR_BGRA2BGR); + + //! [Prepare blob] + Mat preprocessedFrame = preprocess(frame); + + Mat inputBlob = blobFromImage(preprocessedFrame, 1.0f, Size(), Scalar(), false); //Convert Mat to batch of images + //! [Prepare blob] + + //! [Set input blob] + net.setInput(inputBlob, "data"); //set the network input + //! [Set input blob] + + //! [Make forward pass] + Mat detection = net.forward("detection_out"); //compute output + //! [Make forward pass] + + vector layersTimings; + double freq = getTickFrequency() / 1000; + double time = net.getPerfProfile(layersTimings) / freq; + ostringstream ss; + ss << "FPS: " << 1000/time << " ; time: " << time << " ms"; + putText(frame, ss.str(), Point(20,20), 0, 0.5, Scalar(0,0,255)); + + Mat detectionMat(detection.size[2], detection.size[3], CV_32F, detection.ptr()); + + float confidenceThreshold = parser.get("min_confidence"); + for(int i = 0; i < detectionMat.rows; i++) + { + float confidence = detectionMat.at(i, 2); + + if(confidence > confidenceThreshold) + { + size_t objectClass = (size_t)(detectionMat.at(i, 1)); + + int xLeftBottom = static_cast(detectionMat.at(i, 3) * frame.cols); + int yLeftBottom = static_cast(detectionMat.at(i, 4) * frame.rows); + int xRightTop = static_cast(detectionMat.at(i, 5) * frame.cols); + int yRightTop = static_cast(detectionMat.at(i, 6) * frame.rows); + + ss.str(""); + ss << confidence; + String conf(ss.str()); + + Rect object(xLeftBottom, yLeftBottom, + xRightTop - xLeftBottom, + yRightTop - yLeftBottom); + + rectangle(frame, object, Scalar(0, 255, 0)); + String label = String(classNames[objectClass]) + ": " + conf; + int baseLine = 0; + Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); + rectangle(frame, Rect(Point(xLeftBottom, yLeftBottom - labelSize.height), + Size(labelSize.width, labelSize.height + baseLine)), + Scalar(255, 255, 255), CV_FILLED); + putText(frame, label, Point(xLeftBottom, yLeftBottom), + FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0,0,0)); + } + } + + imshow("detections", frame); + if (waitKey(1) >= 0) break; + } return 0; } // main diff --git a/samples/dnn/yolo_object_detection.cpp b/samples/dnn/yolo_object_detection.cpp index 0731ad25a8..8302e78311 100644 --- a/samples/dnn/yolo_object_detection.cpp +++ b/samples/dnn/yolo_object_detection.cpp @@ -15,29 +15,36 @@ const size_t network_width = 416; const size_t network_height = 416; const char* about = "This sample uses You only look once (YOLO)-Detector " - "(https://arxiv.org/abs/1612.08242)" - "to detect objects on image\n"; // TODO: link + "(https://arxiv.org/abs/1612.08242) " + "to detect objects on camera/video/image.\n" + "Models can be downloaded here: " + "https://pjreddie.com/darknet/yolo/\n" + "Default network is 416x416.\n" + "Class names can be downloaded here: " + "https://github.com/pjreddie/darknet/tree/master/data\n"; const char* params = "{ help | false | print usage }" "{ cfg | | model configuration }" "{ model | | model weights }" - "{ image | | image for detection }" - "{ min_confidence | 0.24 | min confidence }"; + "{ camera_device | 0 | camera device number}" + "{ video | | video or image for detection}" + "{ min_confidence | 0.24 | min confidence }" + "{ class_names | | class names }"; int main(int argc, char** argv) { - cv::CommandLineParser parser(argc, argv, params); + CommandLineParser parser(argc, argv, params); if (parser.get("help")) { - std::cout << about << std::endl; + cout << about << endl; parser.printMessage(); return 0; } - String modelConfiguration = parser.get("cfg"); - String modelBinary = parser.get("model"); + String modelConfiguration = parser.get("cfg"); + String modelBinary = parser.get("model"); //! [Initialize network] dnn::Net net = readNetFromDarknet(modelConfiguration, modelBinary); @@ -53,65 +60,130 @@ int main(int argc, char** argv) exit(-1); } - cv::Mat frame = cv::imread(parser.get("image")); - - //! [Resizing without keeping aspect ratio] - cv::Mat resized; - cv::resize(frame, resized, cv::Size(network_width, network_height)); - //! [Resizing without keeping aspect ratio] - - //! [Prepare blob] - Mat inputBlob = blobFromImage(resized, 1 / 255.F); //Convert Mat to batch of images - //! [Prepare blob] - - //! [Set input blob] - net.setInput(inputBlob, "data"); //set the network input - //! [Set input blob] - - //! [Make forward pass] - cv::Mat detectionMat = net.forward("detection_out"); //compute output - //! [Make forward pass] - - - float confidenceThreshold = parser.get("min_confidence"); - for (int i = 0; i < detectionMat.rows; i++) + VideoCapture cap; + if (parser.get("video").empty()) { - const int probability_index = 5; - const int probability_size = detectionMat.cols - probability_index; - float *prob_array_ptr = &detectionMat.at(i, probability_index); - - size_t objectClass = std::max_element(prob_array_ptr, prob_array_ptr + probability_size) - prob_array_ptr; - float confidence = detectionMat.at(i, (int)objectClass + probability_index); - - if (confidence > confidenceThreshold) + int cameraDevice = parser.get("camera_device"); + cap = VideoCapture(cameraDevice); + if(!cap.isOpened()) { - float x = detectionMat.at(i, 0); - float y = detectionMat.at(i, 1); - float width = detectionMat.at(i, 2); - float height = detectionMat.at(i, 3); - float xLeftBottom = (x - width / 2) * frame.cols; - float yLeftBottom = (y - height / 2) * frame.rows; - float xRightTop = (x + width / 2) * frame.cols; - float yRightTop = (y + height / 2) * frame.rows; - - std::cout << "Class: " << objectClass << std::endl; - std::cout << "Confidence: " << confidence << std::endl; - - std::cout << " " << xLeftBottom - << " " << yLeftBottom - << " " << xRightTop - << " " << yRightTop << std::endl; - - Rect object((int)xLeftBottom, (int)yLeftBottom, - (int)(xRightTop - xLeftBottom), - (int)(yRightTop - yLeftBottom)); - - rectangle(frame, object, Scalar(0, 255, 0)); + cout << "Couldn't find camera: " << cameraDevice << endl; + return -1; + } + } + else + { + cap.open(parser.get("video")); + if(!cap.isOpened()) + { + cout << "Couldn't open image or video: " << parser.get("video") << endl; + return -1; } } - imshow("detections", frame); - waitKey(); + vector classNamesVec; + ifstream classNamesFile(parser.get("class_names").c_str()); + if (classNamesFile.is_open()) + { + string className = ""; + while (classNamesFile >> className) + classNamesVec.push_back(className); + } + + for(;;) + { + Mat frame; + cap >> frame; // get a new frame from camera/video or read image + + if (frame.empty()) + { + waitKey(); + break; + } + + if (frame.channels() == 4) + cvtColor(frame, frame, COLOR_BGRA2BGR); + + //! [Resizing without keeping aspect ratio] + Mat resized; + resize(frame, resized, Size(network_width, network_height)); + //! [Resizing without keeping aspect ratio] + + //! [Prepare blob] + Mat inputBlob = blobFromImage(resized, 1 / 255.F); //Convert Mat to batch of images + //! [Prepare blob] + + //! [Set input blob] + net.setInput(inputBlob, "data"); //set the network input + //! [Set input blob] + + //! [Make forward pass] + Mat detectionMat = net.forward("detection_out"); //compute output + //! [Make forward pass] + + vector layersTimings; + double freq = getTickFrequency() / 1000; + double time = net.getPerfProfile(layersTimings) / freq; + ostringstream ss; + ss << "FPS: " << 1000/time << " ; time: " << time << " ms"; + putText(frame, ss.str(), Point(20,20), 0, 0.5, Scalar(0,0,255)); + + float confidenceThreshold = parser.get("min_confidence"); + for (int i = 0; i < detectionMat.rows; i++) + { + const int probability_index = 5; + const int probability_size = detectionMat.cols - probability_index; + float *prob_array_ptr = &detectionMat.at(i, probability_index); + + size_t objectClass = max_element(prob_array_ptr, prob_array_ptr + probability_size) - prob_array_ptr; + float confidence = detectionMat.at(i, (int)objectClass + probability_index); + + if (confidence > confidenceThreshold) + { + float x = detectionMat.at(i, 0); + float y = detectionMat.at(i, 1); + float width = detectionMat.at(i, 2); + float height = detectionMat.at(i, 3); + int xLeftBottom = static_cast((x - width / 2) * frame.cols); + int yLeftBottom = static_cast((y - height / 2) * frame.rows); + int xRightTop = static_cast((x + width / 2) * frame.cols); + int yRightTop = static_cast((y + height / 2) * frame.rows); + + Rect object(xLeftBottom, yLeftBottom, + xRightTop - xLeftBottom, + yRightTop - yLeftBottom); + + rectangle(frame, object, Scalar(0, 255, 0)); + + if (objectClass < classNamesVec.size()) + { + ss.str(""); + ss << confidence; + String conf(ss.str()); + String label = String(classNamesVec[objectClass]) + ": " + conf; + int baseLine = 0; + Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); + rectangle(frame, Rect(Point(xLeftBottom, yLeftBottom - labelSize.height), + Size(labelSize.width, labelSize.height + baseLine)), + Scalar(255, 255, 255), CV_FILLED); + putText(frame, label, Point(xLeftBottom, yLeftBottom), + FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0,0,0)); + } + else + { + cout << "Class: " << objectClass << endl; + cout << "Confidence: " << confidence << endl; + cout << " " << xLeftBottom + << " " << yLeftBottom + << " " << xRightTop + << " " << yRightTop << endl; + } + } + } + + imshow("detections", frame); + if (waitKey(1) >= 0) break; + } return 0; -} // main \ No newline at end of file +} // main