opencv/samples/dnn/classification.cpp
Gursimar Singh 35eba9ca90
Merge pull request #25519 from gursimarsingh:improved_classification_sample
Improved classification sample #25519

#25006 #25314

This pull requests replaces the caffe model for classification with onnx versions. It also adds resnet in model.yml. 

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
2024-08-06 09:16:11 +03:00

264 lines
9.6 KiB
C++

#include <fstream>
#include <sstream>
#include <iostream>
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include "common.hpp"
using namespace cv;
using namespace std;
using namespace dnn;
const string about =
"Use this script to run a classification model on a camera stream, video, image or image list (i.e. .xml or .yaml containing image lists)\n\n"
"Firstly, download required models using `download_models.py` (if not already done). Set environment variable OPENCV_DOWNLOAD_CACHE_DIR to specify where models should be downloaded. Also, point OPENCV_SAMPLES_DATA_PATH to opencv/samples/data.\n"
"To run:\n"
"\t ./example_dnn_classification model_name --input=path/to/your/input/image/or/video (don't give --input flag if want to use device camera)\n"
"Sample command:\n"
"\t ./example_dnn_classification resnet --input=$OPENCV_SAMPLES_DATA_PATH/baboon.jpg\n"
"\t ./example_dnn_classification squeezenet\n"
"Model path can also be specified using --model argument. "
"Use imagelist_creator to create the xml or yaml list\n";
const string param_keys =
"{ help h | | Print help message. }"
"{ @alias | | An alias name of model to extract preprocessing parameters from models.yml file. }"
"{ zoo | ../dnn/models.yml | An optional path to file with preprocessing parameters }"
"{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
"{ imglist | | Pass this flag if image list (i.e. .xml or .yaml) file is passed}"
"{ crop | false | Preprocess input image by center cropping.}"
//"{ labels | | Path to the text file with labels for detected objects.}"
"{ model | | Path to the model file.}";
const string backend_keys = format(
"{ backend | default | Choose one of computation backends: "
"default: automatically (by default), "
"openvino: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
"opencv: OpenCV implementation, "
"vkcom: VKCOM, "
"cuda: CUDA, "
"webnn: WebNN }");
const string target_keys = format(
"{ target | cpu | Choose one of target computation devices: "
"cpu: CPU target (by default), "
"opencl: OpenCL, "
"opencl_fp16: OpenCL fp16 (half-float precision), "
"vpu: VPU, "
"vulkan: Vulkan, "
"cuda: CUDA, "
"cuda_fp16: CUDA fp16 (half-float preprocess) }");
string keys = param_keys + backend_keys + target_keys;
vector<string> classes;
static bool readStringList( const string& filename, vector<string>& l )
{
l.resize(0);
FileStorage fs(filename, FileStorage::READ);
if( !fs.isOpened() )
return false;
size_t dir_pos = filename.rfind('/');
if (dir_pos == string::npos)
dir_pos = filename.rfind('\\');
FileNode n = fs.getFirstTopLevelNode();
if( n.type() != FileNode::SEQ )
return false;
FileNodeIterator it = n.begin(), it_end = n.end();
for( ; it != it_end; ++it )
{
string fname = (string)*it;
if (dir_pos != string::npos)
{
string fpath = samples::findFile(filename.substr(0, dir_pos + 1) + fname, false);
if (fpath.empty())
{
fpath = samples::findFile(fname);
}
fname = fpath;
}
else
{
fname = samples::findFile(fname);
}
l.push_back(fname);
}
return true;
}
int main(int argc, char** argv)
{
CommandLineParser parser(argc, argv, keys);
if (!parser.has("@alias") || parser.has("help"))
{
cout << about << endl;
parser.printMessage();
return -1;
}
const string modelName = parser.get<String>("@alias");
const string zooFile = findFile(parser.get<String>("zoo"));
keys += genPreprocArguments(modelName, zooFile);
parser = CommandLineParser(argc, argv, keys);
parser.about(about);
if (argc == 1 || parser.has("help"))
{
parser.printMessage();
return 0;
}
String sha1 = parser.get<String>("sha1");
float scale = parser.get<float>("scale");
Scalar mean = parser.get<Scalar>("mean");
Scalar std = parser.get<Scalar>("std");
bool swapRB = parser.get<bool>("rgb");
bool crop = parser.get<bool>("crop");
int inpWidth = parser.get<int>("width");
int inpHeight = parser.get<int>("height");
String model = findModel(parser.get<String>("model"), sha1);
String backend = parser.get<String>("backend");
String target = parser.get<String>("target");
bool isImgList = parser.has("imglist");
// Open file with labels.
string labels_filename = parser.get<String>("labels");
string file = findFile(labels_filename);
ifstream ifs(file.c_str());
if (!ifs.is_open()){
cout<<"File " << file << " not found";
exit(1);
}
string line;
while (getline(ifs, line))
{
classes.push_back(line);
}
if (!parser.check())
{
parser.printErrors();
return 1;
}
CV_Assert(!model.empty());
//! [Read and initialize network]
Net net = readNetFromONNX(model);
net.setPreferableBackend(getBackendID(backend));
net.setPreferableTarget(getTargetID(target));
//! [Read and initialize network]
// Create a window
static const std::string kWinName = "Deep learning image classification in OpenCV";
namedWindow(kWinName, WINDOW_NORMAL);
//Create FontFace for putText
FontFace sans("sans");
//! [Open a video file or an image file or a camera stream]
VideoCapture cap;
vector<string> imageList;
size_t currentImageIndex = 0;
if (parser.has("input")) {
string input = findFile(parser.get<String>("input"));
if (isImgList) {
bool check = readStringList(samples::findFile(input), imageList);
if (imageList.empty() || !check) {
cout << "Error: No images found or the provided file is not a valid .yaml or .xml file." << endl;
return -1;
}
} else {
// Input is not a directory, try to open as video or image
cap.open(input);
if (!cap.isOpened()) {
cout << "Failed to open the input." << endl;
return -1;
}
}
} else {
cap.open(0); // Open default camera
}
//! [Open a video file or an image file or a camera stream]
Mat frame, blob;
for(;;)
{
if (!imageList.empty()) {
// Handling directory of images
if (currentImageIndex >= imageList.size()) {
waitKey();
break; // Exit if all images are processed
}
frame = imread(imageList[currentImageIndex++]);
if(frame.empty()){
cout<<"Cannot open file"<<endl;
continue;
}
} else {
// Handling video or single image
cap >> frame;
}
if (frame.empty())
{
break;
}
//! [Create a 4D blob from a frame]
blobFromImage(frame, blob, scale, Size(inpWidth, inpHeight), mean, swapRB, crop);
// Check std values.
if (std.val[0] != 0.0 && std.val[1] != 0.0 && std.val[2] != 0.0)
{
// Divide blob by std.
divide(blob, std, blob);
}
//! [Create a 4D blob from a frame]
//! [Set input blob]
net.setInput(blob);
//! [Set input blob]
TickMeter timeRecorder;
timeRecorder.reset();
Mat prob = net.forward();
double t1;
//! [Make forward pass]
timeRecorder.start();
prob = net.forward();
timeRecorder.stop();
//! [Make forward pass]
//! [Get a class with a highest score]
int N = (int)prob.total(), K = std::min(5, N);
std::vector<std::pair<float, int> > prob_vec;
for (int i = 0; i < N; i++) {
prob_vec.push_back(std::make_pair(-prob.at<float>(i), i));
}
std::sort(prob_vec.begin(), prob_vec.end());
//! [Get a class with a highest score]
t1 = timeRecorder.getTimeMilli();
timeRecorder.reset();
string label = format("Inference time: %.1f ms", t1);
Mat subframe = frame(Rect(0, 0, std::min(1000, frame.cols), std::min(300, frame.rows)));
subframe *= 0.3f;
putText(frame, label, Point(20, 50), Scalar(0, 255, 0), sans, 25, 800);
// Print predicted class.
for (int i = 0; i < K; i++) {
int classId = prob_vec[i].second;
float confidence = -prob_vec[i].first;
label = format("%d. %s: %.2f", i+1, (classes.empty() ? format("Class #%d", classId).c_str() :
classes[classId].c_str()), confidence);
putText(frame, label, Point(20, 110 + i*35), Scalar(0, 255, 0), sans, 25, 500);
}
imshow(kWinName, frame);
int key = waitKey(isImgList ? 1000 : 100);
if (key == ' ')
key = waitKey();
if (key == 'q' || key == 27) // Check if 'q' or 'ESC' is pressed
return 0;
}
waitKey();
return 0;
}