/* This sample detects the query person in the given video file. Authors of samples and Youtu ReID baseline: Xing Sun Feng Zheng Xinyang Jiang Fufu Yu Enwei Zhang Copyright (C) 2020-2021, Tencent. Copyright (C) 2020-2021, SUSTech. Copyright (C) 2024, Bigvision LLC. How to use: sample command to run: ./example_dnn_person_reid The system will ask you to mark the person to be tracked You can download ReID model using: `python download_models.py reid` and yolo model using: `python download_models.py yolov8` Set environment variable OPENCV_DOWNLOAD_CACHE_DIR to point to the directory where models are downloaded. Also, point OPENCV_SAMPLES_DATA_PATH to opencv/samples/data. */ #include #include #include #include #include #include "common.hpp" using namespace cv; using namespace cv::dnn; using namespace std; const string about = "Use this script for Person Re-identification using OpenCV. \n\n" "Firstly, download required models i.e. reid and yolov8 using `download_models.py` (if not already done). Set environment variable OPENCV_DOWNLOAD_CACHE_DIR to point to the directory where models are downloaded. Also, point OPENCV_SAMPLES_DATA_PATH to opencv/samples/data.\n" "To run:\n" "\t Example: ./example_dnn_person_reid reid\n\n" "Re-Identification model path can also be specified using --model argument. Detection model can be set using --yolo_model argument.\n\n"; const string param_keys = "{help h | | show help message}" "{ @alias | reid | An alias name of model to extract preprocessing parameters from models.yml file. }" "{ zoo | ../dnn/models.yml | An optional path to file with preprocessing parameters }" "{query q | | Path to target image. Skip this argument to select target in the video frame.}" "{input i | | video file path}" "{yolo_model | | Path to yolov8n.onnx}"; const string backend_keys = format( "{ backend | default | Choose one of computation backends: " "default: automatically (by default), " "openvino: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), " "opencv: OpenCV implementation, " "vkcom: VKCOM, " "cuda: CUDA, " "webnn: WebNN }"); const string target_keys = format( "{ target | cpu | Choose one of target computation devices: " "cpu: CPU target (by default), " "opencl: OpenCL, " "opencl_fp16: OpenCL fp16 (half-float precision), " "vpu: VPU, " "vulkan: Vulkan, " "cuda: CUDA, " "cuda_fp16: CUDA fp16 (half-float preprocess) }"); string keys = param_keys + backend_keys + target_keys; struct MatComparator { bool operator()(const Mat &a, const Mat &b) const { return a.data < b.data; // This is a simple pointer comparison, not content! } }; map imgDict; int height, width, yoloHeight, yoloWidth; float scale, yoloScale; bool swapRB, yoloSwapRB; Scalar mean_v, stnd; static void extractFeatures(vector &imglist, Net &net, vector &features) { for (size_t st = 0; st < imglist.size(); st++) { Mat blob; blobFromImage(imglist[st], blob, scale, Size(width, height), mean_v, swapRB, false, CV_32F); // Check if standard deviation values are non-zero if (stnd[0] != 0.0 && stnd[1] != 0.0 && stnd[2] != 0.0) { // Divide blob by std for each channel divide(blob, stnd, blob); } net.setInput(blob); Mat out=net.forward(); vector s {out.size[0], out.size[1]}; out = out.reshape(1, s); for (int i = 0; i < out.rows; i++) { Mat norm_features; normalize(out.row(i), norm_features, 1.0, 0.0, NORM_L2); features.push_back(norm_features); } } return; } static int findMatching(const Mat &queryFeatures, const vector &galleryFeatures) { if (queryFeatures.empty() || galleryFeatures.empty()) return -1; // No valid index if either feature list is empty int bestIndex = -1; float maxSimilarity = FLT_MIN; for (int j = 0; j < (int)galleryFeatures.size(); j++) { float currentSimilarity = static_cast(queryFeatures.dot(galleryFeatures[j])); if (currentSimilarity > maxSimilarity) { maxSimilarity = currentSimilarity; bestIndex = j; } } return bestIndex; } static void yoloDetector(Mat &frame, Net &net, vector& images) { int ht = frame.rows; int wt = frame.cols; int length = max(ht, wt); Mat image = Mat::zeros(Size(length, length), frame.type()); frame.copyTo(image(Rect(0, 0, wt, ht))); // Calculate the scale double norm_scale = static_cast(length) / yoloWidth; Mat blob; blobFromImage(image, blob, yoloScale, Size(yoloWidth, yoloHeight), Scalar(), yoloSwapRB, false, CV_32F); net.setInput(blob); vector outputs; net.forward(outputs); Mat reshapedMatrix = outputs[0].reshape(0, 84); // Reshape to 2D (84 rows, 8400 columns) Mat outputTransposed; transpose(reshapedMatrix, outputTransposed); int rows = outputTransposed.rows; vector boxes; vector scores; vector class_ids; for (int i = 0; i < rows; i++) { double minScore, maxScore; Point minClassLoc, maxClassLoc; minMaxLoc(outputTransposed.row(i).colRange(4, outputTransposed.cols), &minScore, &maxScore, &minClassLoc, &maxClassLoc); if (maxScore >= 0.25 && maxClassLoc.x == 0) { double centerX = outputTransposed.at(i, 0); double centerY = outputTransposed.at(i, 1); double w = outputTransposed.at(i, 2); double h = outputTransposed.at(i, 3); Rect2d box( centerX - 0.5 * w, // x centerY - 0.5 * h, // y w, // width h // height ); boxes.push_back(box); scores.push_back(static_cast(maxScore)); class_ids.push_back(maxClassLoc.x); // x location gives the index } } // Apply Non-Maximum Suppression vector indexes; NMSBoxes(boxes, scores, 0.25f, 0.45f, indexes, 0.5f, 0); images.resize(indexes.size()); for (size_t i = 0; i < indexes.size(); i++) { int index = indexes[i]; int x = static_cast(round(boxes[index].x * norm_scale)); int y = static_cast(round(boxes[index].y * norm_scale)); int w = static_cast(round(boxes[index].width * norm_scale)); int h = static_cast(round(boxes[index].height * norm_scale)); // Make sure the box is within the frame x = max(0, x); y = max(0, y); w = min(w, frame.cols - x); h = min(h, frame.rows - y); // Crop the image Rect roi(x, y, w, h); // Define a region of interest images[i] = frame(roi); // Crop the region from the frame imgDict[images[i]] = roi; } return; } int main(int argc, char **argv) { CommandLineParser parser(argc, argv, keys); if (!parser.has("@alias") || parser.has("help")) { cout<("@alias"); string zooFile = findFile(parser.get("zoo")); keys += genPreprocArguments(modelName, zooFile); keys += genPreprocArguments(modelName, zooFile, "yolo_"); parser = CommandLineParser(argc, argv, keys); parser.about("Use this script to run ReID networks using OpenCV."); const string sha1 = parser.get("sha1"); const string yoloSha1 = parser.get("yolo_sha1"); const string modelPath = findModel(parser.get("model"), sha1); const string queryImagePath = parser.get("query"); string videoPath = parser.get("input"); const string yoloPath = findModel(parser.get("yolo_model"), yoloSha1); const string backend = parser.get("backend"); const string target = parser.get("target"); height = parser.get("height"); width = parser.get("width"); yoloHeight = parser.get("yolo_height"); yoloWidth = parser.get("yolo_width"); scale = parser.get("scale"); yoloScale = parser.get("yolo_scale"); swapRB = parser.get("rgb"); yoloSwapRB = parser.get("yolo_rgb"); mean_v = parser.get("mean"); stnd = parser.get("std"); int stdSize = 20; int stdWeight = 400; int stdImgSize = 512; int imgWidth = -1; // Initialization int fontSize = 50; int fontWeight = 500; EngineType engine = ENGINE_AUTO; if (backend != "default" || target != "cpu"){ engine = ENGINE_CLASSIC; } Net reidNet = readNetFromONNX(modelPath, engine); reidNet.setPreferableBackend(getBackendID(backend)); reidNet.setPreferableTarget(getTargetID(target)); if(yoloPath.empty()){ cout<<"[ERROR] Please pass path to yolov8.onnx model file using --yolo_model."< queryImages; Mat queryImg; if (!queryImagePath.empty()) { queryImg = imread(queryImagePath); if (queryImg.empty()) { cerr << "Error: Query image could not be loaded." << endl; return -1; } queryImages.push_back(queryImg); } else { Mat image; for(;;) { cap.read(image); if (image.empty()) { cerr << "Error reading the video" << endl; return -1; } if (imgWidth == -1){ imgWidth = min(image.rows, image.cols); fontSize = min(fontSize, (stdSize*imgWidth)/stdImgSize); fontWeight = min(fontWeight, (stdWeight*imgWidth)/stdImgSize); } const string label = "Press space bar to pause video to draw bounding box."; Rect r = getTextSize(Size(), label, Point(), fontFace, fontSize, fontWeight); r.height += 2 * fontSize; // padding r.width += 10; // padding rectangle(image, r, Scalar::all(255), FILLED); putText(image, label, Point(10, fontSize), Scalar(0,0,0), fontFace, fontSize, fontWeight); putText(image, "Press space bar after selecting.", Point(10, 2*fontSize), Scalar(0,0,0), fontFace, fontSize, fontWeight); imshow("TRACKING", image); int key = waitKey(200); if(key == ' '){ Rect rect = selectROI("TRACKING", image); if (rect.width > 0 && rect.height > 0) { queryImg = image(rect).clone(); queryImages.push_back(queryImg); break; } } if (key == 'q' || key == 27) { return 0; } } } Mat frame; vector queryFeatures; extractFeatures(queryImages, reidNet, queryFeatures); vector detectedImages; vector galleryFeatures; for(;;) { if (!cap.read(frame) || frame.empty()) { break; } if (imgWidth == -1){ imgWidth = min(frame.rows, frame.cols); fontSize = min(fontSize, (stdSize*imgWidth)/stdImgSize); fontWeight = min(fontWeight, (stdWeight*imgWidth)/stdImgSize); } detectedImages.clear(); galleryFeatures.clear(); yoloDetector(frame, net, detectedImages); extractFeatures(detectedImages, reidNet, galleryFeatures); int match_idx = findMatching(queryFeatures[0], galleryFeatures); if (match_idx != -1 && static_cast(detectedImages.size()) > match_idx) { Mat matchImg = detectedImages[match_idx]; Rect bbox = imgDict[matchImg]; rectangle(frame, bbox, Scalar(0, 0, 255), 2); putText(frame, "Target", Point(bbox.x, bbox.y - 10), Scalar(0,0,255), fontFace, fontSize, fontWeight); } const string label = "Tracking"; Rect r = getTextSize(Size(), label, Point(), fontFace, fontSize, fontWeight); r.height += fontSize; // padding r.width += 10; // padding rectangle(frame, r, Scalar::all(255), FILLED); putText(frame, label, Point(10, fontSize), Scalar(0,0,0), fontFace, fontSize, fontWeight); imshow("TRACKING", frame); int key = waitKey(30); if (key == 'q' || key == 27) { break; } } cap.release(); destroyAllWindows(); return 0; }