Merge pull request #26202 from gursimarsingh:improved_tracker_samples

Improved Tracker Samples #26202

Relates to #25006

This sample has been rewritten to track a selected target in a video or camera stream. It combines the ViT, Nano, and DaSiamRPN trackers into a single tracking sample.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
Gursimar Singh 2024-12-05 14:20:03 +05:30 committed by GitHub
parent 0c774c94f9
commit 816851c999
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 591 additions and 768 deletions

View File

@ -769,6 +769,11 @@ public:
*/
CV_WRAP virtual
bool update(InputArray image, CV_OUT Rect& boundingBox) = 0;
/** @brief Return tracking score
*/
CV_WRAP virtual float getTrackingScore() { return -1; }
};
@ -834,10 +839,6 @@ public:
static CV_WRAP
Ptr<TrackerDaSiamRPN> create(const TrackerDaSiamRPN::Params& parameters = TrackerDaSiamRPN::Params());
/** @brief Return tracking score
*/
CV_WRAP virtual float getTrackingScore() = 0;
//void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE;
//bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE;
};
@ -872,10 +873,6 @@ public:
static CV_WRAP
Ptr<TrackerNano> create(const TrackerNano::Params& parameters = TrackerNano::Params());
/** @brief Return tracking score
*/
CV_WRAP virtual float getTrackingScore() = 0;
//void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE;
//bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE;
};
@ -910,10 +907,6 @@ public:
static CV_WRAP
Ptr<TrackerVit> create(const TrackerVit::Params& parameters = TrackerVit::Params());
/** @brief Return tracking score
*/
CV_WRAP virtual float getTrackingScore() = 0;
// void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE;
// bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE;
};

View File

@ -89,8 +89,12 @@ public:
TrackerNanoImpl(const TrackerNano::Params& parameters)
: params(parameters)
{
backbone = dnn::readNet(params.backbone);
neckhead = dnn::readNet(params.neckhead);
dnn::EngineType engine = dnn::ENGINE_AUTO;
if (params.backend != 0 || params.target != 0){
engine = dnn::ENGINE_CLASSIC;
}
backbone = dnn::readNet(params.backbone, "", "", engine);
neckhead = dnn::readNet(params.neckhead, "", "", engine);
CV_Assert(!backbone.empty());
CV_Assert(!neckhead.empty());

View File

@ -44,7 +44,11 @@ public:
TrackerVitImpl(const TrackerVit::Params& parameters)
: params(parameters)
{
net = dnn::readNet(params.net);
dnn::EngineType engine = dnn::ENGINE_AUTO;
if (params.backend != 0 || params.target != 0){
engine = dnn::ENGINE_CLASSIC;
}
net = dnn::readNet(params.net, "", "", engine);
CV_Assert(!net.empty());
net.setPreferableBackend(params.backend);

View File

@ -1,190 +0,0 @@
// DaSiamRPN tracker.
// Original paper: https://arxiv.org/abs/1808.06048
// Link to original repo: https://github.com/foolwood/DaSiamRPN
// Links to onnx models:
// - network: https://www.dropbox.com/s/rr1lk9355vzolqv/dasiamrpn_model.onnx?dl=0
// - kernel_r1: https://www.dropbox.com/s/999cqx5zrfi7w4p/dasiamrpn_kernel_r1.onnx?dl=0
// - kernel_cls1: https://www.dropbox.com/s/qvmtszx5h339a0w/dasiamrpn_kernel_cls1.onnx?dl=0
#include <iostream>
#include <cmath>
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/video.hpp>
using namespace cv;
using namespace cv::dnn;
// Command-line option tables for the DaSiamRPN sample.
// FIX: the help texts for kernel_cls1 and kernel_r1 were swapped.
std::string param_keys =
    "{ help h | | Print help message }"
    "{ input i | | Full path to input video folder, the specific camera index. (empty for camera 0) }"
    "{ net | dasiamrpn_model.onnx | Path to onnx model of net}"
    "{ kernel_cls1 | dasiamrpn_kernel_cls1.onnx | Path to onnx model of kernel_cls1 }"
    "{ kernel_r1 | dasiamrpn_kernel_r1.onnx | Path to onnx model of kernel_r1 }";
// DNN backend choices; %d placeholders are filled with the matching enum values.
std::string backend_keys = cv::format(
    "{ backend | 0 | Choose one of computation backends: "
    "%d: automatically (by default), "
    "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
    "%d: OpenCV implementation, "
    "%d: VKCOM, "
    "%d: CUDA }", cv::dnn::DNN_BACKEND_DEFAULT, cv::dnn::DNN_BACKEND_INFERENCE_ENGINE, cv::dnn::DNN_BACKEND_OPENCV, cv::dnn::DNN_BACKEND_VKCOM, cv::dnn::DNN_BACKEND_CUDA);
// DNN target-device choices.
std::string target_keys = cv::format(
    "{ target | 0 | Choose one of target computation devices: "
    "%d: CPU target (by default), "
    "%d: OpenCL, "
    "%d: OpenCL fp16 (half-float precision), "
    "%d: VPU, "
    "%d: Vulkan, "
    "%d: CUDA, "
    "%d: CUDA fp16 (half-float preprocess) }", cv::dnn::DNN_TARGET_CPU, cv::dnn::DNN_TARGET_OPENCL, cv::dnn::DNN_TARGET_OPENCL_FP16, cv::dnn::DNN_TARGET_MYRIAD, cv::dnn::DNN_TARGET_VULKAN, cv::dnn::DNN_TARGET_CUDA, cv::dnn::DNN_TARGET_CUDA_FP16);
// Full option table passed to CommandLineParser.
std::string keys = param_keys + backend_keys + target_keys;
// Runs the DaSiamRPN demo: parses options, builds the tracker from the three
// ONNX models, opens the camera/video input, lets the user select a ROI, then
// tracks it frame by frame while printing score/time per frame.
// Returns 0 on success, 2 on model-load or capture errors.
static
int run(int argc, char** argv)
{
// Parse command line arguments.
CommandLineParser parser(argc, argv, keys);
if (parser.has("help"))
{
parser.printMessage();
return 0;
}
std::string inputName = parser.get<String>("input");
std::string net = parser.get<String>("net");
std::string kernel_cls1 = parser.get<String>("kernel_cls1");
std::string kernel_r1 = parser.get<String>("kernel_r1");
int backend = parser.get<int>("backend");
int target = parser.get<int>("target");
Ptr<TrackerDaSiamRPN> tracker;
try
{
TrackerDaSiamRPN::Params params;
// samples::findFile throws cv::Exception when a model file cannot be located.
params.model = samples::findFile(net);
params.kernel_cls1 = samples::findFile(kernel_cls1);
params.kernel_r1 = samples::findFile(kernel_r1);
params.backend = backend;
params.target = target;
tracker = TrackerDaSiamRPN::create(params);
}
catch (const cv::Exception& ee)
{
std::cerr << "Exception: " << ee.what() << std::endl;
std::cout << "Can't load the network by using the following files:" << std::endl;
std::cout << "siamRPN : " << net << std::endl;
std::cout << "siamKernelCL1 : " << kernel_cls1 << std::endl;
std::cout << "siamKernelR1 : " << kernel_r1 << std::endl;
return 2;
}
const std::string winName = "DaSiamRPN";
namedWindow(winName, WINDOW_AUTOSIZE);
// Open a video file or an image file or a camera stream.
VideoCapture cap;
// A single-digit input is treated as a camera index; empty input means camera 0.
if (inputName.empty() || (isdigit(inputName[0]) && inputName.size() == 1))
{
int c = inputName.empty() ? 0 : inputName[0] - '0';
std::cout << "Trying to open camera #" << c << " ..." << std::endl;
if (!cap.open(c))
{
std::cout << "Capture from camera #" << c << " didn't work. Specify -i=<video> parameter to read from video file" << std::endl;
return 2;
}
}
else if (inputName.size())
{
inputName = samples::findFileOrKeep(inputName);
if (!cap.open(inputName))
{
std::cout << "Could not open: " << inputName << std::endl;
return 2;
}
}
// Read the first image.
Mat image;
cap >> image;
if (image.empty())
{
std::cerr << "Can't capture frame!" << std::endl;
return 2;
}
// Draw the selection instructions on a copy so the original frame stays
// untouched for tracker initialization.
Mat image_select = image.clone();
putText(image_select, "Select initial bounding box you want to track.", Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
putText(image_select, "And Press the ENTER key.", Point(0, 35), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
Rect selectRect = selectROI(winName, image_select);
std::cout << "ROI=" << selectRect << std::endl;
tracker->init(image, selectRect);
TickMeter tickMeter;
// Main tracking loop: one tracker update per captured frame; ESC exits.
for (int count = 0; ; ++count)
{
cap >> image;
if (image.empty())
{
std::cerr << "Can't capture frame " << count << ". End of video stream?" << std::endl;
break;
}
Rect rect;
tickMeter.start();
bool ok = tracker->update(image, rect);
tickMeter.stop();
float score = tracker->getTrackingScore();
std::cout << "frame " << count <<
": predicted score=" << score <<
" rect=" << rect <<
" time=" << tickMeter.getTimeMilli() << "ms" <<
std::endl;
Mat render_image = image.clone();
// Only draw the box/labels when the tracker reports a successful update.
if (ok)
{
rectangle(render_image, rect, Scalar(0, 255, 0), 2);
std::string timeLabel = format("Inference time: %.2f ms", tickMeter.getTimeMilli());
std::string scoreLabel = format("Score: %f", score);
putText(render_image, timeLabel, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
putText(render_image, scoreLabel, Point(0, 35), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
}
imshow(winName, render_image);
tickMeter.reset();
int c = waitKey(1);
if (c == 27 /*ESC*/)
break;
}
std::cout << "Exit" << std::endl;
return 0;
}
// Entry point: delegates to run() and converts any escaping std::exception
// into a diagnostic message plus a non-zero exit status.
int main(int argc, char **argv)
{
    try {
        const int status = run(argc, argv);
        return status;
    } catch (const std::exception& e) {
        std::cerr << "FATAL: C++ exception: " << e.what() << std::endl;
        return 1;
    }
}

View File

@ -326,32 +326,37 @@ def parseMetalinkFile(metalink_filepath, save_dir):
models.append(produceDownloadInstance(name, fname, hash_sum, url, save_dir))
return models
def parseYAMLFile(yaml_filepath, save_dir):
def parseYAMLFile(yaml_filepath, save_dir, model_name):
models = []
with open(yaml_filepath, 'r') as stream:
data_loaded = yaml.safe_load(stream)
for name, params in data_loaded.items():
load_info = params.get("load_info", None)
if load_info:
fname = os.path.basename(params.get("model"))
hash_sum = load_info.get("sha1")
url = load_info.get("url")
download_sha = load_info.get("download_sha")
download_name = load_info.get("download_name")
archive_member = load_info.get("member")
models.append(produceDownloadInstance(name, fname, hash_sum, url, save_dir,
download_name=download_name, download_sha=download_sha, archive_member=archive_member))
config_load_info = params.get("config_load_info", None)
if config_load_info:
fname = os.path.basename(params.get("config"))
hash_sum = config_load_info.get("sha1")
url = config_load_info.get("url")
download_sha = config_load_info.get("download_sha")
download_name = config_load_info.get("download_name")
archive_member = config_load_info.get("member")
models.append(produceDownloadInstance(name, fname, hash_sum, url, save_dir,
download_name=download_name, download_sha=download_sha, archive_member=archive_member))
if model_name != "" and name != model_name:
continue
for key in params.keys():
if key.endswith("load_info"):
prefix = key[:-len('load_info')]
load_info = params.get(prefix+"load_info", None)
if load_info:
print(prefix)
if prefix == "config_":
fname = os.path.basename(params.get("config"))
hash_sum = load_info.get("sha1")
url = load_info.get("url")
download_sha = load_info.get("download_sha")
download_name = load_info.get("download_name")
archive_member = load_info.get("member")
models.append(produceDownloadInstance(name, fname, hash_sum, url, save_dir,
download_name=download_name, download_sha=download_sha, archive_member=archive_member))
else:
fname = os.path.basename(params.get(prefix+"model"))
hash_sum = load_info.get(prefix+"sha1")
url = load_info.get(prefix+"url")
download_sha = load_info.get(prefix+"download_sha")
download_name = load_info.get(prefix+"download_name")
archive_member = load_info.get(prefix+"member")
models.append(produceDownloadInstance(name, fname, hash_sum, url, save_dir,
download_name=download_name, download_sha=download_sha, archive_member=archive_member))
return models
@ -367,7 +372,7 @@ if __name__ == '__main__':
save_dir = args.save_dir
selected_model_name = args.model_name
models.extend(parseMetalinkFile('face_detector/weights.meta4', save_dir))
models.extend(parseYAMLFile('models.yml', save_dir))
models.extend(parseYAMLFile('models.yml', save_dir, selected_model_name))
for m in models:
print(m)
if selected_model_name and not m.name.startswith(selected_model_name):

View File

@ -390,6 +390,7 @@ reid:
sha1: "d4316b100db40f8840aa82626e1cf3f519a7f1ae"
model: "person_reid_youtu_2021nov.onnx"
yolo_load_info:
yolo_url: "https://github.com/CVHub520/X-AnyLabeling/releases/download/v0.1.0/yolov8n.onnx"
yolo_sha1: "68f864475d06e2ec4037181052739f268eeac38d"
yolo_model: "yolov8n.onnx"
mean: [0.485, 0.456, 0.406]
@ -403,3 +404,40 @@ reid:
rgb: false
yolo_rgb: true
sample: "person_reid"
################################################################################
# Tracker models.
################################################################################
vit:
load_info:
url: "https://github.com/opencv/opencv_zoo/raw/fef72f8fa7c52eaf116d3df358d24e6e959ada0e/models/object_tracking_vittrack/object_tracking_vittrack_2023sep.onnx"
sha1: "50008bb4f6a27b1aa940ad886b1bd1936ac4ed3e"
model: "object_tracking_vittrack_2023sep.onnx"
sample: "object_tracker"
nanotrack:
nanotrack_head_load_info:
nanotrack_head_url: "https://github.com/HonglinChu/SiamTrackers/raw/refs/heads/master/NanoTrack/models/nanotrackv2/nanotrack_head_sim.onnx"
nanotrack_head_sha1: "39f168489671700cf739e402dfc67d41ce648aef"
nanotrack_head_model: "nanotrack_head_sim.onnx"
nanotrack_back_load_info:
nanotrack_back_url: "https://github.com/HonglinChu/SiamTrackers/raw/refs/heads/master/NanoTrack/models/nanotrackv2/nanotrack_backbone_sim.onnx"
nanotrack_back_sha1: "6e773a364457b78574f9f63a23b0659ee8646f8f"
nanotrack_back_model: "nanotrack_backbone_sim.onnx"
sample: "object_tracker"
dasiamrpn:
dasiamrpn_load_info:
dasiamrpn_url: "https://github.com/opencv/opencv_zoo/raw/fef72f8fa7c52eaf116d3df358d24e6e959ada0e/models/object_tracking_dasiamrpn/object_tracking_dasiamrpn_model_2021nov.onnx?download="
dasiamrpn_sha1: "91b774fce7df4c0e4918469f0f482d9a27d0e2d4"
dasiamrpn_model: "object_tracking_dasiamrpn_model_2021nov.onnx"
dasiamrpn_kernel_r1_load_info:
dasiamrpn_kernel_r1_url: "https://github.com/opencv/opencv_zoo/raw/fef72f8fa7c52eaf116d3df358d24e6e959ada0e/models/object_tracking_dasiamrpn/object_tracking_dasiamrpn_kernel_r1_2021nov.onnx?download="
dasiamrpn_kernel_r1_sha1: "bb64620a54348657133eb28be2d3a2a8c76b84b3"
dasiamrpn_kernel_r1_model: "object_tracking_dasiamrpn_kernel_r1_2021nov.onnx"
dasiamrpn_kernel_cls_load_info:
dasiamrpn_kernel_cls_url: "https://github.com/opencv/opencv_zoo/raw/fef72f8fa7c52eaf116d3df358d24e6e959ada0e/models/object_tracking_dasiamrpn/object_tracking_dasiamrpn_kernel_cls1_2021nov.onnx?download="
dasiamrpn_kernel_cls_sha1: "e9ccd270ce8059bdf7ed0d1845c03ef4a951ee0f"
dasiamrpn_kernel_cls_model: "object_tracking_dasiamrpn_kernel_cls1_2021nov.onnx"
sample: "object_tracker"

View File

@ -1,184 +0,0 @@
// NanoTrack
// Link to original inference code: https://github.com/HonglinChu/NanoTrack
// Link to original training repo: https://github.com/HonglinChu/SiamTrackers/tree/master/NanoTrack
// backBone model: https://github.com/HonglinChu/SiamTrackers/blob/master/NanoTrack/models/onnx/nanotrack_backbone_sim.onnx
// headNeck model: https://github.com/HonglinChu/SiamTrackers/blob/master/NanoTrack/models/onnx/nanotrack_head_sim.onnx
#include <iostream>
#include <cmath>
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/video.hpp>
using namespace cv;
using namespace cv::dnn;
// Basic sample options: input source and the two NanoTrack ONNX model paths.
std::string param_keys =
"{ help h | | Print help message }"
"{ input i | | Full path to input video folder, the specific camera index. (empty for camera 0) }"
"{ backbone | backbone.onnx | Path to onnx model of backbone.onnx}"
"{ headneck | headneck.onnx | Path to onnx model of headneck.onnx }";
// DNN backend choices; %d placeholders are filled with the matching enum values.
std::string backend_keys = cv::format(
"{ backend | 0 | Choose one of computation backends: "
"%d: automatically (by default), "
"%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
"%d: OpenCV implementation, "
"%d: VKCOM, "
"%d: CUDA }", cv::dnn::DNN_BACKEND_DEFAULT, cv::dnn::DNN_BACKEND_INFERENCE_ENGINE, cv::dnn::DNN_BACKEND_OPENCV, cv::dnn::DNN_BACKEND_VKCOM, cv::dnn::DNN_BACKEND_CUDA);
// DNN target-device choices.
std::string target_keys = cv::format(
"{ target | 0 | Choose one of target computation devices: "
"%d: CPU target (by default), "
"%d: OpenCL, "
"%d: OpenCL fp16 (half-float precision), "
"%d: VPU, "
"%d: Vulkan, "
"%d: CUDA, "
"%d: CUDA fp16 (half-float preprocess) }", cv::dnn::DNN_TARGET_CPU, cv::dnn::DNN_TARGET_OPENCL, cv::dnn::DNN_TARGET_OPENCL_FP16, cv::dnn::DNN_TARGET_MYRIAD, cv::dnn::DNN_TARGET_VULKAN, cv::dnn::DNN_TARGET_CUDA, cv::dnn::DNN_TARGET_CUDA_FP16);
// Full option table passed to CommandLineParser.
std::string keys = param_keys + backend_keys + target_keys;
// Runs the NanoTrack demo: parses options, builds the tracker from the
// backbone/headneck ONNX models, opens the camera/video input, lets the user
// select a ROI, then tracks it frame by frame while printing score/time.
// Returns 0 on success, 2 on model-load or capture errors.
static
int run(int argc, char** argv)
{
// Parse command line arguments.
CommandLineParser parser(argc, argv, keys);
if (parser.has("help"))
{
parser.printMessage();
return 0;
}
std::string inputName = parser.get<String>("input");
std::string backbone = parser.get<String>("backbone");
std::string headneck = parser.get<String>("headneck");
int backend = parser.get<int>("backend");
int target = parser.get<int>("target");
Ptr<TrackerNano> tracker;
try
{
TrackerNano::Params params;
// samples::findFile throws cv::Exception when a model file cannot be located.
params.backbone = samples::findFile(backbone);
params.neckhead = samples::findFile(headneck);
params.backend = backend;
params.target = target;
tracker = TrackerNano::create(params);
}
catch (const cv::Exception& ee)
{
std::cerr << "Exception: " << ee.what() << std::endl;
std::cout << "Can't load the network by using the following files:" << std::endl;
std::cout << "backbone : " << backbone << std::endl;
std::cout << "headneck : " << headneck << std::endl;
return 2;
}
const std::string winName = "NanoTrack";
namedWindow(winName, WINDOW_AUTOSIZE);
// Open a video file or an image file or a camera stream.
VideoCapture cap;
// A single-digit input is treated as a camera index; empty input means camera 0.
if (inputName.empty() || (isdigit(inputName[0]) && inputName.size() == 1))
{
int c = inputName.empty() ? 0 : inputName[0] - '0';
std::cout << "Trying to open camera #" << c << " ..." << std::endl;
if (!cap.open(c))
{
std::cout << "Capture from camera #" << c << " didn't work. Specify -i=<video> parameter to read from video file" << std::endl;
return 2;
}
}
else if (inputName.size())
{
inputName = samples::findFileOrKeep(inputName);
if (!cap.open(inputName))
{
std::cout << "Could not open: " << inputName << std::endl;
return 2;
}
}
// Read the first image.
Mat image;
cap >> image;
if (image.empty())
{
std::cerr << "Can't capture frame!" << std::endl;
return 2;
}
// Draw the selection instructions on a copy so the original frame stays
// untouched for tracker initialization.
Mat image_select = image.clone();
putText(image_select, "Select initial bounding box you want to track.", Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
putText(image_select, "And Press the ENTER key.", Point(0, 35), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
Rect selectRect = selectROI(winName, image_select);
std::cout << "ROI=" << selectRect << std::endl;
tracker->init(image, selectRect);
TickMeter tickMeter;
// Main tracking loop: one tracker update per captured frame; ESC exits.
for (int count = 0; ; ++count)
{
cap >> image;
if (image.empty())
{
std::cerr << "Can't capture frame " << count << ". End of video stream?" << std::endl;
break;
}
Rect rect;
tickMeter.start();
bool ok = tracker->update(image, rect);
tickMeter.stop();
float score = tracker->getTrackingScore();
std::cout << "frame " << count <<
": predicted score=" << score <<
" rect=" << rect <<
" time=" << tickMeter.getTimeMilli() << "ms" <<
std::endl;
Mat render_image = image.clone();
// Only draw the box/labels when the tracker reports a successful update.
if (ok)
{
rectangle(render_image, rect, Scalar(0, 255, 0), 2);
std::string timeLabel = format("Inference time: %.2f ms", tickMeter.getTimeMilli());
std::string scoreLabel = format("Score: %f", score);
putText(render_image, timeLabel, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
putText(render_image, scoreLabel, Point(0, 35), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
}
imshow(winName, render_image);
tickMeter.reset();
int c = waitKey(1);
if (c == 27 /*ESC*/)
break;
}
std::cout << "Exit" << std::endl;
return 0;
}
// Program entry point. All real work happens in run(); any std::exception
// that escapes is reported and mapped to exit code 1.
int main(int argc, char **argv)
{
    int rc = 1;
    try
    {
        rc = run(argc, argv);
    }
    catch (const std::exception& e)
    {
        std::cerr << "FATAL: C++ exception: " << e.what() << std::endl;
    }
    return rc;
}

View File

@ -0,0 +1,309 @@
#include <iostream>
#include <cmath>
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/video.hpp>
#include "common.hpp"
using namespace cv;
using namespace std;
using namespace cv::dnn;
// Usage text shown by CommandLineParser::about().
const string about = "Use this script for testing Object Tracking using OpenCV. \n\n"
"Firstly, download required models using the download_models.py <alias>.\n"
"Valid alias names are nanotrack, vit and dasiamrpn.\n\n"
"To run:\n"
"\t nanotrack: \n"
"\t\t e.g: ./example_dnn_object_tracker nanotrack\n\n"
"\t vit: \n"
"\t\t e.g: ./example_dnn_object_tracker vit\n\n"
"\t dasiamrpn: \n"
"\t\t e.g: ./example_dnn_object_tracker dasiamrpn\n\n"
"To switch between models in runtime, make sure all the models are downloaded using download_models.py\n";
// Sample options: model alias, zoo file, input source and tracking threshold.
const string param_keys =
"{ help h | | Print help message }"
"{ @alias | vit | An alias name of model to extract preprocessing parameters from models.yml file. }"
"{ zoo | ../dnn/models.yml | An optional path to file with preprocessing parameters }"
"{ input i | | Full path to input video folder, the specific camera index. (empty for camera 0) }"
"{ tracking_thrs | 0.3 | Tracking score threshold. If a bbox of score >= 0.3, it is considered as found }";
// String-valued backend selector (resolved to a backend ID by getBackendID()).
const string backend_keys = format(
"{ backend | default | Choose one of computation backends: "
"default: automatically (by default), "
"openvino: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
"opencv: OpenCV implementation, "
"vkcom: VKCOM, "
"cuda: CUDA, "
"webnn: WebNN }");
// String-valued target selector (resolved to a target ID by getTargetID()).
const string target_keys = format(
"{ target | cpu | Choose one of target computation devices: "
"cpu: CPU target (by default), "
"opencl: OpenCL, "
"opencl_fp16: OpenCL fp16 (half-float precision), "
"vpu: VPU, "
"vulkan: Vulkan, "
"cuda: CUDA, "
"cuda_fp16: CUDA fp16 (half-float preprocess) }");
// Mutable on purpose: loadParser() appends model-specific options at runtime.
string keys = param_keys + backend_keys + target_keys;
static void loadParser(const string &modelName, const string &zooFile)
{
// Load appropriate preprocessing arguments based on model name
if (modelName == "vit")
{
keys += genPreprocArguments(modelName, zooFile, "");
}
else if (modelName == "nanotrack")
{
keys += genPreprocArguments(modelName, zooFile, "nanotrack_head_");
keys += genPreprocArguments(modelName, zooFile, "nanotrack_back_");
}
else if (modelName == "dasiamrpn")
{
keys += genPreprocArguments(modelName, zooFile, "dasiamrpn_");
keys += genPreprocArguments(modelName, zooFile, "dasiamrpn_kernel_r1_");
keys += genPreprocArguments(modelName, zooFile, "dasiamrpn_kernel_cls_");
}
return;
}
static void createTracker(const string &modelName, CommandLineParser &parser, Ptr<Tracker> &tracker) {
int backend = getBackendID(parser.get<String>("backend"));
int target = getTargetID(parser.get<String>("target"));
if (modelName == "dasiamrpn") {
const string net = parser.get<String>("dasiamrpn_model");
const string sha1 = parser.get<String>("dasiamrpn_sha1");
const string kernel_cls1 = parser.get<String>("dasiamrpn_kernel_cls_model");
const string kernel_cls_sha1 = parser.get<String>("dasiamrpn_kernel_cls_sha1");
const string kernel_r1 = parser.get<String>("dasiamrpn_kernel_r1_model");
const string kernel_sha1 = parser.get<String>("dasiamrpn_kernel_r1_sha1");
TrackerDaSiamRPN::Params params;
params.model = findModel(net, sha1);
params.kernel_cls1 = findModel(kernel_cls1, kernel_cls_sha1);
params.kernel_r1 = findModel(kernel_r1, kernel_sha1);
params.backend = backend;
params.target = target;
tracker = TrackerDaSiamRPN::create(params);
} else if (modelName == "nanotrack") {
const string backbone = parser.get<String>("nanotrack_back_model");
const string backSha1 = parser.get<String>("nanotrack_back_sha1");
const string headneck = parser.get<String>("nanotrack_head_model");
const string headSha1 = parser.get<String>("nanotrack_head_sha1");
TrackerNano::Params params;
params.backbone = findModel(backbone, backSha1);
params.neckhead = findModel(headneck, headSha1);
params.backend = backend;
params.target = target;
tracker = TrackerNano::create(params);
} else if (modelName == "vit") {
const string net = parser.get<String>("model");
const string sha1 = parser.get<String>("sha1");
float tracking_score_threshold = parser.get<float>("tracking_thrs");
TrackerVit::Params params;
params.net = findModel(net, sha1);
params.backend = backend;
params.target = target;
params.tracking_score_threshold = tracking_score_threshold;
tracker = TrackerVit::create(params);
} else {
cout<<"Pass the valid alias. Choices are {vit, nanotrack, dasiamrpn }."<<endl;
exit(0);
}
return;
}
// Entry point of the combined tracker sample. Flow:
//   1. parse the alias, extend `keys` with model-specific options, re-parse;
//   2. open camera/video and wait for the user to pause (SPACE) and draw a ROI;
//   3. track frame-by-frame, allowing live re-selection (SPACE) and live
//      switching between ViT ('v'), Nano ('n') and DaSiamRPN ('d') trackers.
int main(int argc, char** argv)
{
CommandLineParser parser(argc, argv, keys);
parser.about(about);
if (!parser.has("@alias") || parser.has("help"))
{
parser.printMessage();
return 0;
}
string modelName = parser.get<String>("@alias");
const string zooFile = findFile(parser.get<String>("zoo"));
// loadParser() appends model-specific options to `keys`, so the parser must
// be re-created before the model arguments can be read.
loadParser(modelName, zooFile);
parser = CommandLineParser(argc, argv, keys);
Ptr<Tracker> tracker;
createTracker(modelName, parser, tracker);
const string windowName = "TRACKING";
namedWindow(windowName, WINDOW_NORMAL);
FontFace fontFace("sans");
// Font size/weight are rescaled from the 512-px reference once the first
// frame's dimensions are known (see imgWidth == -1 branch below).
int stdSize = 20;
int stdWeight = 400;
int stdImgSize = 512;
int imgWidth = -1;
int fontSize = 50;
int fontWeight = 500;
double alpha = 0.4;
Rect selectRect;
string inputName = parser.get<String>("input");
string instructionLabel = "Press space bar to pause video to draw bounding box.";
Rect banner;
// Open a video file or an image file or a camera stream.
VideoCapture cap;
if (inputName.empty() || (isdigit(inputName[0]) && inputName.size() == 1))
{
int c = inputName.empty() ? 0 : inputName[0] - '0';
cout << "Trying to open camera #" << c << " ..." << endl;
if (!cap.open(c))
{
cout << "Capture from camera #" << c << " didn't work. Specify -i=<video> parameter to read from video file" << endl;
return 0;
}
}
else if (inputName.size())
{
string filePath = findFile(inputName);
if (!cap.open(filePath))
{
cout << "Could not open: " << inputName << endl;
return 0;
}
}
Mat image;
// ROI-selection loop: plays the stream until the user pauses with SPACE and
// draws a valid box, or quits with ESC.
for (;;)
{
cap >> image;
if (image.empty())
{
cerr << "Can't capture frame. End of video stream?" << endl;
return 0;
}
else if (imgWidth == -1){
imgWidth = min(image.rows, image.cols);
fontSize = (stdSize*imgWidth)/stdImgSize;
fontWeight = (stdWeight*imgWidth)/stdImgSize;
banner = getTextSize(Size(), instructionLabel, Point(), fontFace, fontSize, fontWeight);
banner.height += 2 * fontSize; // padding
banner.width += 10; // padding
}
// Blend a semi-transparent white banner behind the instruction text.
Mat org_img = image.clone();
rectangle(image, banner, Scalar::all(255), FILLED);
addWeighted(image, alpha, org_img, 1 - alpha, 0, image);
putText(image, instructionLabel, Point(10, fontSize), Scalar(0,0,0), fontFace, fontSize, fontWeight);
putText(image, "Press space bar after selecting.", Point(10, 2*fontSize), Scalar(0,0,0), fontFace, fontSize, fontWeight);
imshow(windowName, image);
int key = waitKey(30); //Simulating 30 FPS, if reduced frames move really fast
if (key == ' ')
{
selectRect = selectROI(windowName, image);
if (selectRect.width > 0 && selectRect.height > 0)
{
break;
}
else
{
cout << "No valid selection made. Please select again." << endl;
}
}
else if (key == 27) // ESC key to exit
{
exit(0);
}
}
cout << "ROI=" << selectRect << endl;
// NOTE(review): `image` was annotated in-place above (banner blend + text),
// so the tracker is initialized on the annotated frame, not the raw capture
// (`org_img`) — confirm this is intended.
tracker->init(image, selectRect);
instructionLabel = "Press space bar to select new target";
banner = getTextSize(Size(), instructionLabel, Point(), fontFace, fontSize, fontWeight);
banner.height += 4 * fontSize; // padding
banner.width += 10; // padding
TickMeter tickMeter;
// Main tracking loop.
for (int count = 0; ; ++count)
{
cap >> image;
if (image.empty())
{
cerr << "Can't capture frame " << count << ". End of video stream?" << endl;
break;
}
Rect rect;
tickMeter.start();
bool ok = tracker->update(image, rect);
tickMeter.stop();
float score = tracker->getTrackingScore();
Mat render_image = image.clone();
int key = waitKey(30); //Simulating 30 FPS, if reduced frames move really fast
int h = image.rows;
int w = image.cols;
// Top banner for instructions, bottom strip for the switch-tracker hint.
rectangle(render_image, banner, Scalar::all(255), FILLED);
rectangle(render_image, cv::Point(0, int(h - int(1.5*fontSize))), cv::Point(w, h), Scalar::all(255), FILLED);
addWeighted(render_image, alpha, image, 1 - alpha, 0, render_image);
putText(render_image, instructionLabel, Point(10, fontSize), Scalar(0,0,0), fontFace, fontSize, fontWeight);
putText(render_image, "For switching between trackers: press 'v' for ViT, 'n' for Nano, and 'd' for DaSiamRPN.", Point(10, h-10), Scalar(0,0,0), fontFace, int(0.8*fontSize), fontWeight);
if (ok){
if (key == ' '){
putText(render_image, "Select the new target", Point(10, 2*fontSize), Scalar(0,0,0), fontFace, fontSize, fontWeight);
selectRect = selectROI(windowName, render_image);
if (selectRect.width > 0 && selectRect.height > 0){
tracker->init(image, selectRect);
}
else{
cout<<"New target is not selected, switching to previous target"<<endl;
}
}
// Switching trackers rebuilds `keys`, the parser, and the tracker, then
// re-initializes it on the current frame with the last predicted rect.
else if (key == 'v'){
modelName = "vit";
loadParser(modelName, zooFile);
parser = CommandLineParser(argc, argv, keys);
createTracker(modelName, parser, tracker);
tracker->init(image, rect);
}
else if (key == 'n'){
modelName = "nanotrack";
loadParser(modelName, zooFile);
parser = CommandLineParser(argc, argv, keys);
createTracker(modelName, parser, tracker);
tracker->init(image, rect);
}
else if (key == 'd'){
modelName = "dasiamrpn";
loadParser(modelName, zooFile);
parser = CommandLineParser(argc, argv, keys);
createTracker(modelName, parser, tracker);
tracker->init(image, rect);
}
rectangle(render_image, rect, Scalar(0, 255, 0), 2);
}
string timeLabel = format("FPS: %.2f", tickMeter.getFPS());
string scoreLabel = format("Score: %f", score);
string algoLabel = "Algorithm: " + modelName;
putText(render_image, timeLabel, Point(10, 2*fontSize), Scalar(0,0,0), fontFace, fontSize, fontWeight);
putText(render_image, scoreLabel, Point(10, 3*fontSize), Scalar(0,0,0), fontFace, fontSize, fontWeight);
putText(render_image, algoLabel, Point(10, 4*fontSize), Scalar(0,0,0), fontFace, fontSize, fontWeight);
imshow(windowName, render_image);
tickMeter.reset();
if (key == 27 /*ESC*/)
exit(0);
}
return 0;
}

View File

@ -0,0 +1,200 @@
#!/usr/bin/env python
import sys
import cv2 as cv
import argparse
from common import *
def help():
    # Prints the usage/help text for this sample (also used as the fallback
    # when an unknown model alias is passed). The string lines are kept at
    # column 0 so the printed text is unchanged.
    print(
'''
Use this script for testing Object Tracking using OpenCV.
Firstly, download required models using the download_models.py.
To run:
nanotrack:
Download Model: python download_models.py nanotrack
Example: python object_tracker.py nanotrack
vit:
Download Model: python download_models.py vit
Example: python object_tracker.py vit
or
python object_tracker.py
dasiamrpn:
Download Model: python download_models.py dasiamrpn
Example: python object_tracker.py dasiamrpn
To switch between models in runtime, make sure all the models are downloaded using download_models.py'''
    )
def load_parser(model_name):
    """Build and run the argparse parser for the given model alias.

    Registers the zoo/input options, then pulls the model-specific
    preprocessing arguments for ``model_name`` from models.yml via
    ``add_preproc_args``. Exits with a non-zero status for an unknown alias.

    Returns the parsed ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument('--zoo', default=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models.yml'),
                        help='An optional path to file with preprocessing parameters.')
    parser.add_argument("--input", type=str, help="Path to video source")
    args, _ = parser.parse_known_args()
    # Validate the alias before reading the zoo file so an unknown model
    # fails fast (and with a failure exit code, not 0).
    if model_name not in ("vit", "nanotrack", "dasiamrpn"):
        print("Pass the valid alias. Choices are { nanotrack, vit, dasiamrpn }")
        exit(1)
    add_preproc_args(args.zoo, parser, 'object_tracker', alias=model_name)
    if model_name == "dasiamrpn":
        # DaSiamRPN has three networks, each with its own prefixed entries.
        add_preproc_args(args.zoo, parser, 'object_tracker', prefix="dasiamrpn_", alias="dasiamrpn")
        add_preproc_args(args.zoo, parser, 'object_tracker', prefix="dasiamrpn_kernel_r1_", alias="dasiamrpn")
        add_preproc_args(args.zoo, parser, 'object_tracker', prefix="dasiamrpn_kernel_cls_", alias="dasiamrpn")
    elif model_name == "nanotrack":
        # NanoTrack has two networks (backbone + head).
        add_preproc_args(args.zoo, parser, 'object_tracker', prefix="nanotrack_back_", alias="nanotrack")
        add_preproc_args(args.zoo, parser, 'object_tracker', prefix="nanotrack_head_", alias="nanotrack")
    parser = argparse.ArgumentParser(parents=[parser],
                                     description='''
Firstly, download required models using `python download_models.py {modelName}`
Run using python object_tracker.py {modelName}.
''',
                                     formatter_class=argparse.RawTextHelpFormatter)
    return parser.parse_args()
def createTracker(model_name, args):
    """Instantiate the OpenCV tracker selected by `model_name`.

    Model file locations are resolved through findModel() from the paths
    and checksums carried in `args`. Unknown aliases print the usage text
    and terminate the process.
    """
    if model_name == 'dasiamrpn':
        print("Using Dasiamrpn Tracker.")
        dasiam_params = cv.TrackerDaSiamRPN_Params()
        dasiam_params.model = findModel(args.dasiamrpn_model, args.dasiamrpn_sha1)
        dasiam_params.kernel_cls1 = findModel(args.dasiamrpn_kernel_cls_model, args.dasiamrpn_kernel_cls_sha1)
        dasiam_params.kernel_r1 = findModel(args.dasiamrpn_kernel_r1_model, args.dasiamrpn_kernel_r1_sha1)
        return cv.TrackerDaSiamRPN_create(dasiam_params)
    if model_name == 'nanotrack':
        print("Using Nano Tracker.")
        nano_params = cv.TrackerNano_Params()
        nano_params.backbone = findModel(args.nanotrack_back_model, args.nanotrack_back_sha1)
        nano_params.neckhead = findModel(args.nanotrack_head_model, args.nanotrack_head_sha1)
        return cv.TrackerNano_create(nano_params)
    if model_name == 'vit':
        print("Using Vit Tracker.")
        vit_params = cv.TrackerVit_Params()
        vit_params.net = findModel(args.model, args.sha1)
        return cv.TrackerVit_create(vit_params)
    # Unknown alias: show usage and abort.
    help()
    exit(-1)
def main(model_name, args):
    """Interactively track a user-selected target in a video/camera stream.

    Phase 1 plays the stream until the user presses the space bar and draws a
    bounding box around the target. Phase 2 tracks that target, overlaying
    FPS, tracking score and the active algorithm. At runtime the user can
    press 'v'/'n'/'d' to switch to the ViT/NanoTrack/DaSiamRPN tracker,
    space bar to select a new target, or 'q'/ESC to quit.

    Args:
        model_name: Alias of the tracker currently in use.
        args: Parsed command-line arguments (model paths, input source).
    """
    tracker = createTracker(model_name, args)
    videoPath = args.input
    print('Using video: {}'.format(videoPath))
    # Fall back to camera 0 when no --input is given.
    cap = cv.VideoCapture(cv.samples.findFile(args.input) if args.input else 0)
    if not cap.isOpened():
        print("Can't open video stream: {}".format(videoPath))
        exit(-1)

    # Text-rendering parameters; font scale/thickness are adapted to the
    # frame size once the first frame is read.
    stdSize = 0.6
    stdWeight = 2
    stdImgSize = 512
    imgWidth = -1  # -1 means "not measured yet"
    fontSize = 1.5
    fontThickness = 1
    alpha = 0.5  # blending factor for the semi-transparent label banners
    windowName = "TRACKING"
    cv.namedWindow(windowName, cv.WINDOW_NORMAL)

    # Phase 1: play the stream until the user selects a non-empty ROI.
    while True:
        ret, image = cap.read()
        if not ret:
            print("Video completed!!")
            return -1
        if imgWidth == -1:
            imgWidth = min(image.shape[:2])
            fontSize = min(fontSize, (stdSize*imgWidth)/stdImgSize)
            fontThickness = max(fontThickness, (stdWeight*imgWidth)//stdImgSize)
        label = "Press space bar to pause video to draw bounding box."
        labelSize, _ = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, fontSize, fontThickness)
        org_img = image.copy()
        # Semi-transparent white banner behind the instruction text.
        cv.rectangle(image, (0, 0), (labelSize[0]+10, labelSize[1]+int(40*fontSize)), (255,255,255), cv.FILLED)
        cv.addWeighted(image, alpha, org_img, 1 - alpha, 0, image)
        cv.putText(image, label, (10, int(25*fontSize)), cv.FONT_HERSHEY_SIMPLEX, fontSize, (0, 0, 0), fontThickness)
        cv.putText(image, "Press space bar after selecting.", (10, int(55*fontSize)), cv.FONT_HERSHEY_SIMPLEX, fontSize, (0, 0, 0), fontThickness)
        cv.imshow(windowName, image)
        key = cv.waitKey(30) & 0xFF
        if key == ord(' '):
            bbox = cv.selectROI(windowName, image)
            print('ROI: {}'.format(bbox))
            if bbox != (0, 0, 0, 0):
                break
        if key == ord('q') or key == 27:
            return

    try:
        tracker.init(image, bbox)
    except Exception as e:
        print('Unable to initialize tracker with requested bounding box. Is there any object?')
        print(e)
        # BUGFIX: previously execution fell through into the tracking loop
        # with an uninitialized tracker, crashing on tracker.update().
        exit(-1)

    # Phase 2: track the selected target frame by frame.
    tick_meter = cv.TickMeter()
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if imgWidth == -1:
            imgWidth = min(frame.shape[:2])
            fontSize = min(fontSize, (stdSize*imgWidth)/stdImgSize)
            fontThickness = max(fontThickness, (stdWeight*imgWidth)//stdImgSize)
        label = "Press space bar to select new target"
        labelSize, _ = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, fontSize, fontThickness)
        # Time only the tracker update so the FPS readout reflects tracking speed.
        tick_meter.reset()
        tick_meter.start()
        ok, newbox = tracker.update(frame)
        tick_meter.stop()
        score = tracker.getTrackingScore()
        render_image = frame.copy()
        key = cv.waitKey(30) & 0xFF
        h, w = frame.shape[:2]
        cv.rectangle(render_image, (0, 0), (labelSize[0]+10, labelSize[1]+int(100*fontSize)), (255,255,255), cv.FILLED)
        cv.rectangle(render_image, (0, int(h-45*fontSize)), (w, h), (255,255,255), cv.FILLED)
        cv.addWeighted(render_image, alpha, frame, 1 - alpha, 0, render_image)
        cv.putText(render_image, label, (10, int(25*fontSize)), cv.FONT_HERSHEY_SIMPLEX, fontSize, (0, 0, 0), fontThickness)
        cv.putText(render_image, "For switching between trackers: press 'v' for ViT, 'n' for Nanotrack, and 'd' for DaSiamRPN.", (10, h-10), cv.FONT_HERSHEY_SIMPLEX, 0.8*fontSize, (0, 0, 0), fontThickness)
        if ok:
            if key == ord(' '):
                # Re-select the target and re-initialize the current tracker.
                cv.putText(render_image, "Select the new target", (10, int(55*fontSize)), cv.FONT_HERSHEY_SIMPLEX, fontSize, (0, 0, 0), fontThickness)
                bbox = cv.selectROI(windowName, render_image)
                print('ROI:', bbox)
                if bbox != (0, 0, 0, 0):
                    tracker.init(frame, bbox)
            elif key == ord('v'):
                # Hot-swap trackers, seeding the new one with the last known box.
                model_name = "vit"
                args = load_parser(model_name)
                tracker = createTracker(model_name, args)
                tracker.init(frame, newbox)
            elif key == ord('n'):
                model_name = "nanotrack"
                args = load_parser(model_name)
                tracker = createTracker(model_name, args)
                tracker.init(frame, newbox)
            elif key == ord('d'):
                model_name = "dasiamrpn"
                args = load_parser(model_name)
                tracker = createTracker(model_name, args)
                tracker.init(frame, newbox)
            elif key == ord('q') or key == 27:
                return
            # Draw the tracked box only when the update succeeded.
            cv.rectangle(render_image, newbox, (200, 0, 0), thickness=2)
        time_label = f"FPS: {tick_meter.getFPS():.2f}"
        score_label = f"Tracking score: {score:.2f}"
        algo_label = f"Algorithm: {model_name}"
        cv.putText(render_image, time_label, (10, int(55*fontSize)), cv.FONT_HERSHEY_SIMPLEX, fontSize, (0, 0, 0), fontThickness)
        cv.putText(render_image, score_label, (10, int(85*fontSize)), cv.FONT_HERSHEY_SIMPLEX, fontSize, (0, 0, 0), fontThickness)
        cv.putText(render_image, algo_label, (10, int(115*fontSize)), cv.FONT_HERSHEY_SIMPLEX, fontSize, (0, 0, 0), fontThickness)
        cv.imshow(windowName, render_image)
        if key in [ord('q'), 27]:
            break
if __name__ == '__main__':
    help()
    # Default to the ViT tracker when no alias is given on the command line
    # (or when the first token is an option rather than an alias).
    has_alias = len(sys.argv) >= 2 and not sys.argv[1].startswith("--")
    model_name = sys.argv[1] if has_alias else "vit"
    args = load_parser(model_name)
    main(model_name, args)
    cv.destroyAllWindows()

View File

@ -1,183 +0,0 @@
// VitTracker
// model: https://github.com/opencv/opencv_zoo/tree/main/models/object_tracking_vittrack
#include <iostream>
#include <cmath>
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/video.hpp>
using namespace cv;
using namespace cv::dnn;
// Option specification for cv::CommandLineParser.
// Each "{ name alias | default | help }" entry declares one option; the
// backend/target indices mirror the cv::dnn enum values.
// NOTE(review): the stray "," after "5: CUDA }" sits between two key specs —
// it appears accidental; confirm CommandLineParser ignores inter-key text.
const char *keys =
    "{ help h | | Print help message }"
    "{ input i | | Full path to input video folder, the specific camera index. (empty for camera 0) }"
    "{ net | vitTracker.onnx | Path to onnx model of vitTracker.onnx}"
    "{ tracking_score_threshold t | 0.3 | Tracking score threshold. If a bbox of score >= 0.3, it is considered as found }"
    "{ backend | 0 | Choose one of computation backends: "
        "0: automatically (by default), "
        "1: Halide language (http://halide-lang.org/), "
        "2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
        "3: OpenCV implementation, "
        "4: VKCOM, "
        "5: CUDA },"
    "{ target | 0 | Choose one of target computation devices: "
        "0: CPU target (by default), "
        "1: OpenCL, "
        "2: OpenCL fp16 (half-float precision), "
        "3: VPU, "
        "4: Vulkan, "
        "6: CUDA, "
        "7: CUDA fp16 (half-float preprocess) }"
;
// Runs the VIT tracker demo: parses the command line, loads the ONNX model,
// opens the input (video file or camera), asks the user for an initial ROI,
// then tracks it frame by frame, drawing the predicted box and score.
// Returns 0 on normal exit, 2 on setup failure (model/input/ROI).
static
int run(int argc, char** argv)
{
    // Parse command line arguments.
    CommandLineParser parser(argc, argv, keys);

    if (parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }

    std::string inputName = parser.get<String>("input");
    std::string net = parser.get<String>("net");
    int backend = parser.get<int>("backend");
    int target = parser.get<int>("target");
    float tracking_score_threshold = parser.get<float>("tracking_score_threshold");

    // Build the tracker up front; a missing/corrupt model file surfaces here.
    Ptr<TrackerVit> tracker;
    try
    {
        TrackerVit::Params params;
        params.net = samples::findFile(net);
        params.backend = backend;
        params.target = target;
        params.tracking_score_threshold = tracking_score_threshold;
        tracker = TrackerVit::create(params);
    }
    catch (const cv::Exception& ee)
    {
        std::cerr << "Exception: " << ee.what() << std::endl;
        std::cout << "Can't load the network by using the following files:" << std::endl;
        std::cout << "net : " << net << std::endl;
        return 2;
    }

    const std::string winName = "vitTracker";
    namedWindow(winName, WINDOW_AUTOSIZE);

    // Open a video file or an image file or a camera stream.
    // A single-digit input (or an empty one) is treated as a camera index.
    VideoCapture cap;
    if (inputName.empty() || (isdigit(inputName[0]) && inputName.size() == 1))
    {
        int c = inputName.empty() ? 0 : inputName[0] - '0';
        std::cout << "Trying to open camera #" << c << " ..." << std::endl;
        if (!cap.open(c))
        {
            std::cout << "Capture from camera #" << c << " didn't work. Specify -i=<video> parameter to read from video file" << std::endl;
            return 2;
        }
    }
    else if (inputName.size())
    {
        inputName = samples::findFileOrKeep(inputName);
        if (!cap.open(inputName))
        {
            std::cout << "Could not open: " << inputName << std::endl;
            return 2;
        }
    }

    // Read the first image.
    Mat image;
    cap >> image;
    if (image.empty())
    {
        std::cerr << "Can't capture frame!" << std::endl;
        return 2;
    }

    // Let the user draw the initial bounding box on a copy of the first frame.
    Mat image_select = image.clone();
    putText(image_select, "Select initial bounding box you want to track.", Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
    putText(image_select, "And Press the ENTER key.", Point(0, 35), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));

    Rect selectRect = selectROI(winName, image_select);
    std::cout << "ROI=" << selectRect << std::endl;
    if (selectRect.empty())
    {
        std::cerr << "Invalid ROI!" << std::endl;
        return 2;
    }

    tracker->init(image, selectRect);

    // Main loop: update the tracker on every frame and overlay the result.
    TickMeter tickMeter;
    for (int count = 0; ; ++count)
    {
        cap >> image;
        if (image.empty())
        {
            std::cerr << "Can't capture frame " << count << ". End of video stream?" << std::endl;
            break;
        }

        Rect rect;
        tickMeter.start();  // time only the tracker update
        bool ok = tracker->update(image, rect);
        tickMeter.stop();

        float score = tracker->getTrackingScore();

        std::cout << "frame " << count;
        if (ok) {
            std::cout << ": predicted score=" << score <<
                    "\trect=" << rect <<
                    "\ttime=" << tickMeter.getTimeMilli() << "ms" << std::endl;

            rectangle(image, rect, Scalar(0, 255, 0), 2);

            std::string timeLabel = format("Inference time: %.2f ms", tickMeter.getTimeMilli());
            std::string scoreLabel = format("Score: %f", score);
            putText(image, timeLabel, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
            putText(image, scoreLabel, Point(0, 35), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
        } else {
            // update() reported failure — presumably the score fell below
            // tracking_score_threshold; confirm against TrackerVit docs.
            std::cout << ": target lost" << std::endl;
            putText(image, "Target lost", Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 0, 255));
        }

        imshow(winName, image);
        tickMeter.reset();  // per-frame timing, not cumulative

        int c = waitKey(1);
        if (c == 27 /*ESC*/ || c == 'q' || c == 'Q')
            break;
    }

    std::cout << "Exit" << std::endl;
    return 0;
}
// Entry point: delegate to run() and translate any uncaught C++ exception
// into a diagnostic message plus exit code 1.
int main(int argc, char **argv)
{
    try {
        return run(argc, argv);
    } catch (const std::exception& e) {
        std::cerr << "FATAL: C++ exception: " << e.what() << std::endl;
    }
    return 1;
}

View File

@ -1,173 +0,0 @@
#!/usr/bin/env python
'''
Tracker demo
For usage download models by following links
For DaSiamRPN:
network: https://www.dropbox.com/s/rr1lk9355vzolqv/dasiamrpn_model.onnx?dl=0
kernel_r1: https://www.dropbox.com/s/999cqx5zrfi7w4p/dasiamrpn_kernel_r1.onnx?dl=0
kernel_cls1: https://www.dropbox.com/s/qvmtszx5h339a0w/dasiamrpn_kernel_cls1.onnx?dl=0
For NanoTrack:
nanotrack_backbone: https://github.com/HonglinChu/SiamTrackers/blob/master/NanoTrack/models/nanotrackv2/nanotrack_backbone_sim.onnx
nanotrack_headneck: https://github.com/HonglinChu/SiamTrackers/blob/master/NanoTrack/models/nanotrackv2/nanotrack_head_sim.onnx
For VitTrack:
vitTracker: https://github.com/opencv/opencv_zoo/raw/fef72f8fa7c52eaf116d3df358d24e6e959ada0e/models/object_tracking_vittrack/object_tracking_vittrack_2023sep.onnx
USAGE:
tracker.py [-h] [--input INPUT_VIDEO]
[--tracker_algo TRACKER_ALGO mil, dasiamrpn, nanotrack, vittrack]
[--dasiamrpn_net DASIAMRPN_NET]
[--dasiamrpn_kernel_r1 DASIAMRPN_KERNEL_R1]
[--dasiamrpn_kernel_cls1 DASIAMRPN_KERNEL_CLS1]
[--dasiamrpn_backend DASIAMRPN_BACKEND]
[--dasiamrpn_target DASIAMRPN_TARGET]
[--nanotrack_backbone NANOTRACK_BACKBONE]
[--nanotrack_headneck NANOTRACK_TARGET]
[--vittrack_net VITTRACK_MODEL]
[--tracking_score_threshold TRACKING_SCORE_THRESHOLD (vittrack only)]
[--backend CHOOSE ONE OF COMPUTATION BACKEND]
[--target CHOOSE ONE OF COMPUTATION TARGET]
'''
# Python 2/3 compatibility
from __future__ import print_function
import sys
import numpy as np
import cv2 as cv
import argparse
from video import create_capture, presets
# DNN computation backends selectable via --backend; tuple order matches the
# numeric indices listed in the --backend help text.
backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV,
            cv.dnn.DNN_BACKEND_VKCOM, cv.dnn.DNN_BACKEND_CUDA)
# DNN target devices selectable via --target; tuple order matches the numeric
# indices listed in the --target help text.
targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD,
           cv.dnn.DNN_TARGET_VULKAN, cv.dnn.DNN_TARGET_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16)
class App(object):
    """Interactive tracking demo: select an ROI, then track it in a stream.

    The tracking algorithm (mil, dasiamrpn, nanotrack or vittrack) and its
    model files are chosen through the parsed command-line arguments handed
    to the constructor.
    """

    def __init__(self, args):
        # Parsed command-line arguments (model paths, backend/target, input).
        self.args = args
        self.trackerAlgorithm = args.tracker_algo
        self.tracker = self.createTracker()

    def createTracker(self):
        """Instantiate the OpenCV tracker selected by --tracker_algo."""
        # BUGFIX: this method previously read the module-level global `args`
        # for backend/target/model paths; it now consistently uses self.args
        # so the class also works when imported from another module.
        if self.trackerAlgorithm == 'mil':
            tracker = cv.TrackerMIL_create()
        elif self.trackerAlgorithm == 'dasiamrpn':
            params = cv.TrackerDaSiamRPN_Params()
            params.model = self.args.dasiamrpn_net
            params.kernel_cls1 = self.args.dasiamrpn_kernel_cls1
            params.kernel_r1 = self.args.dasiamrpn_kernel_r1
            params.backend = self.args.backend
            params.target = self.args.target
            tracker = cv.TrackerDaSiamRPN_create(params)
        elif self.trackerAlgorithm == 'nanotrack':
            params = cv.TrackerNano_Params()
            params.backbone = self.args.nanotrack_backbone
            params.neckhead = self.args.nanotrack_headneck
            params.backend = self.args.backend
            params.target = self.args.target
            tracker = cv.TrackerNano_create(params)
        elif self.trackerAlgorithm == 'vittrack':
            params = cv.TrackerVit_Params()
            params.net = self.args.vittrack_net
            params.tracking_score_threshold = self.args.tracking_score_threshold
            params.backend = self.args.backend
            params.target = self.args.target
            tracker = cv.TrackerVit_create(params)
        else:
            # BUGFIX: the old message omitted 'vittrack' from the choices.
            sys.exit("Tracker {} is not recognized. Please use one of four available: mil, dasiamrpn, nanotrack, vittrack.".format(self.trackerAlgorithm))
        return tracker

    def initializeTracker(self, image):
        """Prompt for an ROI until the tracker accepts it; exit on cancel."""
        while True:
            print('==> Select object ROI for tracker ...')
            bbox = cv.selectROI('tracking', image)
            print('ROI: {}'.format(bbox))
            if bbox[2] <= 0 or bbox[3] <= 0:
                sys.exit("ROI selection cancelled. Exiting...")
            try:
                self.tracker.init(image, bbox)
            except Exception as e:
                print('Unable to initialize tracker with requested bounding box. Is there any object?')
                print(e)
                print('Try again ...')
                continue
            return

    def run(self):
        """Open the input video, initialize the tracker, run the main loop."""
        videoPath = self.args.input
        print('Using video: {}'.format(videoPath))
        camera = create_capture(cv.samples.findFileOrKeep(videoPath), presets['cube'])
        if not camera.isOpened():
            sys.exit("Can't open video stream: {}".format(videoPath))
        ok, image = camera.read()
        if not ok:
            sys.exit("Can't read first frame")
        assert image is not None

        cv.namedWindow('tracking')
        self.initializeTracker(image)
        print("==> Tracking is started. Press 'SPACE' to re-initialize tracker or 'ESC' for exit...")

        while camera.isOpened():
            ok, image = camera.read()
            if not ok:
                print("Can't read frame")
                break
            ok, newbox = self.tracker.update(image)
            if ok:
                cv.rectangle(image, newbox, (200, 0, 0))
            cv.imshow("tracking", image)
            k = cv.waitKey(1)
            if k == 32:  # SPACE re-selects the target
                self.initializeTracker(image)
            if k == 27:  # ESC quits
                break
        print('Done')
if __name__ == '__main__':
    print(__doc__)
    parser = argparse.ArgumentParser(description="Run tracker")
    parser.add_argument("--input", type=str, default="vtest.avi", help="Path to video source")
    parser.add_argument("--tracker_algo", type=str, default="nanotrack", help="One of available tracking algorithms: mil, dasiamrpn, nanotrack, vittrack")
    parser.add_argument("--dasiamrpn_net", type=str, default="dasiamrpn_model.onnx", help="Path to onnx model of DaSiamRPN net")
    parser.add_argument("--dasiamrpn_kernel_r1", type=str, default="dasiamrpn_kernel_r1.onnx", help="Path to onnx model of DaSiamRPN kernel_r1")
    parser.add_argument("--dasiamrpn_kernel_cls1", type=str, default="dasiamrpn_kernel_cls1.onnx", help="Path to onnx model of DaSiamRPN kernel_cls1")
    parser.add_argument("--nanotrack_backbone", type=str, default="nanotrack_backbone_sim.onnx", help="Path to onnx model of NanoTrack backBone")
    parser.add_argument("--nanotrack_headneck", type=str, default="nanotrack_head_sim.onnx", help="Path to onnx model of NanoTrack headNeck")
    parser.add_argument("--vittrack_net", type=str, default="vitTracker.onnx", help="Path to onnx model of vittrack")
    # BUGFIX: provide a default matching the documented threshold; without it
    # args.tracking_score_threshold is None, and assigning None to
    # TrackerVit_Params.tracking_score_threshold fails for vittrack runs.
    parser.add_argument('--tracking_score_threshold', type=float, default=0.3, help="Tracking score threshold. If a bbox of score >= 0.3, it is considered as found ")
    parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
                        help="Choose one of computation backends: "
                             "%d: automatically (by default), "
                             "%d: Halide language (http://halide-lang.org/), "
                             "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
                             "%d: OpenCV implementation, "
                             "%d: VKCOM, "
                             "%d: CUDA"% backends)
    parser.add_argument("--target", choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
                        help="Choose one of target computation devices: "
                             '%d: CPU target (by default), '
                             '%d: OpenCL, '
                             '%d: OpenCL fp16 (half-float precision), '
                             '%d: VPU, '
                             '%d: VULKAN, '
                             '%d: CUDA, '
                             '%d: CUDA fp16 (half-float preprocess)'% targets)
    args = parser.parse_args()

    App(args).run()
    cv.destroyAllWindows()