mirror of
https://github.com/opencv/opencv.git
synced 2025-01-18 06:03:15 +08:00
Merge pull request #18033 from ieliz:dasiamrpn
Improving DaSiamRPN tracker sample * changed layerBlobs in dnn.cpp and added DaSiamRPN tracker * Improving DaSiamRPN tracker sample * Docs fix * Removed outdated changes * Trying to reinitialize tracker without reloading models. Worked with LaSOT-based benchmark with reinit rate=250 frames * Trying to reverse changes * Moving the model in the constructor * Fixing some issues with names * Variable name changed * Reverse parser arguments changes
This commit is contained in:
parent
fe9ff64d64
commit
7ec221e734
@ -14,8 +14,8 @@ import argparse
|
|||||||
import sys
|
import sys
|
||||||
|
|
||||||
class DaSiamRPNTracker:
|
class DaSiamRPNTracker:
|
||||||
#initialization of used values, initial bounding box, used network
|
# Initialization of used values, initial bounding box, used network
|
||||||
def __init__(self, im, target_pos, target_sz, net, kernel_r1, kernel_cls1):
|
def __init__(self, net="dasiamrpn_model.onnx", kernel_r1="dasiamrpn_kernel_r1.onnx", kernel_cls1="dasiamrpn_kernel_cls1.onnx"):
|
||||||
self.windowing = "cosine"
|
self.windowing = "cosine"
|
||||||
self.exemplar_size = 127
|
self.exemplar_size = 127
|
||||||
self.instance_size = 271
|
self.instance_size = 271
|
||||||
@ -28,42 +28,52 @@ class DaSiamRPNTracker:
|
|||||||
self.penalty_k = 0.055
|
self.penalty_k = 0.055
|
||||||
self.window_influence = 0.42
|
self.window_influence = 0.42
|
||||||
self.lr = 0.295
|
self.lr = 0.295
|
||||||
self.im_h = im.shape[0]
|
|
||||||
self.im_w = im.shape[1]
|
|
||||||
self.target_pos = target_pos
|
|
||||||
self.target_sz = target_sz
|
|
||||||
self.avg_chans = np.mean(im, axis=(0, 1))
|
|
||||||
self.net = net
|
|
||||||
self.score = []
|
self.score = []
|
||||||
|
|
||||||
if ((self.target_sz[0] * self.target_sz[1]) / float(self.im_h * self.im_w)) < 0.004:
|
|
||||||
raise AssertionError("Initializing BB is too small-try to restart tracker with larger BB")
|
|
||||||
|
|
||||||
self.anchor = self.__generate_anchor()
|
|
||||||
wc_z = self.target_sz[0] + self.context_amount * sum(self.target_sz)
|
|
||||||
hc_z = self.target_sz[1] + self.context_amount * sum(self.target_sz)
|
|
||||||
s_z = round(np.sqrt(wc_z * hc_z))
|
|
||||||
|
|
||||||
z_crop = self.__get_subwindow_tracking(im, self.exemplar_size, s_z)
|
|
||||||
z_crop = z_crop.transpose(2, 0, 1).reshape(1, 3, 127, 127).astype(np.float32)
|
|
||||||
self.net.setInput(z_crop)
|
|
||||||
z_f = self.net.forward('63')
|
|
||||||
kernel_r1.setInput(z_f)
|
|
||||||
r1 = kernel_r1.forward()
|
|
||||||
kernel_cls1.setInput(z_f)
|
|
||||||
cls1 = kernel_cls1.forward()
|
|
||||||
r1 = r1.reshape(20, 256, 4, 4)
|
|
||||||
cls1 = cls1.reshape(10, 256 , 4, 4)
|
|
||||||
self.net.setParam(self.net.getLayerId('65'), 0, r1)
|
|
||||||
self.net.setParam(self.net.getLayerId('68'), 0, cls1)
|
|
||||||
|
|
||||||
if self.windowing == "cosine":
|
if self.windowing == "cosine":
|
||||||
self.window = np.outer(np.hanning(self.score_size), np.hanning(self.score_size))
|
self.window = np.outer(np.hanning(self.score_size), np.hanning(self.score_size))
|
||||||
elif self.windowing == "uniform":
|
elif self.windowing == "uniform":
|
||||||
self.window = np.ones((self.score_size, self.score_size))
|
self.window = np.ones((self.score_size, self.score_size))
|
||||||
self.window = np.tile(self.window.flatten(), self.anchor_num)
|
self.window = np.tile(self.window.flatten(), self.anchor_num)
|
||||||
|
# Loading the network and kernel models
|
||||||
|
self.net = cv.dnn.readNet(net)
|
||||||
|
self.kernel_r1 = cv.dnn.readNet(kernel_r1)
|
||||||
|
self.kernel_cls1 = cv.dnn.readNet(kernel_cls1)
|
||||||
|
|
||||||
#creating anchor for tracking bounding box
|
def init(self, im, init_bb):
|
||||||
|
target_pos, target_sz = np.array([init_bb[0], init_bb[1]]), np.array([init_bb[2], init_bb[3]])
|
||||||
|
self.im_h = im.shape[0]
|
||||||
|
self.im_w = im.shape[1]
|
||||||
|
self.target_pos = target_pos
|
||||||
|
self.target_sz = target_sz
|
||||||
|
self.avg_chans = np.mean(im, axis=(0, 1))
|
||||||
|
|
||||||
|
# When we are trying to generate an ONNX model from the pre-trained .pth model
|
||||||
|
# we are using only one state of the network. In our case, we used the state
|
||||||
|
# with big bounding box, so we were forced to add assertion for
|
||||||
|
# too small bounding boxes - current state of the network can not
|
||||||
|
# work properly with such small bounding boxes
|
||||||
|
if ((self.target_sz[0] * self.target_sz[1]) / float(self.im_h * self.im_w)) < 0.004:
|
||||||
|
raise AssertionError(
|
||||||
|
"Initializing BB is too small-try to restart tracker with larger BB")
|
||||||
|
|
||||||
|
self.anchor = self.__generate_anchor()
|
||||||
|
wc_z = self.target_sz[0] + self.context_amount * sum(self.target_sz)
|
||||||
|
hc_z = self.target_sz[1] + self.context_amount * sum(self.target_sz)
|
||||||
|
s_z = round(np.sqrt(wc_z * hc_z))
|
||||||
|
z_crop = self.__get_subwindow_tracking(im, self.exemplar_size, s_z)
|
||||||
|
z_crop = z_crop.transpose(2, 0, 1).reshape(1, 3, 127, 127).astype(np.float32)
|
||||||
|
self.net.setInput(z_crop)
|
||||||
|
z_f = self.net.forward('63')
|
||||||
|
self.kernel_r1.setInput(z_f)
|
||||||
|
r1 = self.kernel_r1.forward()
|
||||||
|
self.kernel_cls1.setInput(z_f)
|
||||||
|
cls1 = self.kernel_cls1.forward()
|
||||||
|
r1 = r1.reshape(20, 256, 4, 4)
|
||||||
|
cls1 = cls1.reshape(10, 256 , 4, 4)
|
||||||
|
self.net.setParam(self.net.getLayerId('65'), 0, r1)
|
||||||
|
self.net.setParam(self.net.getLayerId('68'), 0, cls1)
|
||||||
|
|
||||||
|
# Creating anchor for tracking bounding box
|
||||||
def __generate_anchor(self):
|
def __generate_anchor(self):
|
||||||
self.anchor = np.zeros((self.anchor_num, 4), dtype = np.float32)
|
self.anchor = np.zeros((self.anchor_num, 4), dtype = np.float32)
|
||||||
size = self.total_stride * self.total_stride
|
size = self.total_stride * self.total_stride
|
||||||
@ -86,8 +96,8 @@ class DaSiamRPNTracker:
|
|||||||
self.anchor[:, 0], self.anchor[:, 1] = xx.astype(np.float32), yy.astype(np.float32)
|
self.anchor[:, 0], self.anchor[:, 1] = xx.astype(np.float32), yy.astype(np.float32)
|
||||||
return self.anchor
|
return self.anchor
|
||||||
|
|
||||||
#track function
|
# Function for updating tracker state
|
||||||
def track(self, im):
|
def update(self, im):
|
||||||
wc_z = self.target_sz[1] + self.context_amount * sum(self.target_sz)
|
wc_z = self.target_sz[1] + self.context_amount * sum(self.target_sz)
|
||||||
hc_z = self.target_sz[0] + self.context_amount * sum(self.target_sz)
|
hc_z = self.target_sz[0] + self.context_amount * sum(self.target_sz)
|
||||||
s_z = np.sqrt(wc_z * hc_z)
|
s_z = np.sqrt(wc_z * hc_z)
|
||||||
@ -96,7 +106,7 @@ class DaSiamRPNTracker:
|
|||||||
pad = d_search / scale_z
|
pad = d_search / scale_z
|
||||||
s_x = round(s_z + 2 * pad)
|
s_x = round(s_z + 2 * pad)
|
||||||
|
|
||||||
#region preprocessing
|
# Region preprocessing part
|
||||||
x_crop = self.__get_subwindow_tracking(im, self.instance_size, s_x)
|
x_crop = self.__get_subwindow_tracking(im, self.instance_size, s_x)
|
||||||
x_crop = x_crop.transpose(2, 0, 1).reshape(1, 3, 271, 271).astype(np.float32)
|
x_crop = x_crop.transpose(2, 0, 1).reshape(1, 3, 271, 271).astype(np.float32)
|
||||||
self.score = self.__tracker_eval(x_crop, scale_z)
|
self.score = self.__tracker_eval(x_crop, scale_z)
|
||||||
@ -105,7 +115,12 @@ class DaSiamRPNTracker:
|
|||||||
self.target_sz[0] = max(10, min(self.im_w, self.target_sz[0]))
|
self.target_sz[0] = max(10, min(self.im_w, self.target_sz[0]))
|
||||||
self.target_sz[1] = max(10, min(self.im_h, self.target_sz[1]))
|
self.target_sz[1] = max(10, min(self.im_h, self.target_sz[1]))
|
||||||
|
|
||||||
#update bounding box position
|
cx, cy = self.target_pos
|
||||||
|
w, h = self.target_sz
|
||||||
|
updated_bb = (cx, cy, w, h)
|
||||||
|
return True, updated_bb
|
||||||
|
|
||||||
|
# Function for updating position of the bounding box
|
||||||
def __tracker_eval(self, x_crop, scale_z):
|
def __tracker_eval(self, x_crop, scale_z):
|
||||||
target_size = self.target_sz * scale_z
|
target_size = self.target_sz * scale_z
|
||||||
self.net.setInput(x_crop)
|
self.net.setInput(x_crop)
|
||||||
@ -160,7 +175,7 @@ class DaSiamRPNTracker:
|
|||||||
y = e_x / e_x.sum(axis = 0)
|
y = e_x / e_x.sum(axis = 0)
|
||||||
return y
|
return y
|
||||||
|
|
||||||
#evaluations with cropped image
|
# Reshaping cropped image for using in the model
|
||||||
def __get_subwindow_tracking(self, im, model_size, original_sz):
|
def __get_subwindow_tracking(self, im, model_size, original_sz):
|
||||||
im_sz = im.shape
|
im_sz = im.shape
|
||||||
c = (original_sz + 1) / 2
|
c = (original_sz + 1) / 2
|
||||||
@ -171,19 +186,20 @@ class DaSiamRPNTracker:
|
|||||||
left_pad = int(max(0., -context_xmin))
|
left_pad = int(max(0., -context_xmin))
|
||||||
top_pad = int(max(0., -context_ymin))
|
top_pad = int(max(0., -context_ymin))
|
||||||
right_pad = int(max(0., context_xmax - im_sz[1] + 1))
|
right_pad = int(max(0., context_xmax - im_sz[1] + 1))
|
||||||
bottom_pad = int(max(0., context_ymax - im_sz[0] + 1))
|
bot_pad = int(max(0., context_ymax - im_sz[0] + 1))
|
||||||
context_xmin += left_pad
|
context_xmin += left_pad
|
||||||
context_xmax += left_pad
|
context_xmax += left_pad
|
||||||
context_ymin += top_pad
|
context_ymin += top_pad
|
||||||
context_ymax += top_pad
|
context_ymax += top_pad
|
||||||
r, c, k = im.shape
|
r, c, k = im.shape
|
||||||
|
|
||||||
if any([top_pad, bottom_pad, left_pad, right_pad]):
|
if any([top_pad, bot_pad, left_pad, right_pad]):
|
||||||
te_im = np.zeros((r + top_pad + bottom_pad, c + left_pad + right_pad, k), np.uint8)
|
te_im = np.zeros((
|
||||||
|
r + top_pad + bot_pad, c + left_pad + right_pad, k), np.uint8)
|
||||||
te_im[top_pad:top_pad + r, left_pad:left_pad + c, :] = im
|
te_im[top_pad:top_pad + r, left_pad:left_pad + c, :] = im
|
||||||
if top_pad:
|
if top_pad:
|
||||||
te_im[0:top_pad, left_pad:left_pad + c, :] = self.avg_chans
|
te_im[0:top_pad, left_pad:left_pad + c, :] = self.avg_chans
|
||||||
if bottom_pad:
|
if bot_pad:
|
||||||
te_im[r + top_pad:, left_pad:left_pad + c, :] = self.avg_chans
|
te_im[r + top_pad:, left_pad:left_pad + c, :] = self.avg_chans
|
||||||
if left_pad:
|
if left_pad:
|
||||||
te_im[:, 0:left_pad, :] = self.avg_chans
|
te_im[:, 0:left_pad, :] = self.avg_chans
|
||||||
@ -195,23 +211,22 @@ class DaSiamRPNTracker:
|
|||||||
|
|
||||||
if not np.array_equal(model_size, original_sz):
|
if not np.array_equal(model_size, original_sz):
|
||||||
im_patch_original = cv.resize(im_patch_original, (model_size, model_size))
|
im_patch_original = cv.resize(im_patch_original, (model_size, model_size))
|
||||||
|
|
||||||
return im_patch_original
|
return im_patch_original
|
||||||
|
|
||||||
#function for reading paths, bounding box drawing, showing results
|
# Sample for using DaSiamRPN tracker
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="Run tracker")
|
parser = argparse.ArgumentParser(description="Run tracker")
|
||||||
|
parser.add_argument("--input", type=str, help="Full path to input (empty for camera)")
|
||||||
parser.add_argument("--net", type=str, default="dasiamrpn_model.onnx", help="Full path to onnx model of net")
|
parser.add_argument("--net", type=str, default="dasiamrpn_model.onnx", help="Full path to onnx model of net")
|
||||||
parser.add_argument("--kernel_r1", type=str, default="dasiamrpn_kernel_r1.onnx", help="Full path to onnx model of kernel_r1")
|
parser.add_argument("--kernel_r1", type=str, default="dasiamrpn_kernel_r1.onnx", help="Full path to onnx model of kernel_r1")
|
||||||
parser.add_argument("--kernel_cls1", type=str, default="dasiamrpn_kernel_cls1.onnx", help="Full path to onnx model of kernel_cls1")
|
parser.add_argument("--kernel_cls1", type=str, default="dasiamrpn_kernel_cls1.onnx", help="Full path to onnx model of kernel_cls1")
|
||||||
parser.add_argument("--input", type=str, help="Full path to input. Do not use if input is camera")
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
point1 = ()
|
point1 = ()
|
||||||
point2 = ()
|
point2 = ()
|
||||||
mark = True
|
mark = True
|
||||||
drawing = False
|
drawing = False
|
||||||
cx, cy, w, h = 0.0, 0.0, 0, 0
|
cx, cy, w, h = 0.0, 0.0, 0, 0
|
||||||
|
# Function for drawing during video stream
|
||||||
def get_bb(event, x, y, flag, param):
|
def get_bb(event, x, y, flag, param):
|
||||||
nonlocal point1, point2, cx, cy, w, h, drawing, mark
|
nonlocal point1, point2, cx, cy, w, h, drawing, mark
|
||||||
|
|
||||||
@ -233,12 +248,7 @@ def main():
|
|||||||
h = abs(point1[1] - point2[1])
|
h = abs(point1[1] - point2[1])
|
||||||
mark = False
|
mark = False
|
||||||
|
|
||||||
#loading network`s and kernel`s models
|
# Creating window for visualization
|
||||||
net = cv.dnn.readNet(args.net)
|
|
||||||
kernel_r1 = cv.dnn.readNet(args.kernel_r1)
|
|
||||||
kernel_cls1 = cv.dnn.readNet(args.kernel_cls1)
|
|
||||||
|
|
||||||
#initializing bounding box
|
|
||||||
cap = cv.VideoCapture(args.input if args.input else 0)
|
cap = cv.VideoCapture(args.input if args.input else 0)
|
||||||
cv.namedWindow("DaSiamRPN")
|
cv.namedWindow("DaSiamRPN")
|
||||||
cv.setMouseCallback("DaSiamRPN", get_bb)
|
cv.setMouseCallback("DaSiamRPN", get_bb)
|
||||||
@ -257,17 +267,17 @@ def main():
|
|||||||
cv.imshow("DaSiamRPN", twin)
|
cv.imshow("DaSiamRPN", twin)
|
||||||
cv.waitKey(40)
|
cv.waitKey(40)
|
||||||
|
|
||||||
target_pos, target_sz = np.array([cx, cy]), np.array([w, h])
|
init_bb = (cx, cy, w, h)
|
||||||
tracker = DaSiamRPNTracker(frame, target_pos, target_sz, net, kernel_r1, kernel_cls1)
|
tracker = DaSiamRPNTracker(args.net, args.kernel_r1, args.kernel_cls1)
|
||||||
|
tracker.init(frame, init_bb)
|
||||||
|
|
||||||
#tracking loop
|
# Tracking loop
|
||||||
while cap.isOpened():
|
while cap.isOpened():
|
||||||
has_frame, frame = cap.read()
|
has_frame, frame = cap.read()
|
||||||
if not has_frame:
|
if not has_frame:
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
tracker.track(frame)
|
_, new_bb = tracker.update(frame)
|
||||||
w, h = tracker.target_sz
|
cx, cy, w, h = new_bb
|
||||||
cx, cy = tracker.target_pos
|
|
||||||
cv.rectangle(frame, (int(cx - w // 2), int(cy - h // 2)), (int(cx - w // 2) + int(w), int(cy - h // 2) + int(h)),(0, 255, 255), 3)
|
cv.rectangle(frame, (int(cx - w // 2), int(cy - h // 2)), (int(cx - w // 2) + int(w), int(cy - h // 2) + int(h)),(0, 255, 255), 3)
|
||||||
cv.imshow("DaSiamRPN", frame)
|
cv.imshow("DaSiamRPN", frame)
|
||||||
key = cv.waitKey(1)
|
key = cv.waitKey(1)
|
||||||
|
Loading…
Reference in New Issue
Block a user