mirror of
https://github.com/opencv/opencv.git
video: moved CSRT tracking algorithm from opencv_contrib
This commit is contained in:
parent 250b5003ee
commit 27da0bb829
@@ -1553,3 +1553,9 @@
  year = {2014},
  url = {http://www.marcozuliani.com/docs/RANSAC4Dummies.pdf}
}
@article{Lukezic_IJCV2018,
  author = {Luke{\v{z}}i{\v{c}}, Alan and Voj{\'i}{\v{r}}, Tom{\'a}{\v{s}} and {\v{C}}ehovin Zajc, Luka and Matas, Ji{\v{r}}{\'i} and Kristan, Matej},
  title = {Discriminative Correlation Filter Tracker with Channel and Spatial Reliability},
  journal = {International Journal of Computer Vision},
  year = {2018},
}
@@ -978,8 +978,69 @@ public:
    // bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE;
};


/** @brief the CSRT tracker

The implementation is based on @cite Lukezic_IJCV2018 Discriminative Correlation Filter with Channel and Spatial Reliability
*/
class CV_EXPORTS_W TrackerCSRT : public Tracker
{
protected:
    TrackerCSRT();  // use ::create()
public:
    virtual ~TrackerCSRT() CV_OVERRIDE;

    struct CV_EXPORTS_W_SIMPLE Params
    {
        CV_WRAP Params();

        CV_PROP_RW bool use_hog;
        CV_PROP_RW bool use_color_names;
        CV_PROP_RW bool use_gray;
        CV_PROP_RW bool use_rgb;
        CV_PROP_RW bool use_channel_weights;
        CV_PROP_RW bool use_segmentation;

        CV_PROP_RW std::string window_function;  //!< Window function: "hann", "cheb", "kaiser"
        CV_PROP_RW float kaiser_alpha;
        CV_PROP_RW float cheb_attenuation;

        CV_PROP_RW float template_size;
        CV_PROP_RW float gsl_sigma;
        CV_PROP_RW float hog_orientations;
        CV_PROP_RW float hog_clip;
        CV_PROP_RW float padding;
        CV_PROP_RW float filter_lr;
        CV_PROP_RW float weights_lr;
        CV_PROP_RW int num_hog_channels_used;
        CV_PROP_RW int admm_iterations;
        CV_PROP_RW int histogram_bins;
        CV_PROP_RW float histogram_lr;
        CV_PROP_RW int background_ratio;
        CV_PROP_RW int number_of_scales;
        CV_PROP_RW float scale_sigma_factor;
        CV_PROP_RW float scale_model_max_area;
        CV_PROP_RW float scale_lr;
        CV_PROP_RW float scale_step;

        CV_PROP_RW float psr_threshold;  //!< the target is reported as lost if the PSR drops below this value
    };

    /** @brief Create CSRT tracker instance
    @param parameters CSRT parameters TrackerCSRT::Params
    */
    static CV_WRAP Ptr<TrackerCSRT> create(const TrackerCSRT::Params &parameters = TrackerCSRT::Params());

    CV_WRAP virtual void setInitialMask(InputArray mask) = 0;

    //! Return estimated tracking confidence
    CV_WRAP virtual float getTrackingScore() const = 0;
};


//! @} video_track

} // cv


#endif
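A minimal usage sketch of the API declared above (added for illustration, not part of the patch; the include path and the input video are assumptions, and init()/update() come from the cv::Tracker base class):

#include <iostream>
#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/video/tracking.hpp>   // assumed location of TrackerCSRT after the move
#include <opencv2/videoio.hpp>

int main()
{
    cv::VideoCapture cap("input.avi");                 // hypothetical input video
    cv::Mat frame;
    if (!cap.read(frame))
        return 1;

    cv::Rect roi = cv::selectROI("init", frame);       // pick the initial bounding box
    cv::Ptr<cv::TrackerCSRT> tracker = cv::TrackerCSRT::create();
    tracker->init(frame, roi);

    while (cap.read(frame))
    {
        if (tracker->update(frame, roi))               // returns false once the target is lost
        {
            cv::rectangle(frame, roi, cv::Scalar(0, 255, 0), 2);
            std::cout << "score: " << tracker->getTrackingScore() << std::endl;
        }
        cv::imshow("CSRT", frame);
        if (cv::waitKey(1) == 27)
            break;
    }
    return 0;
}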
@@ -1,5 +1,6 @@
#ifdef HAVE_OPENCV_VIDEO
typedef TrackerMIL::Params TrackerMIL_Params;
typedef TrackerCSRT::Params TrackerCSRT_Params;
typedef TrackerGOTURN::Params TrackerGOTURN_Params;
typedef TrackerDaSiamRPN::Params TrackerDaSiamRPN_Params;
typedef TrackerNano::Params TrackerNano_Params;

654  modules/video/src/tracking/tracker_csrt.cpp  Normal file
@@ -0,0 +1,654 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "../precomp.hpp"

#include "opencv2/video/detail/tracking.detail.hpp"

#include "tracker_csrt_segmentation.hpp"
#include "tracker_csrt_utils.hpp"
#include "tracker_csrt_scale_estimation.hpp"

namespace cv {
namespace detail {
inline namespace tracking {

/**
* \brief Implementation of TrackerModel for CSRT algorithm
*/
class TrackerCSRTModel CV_FINAL : public TrackerModel
{
public:
    TrackerCSRTModel(){}
    ~TrackerCSRTModel(){}
protected:
    void modelEstimationImpl(const std::vector<Mat>& /*responses*/) CV_OVERRIDE {}
    void modelUpdateImpl() CV_OVERRIDE {}
};

class TrackerCSRTImpl CV_FINAL : public TrackerCSRT
{
public:
    TrackerCSRTImpl(const Params &parameters = Params());

    Params params;

    Ptr<TrackerCSRTModel> model;

    double last_score;

    // Tracker API
    virtual void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE;
    virtual bool update(InputArray image, Rect& boundingBox) CV_OVERRIDE;
    virtual void setInitialMask(InputArray mask) CV_OVERRIDE;
    virtual float getTrackingScore() const CV_OVERRIDE;

protected:
    void update_csr_filter(const Mat &image, const Mat &my_mask);
    void update_histograms(const Mat &image, const Rect &region);
    void extract_histograms(const Mat &image, cv::Rect region, Histogram &hf, Histogram &hb);
    std::vector<Mat> create_csr_filter(const std::vector<cv::Mat>
            img_features, const cv::Mat Y, const cv::Mat P);
    Mat calculate_response(const Mat &image, const std::vector<Mat> filter);
    Mat get_location_prior(const Rect roi, const Size2f target_size, const Size img_sz);
    Mat segment_region(const Mat &image, const Size &target_size, float scale_factor);
    Point2f estimate_new_position(const Mat &image);
    std::vector<Mat> get_features(const Mat &patch, const Size2i &feature_size);

    bool check_mask_area(const Mat &mat, const double obj_area);
    float current_scale_factor;
    Mat window;
    Mat yf;
    Rect2f bounding_box;
    std::vector<Mat> csr_filter;
    std::vector<float> filter_weights;
    Size2f original_target_size;
    Size2i image_size;
    Size2f template_size;
    Size2i rescaled_template_size;
    float rescale_ratio;
    Point2f object_center;
    DSST dsst;
    Histogram hist_foreground;
    Histogram hist_background;
    double p_b;
    Mat erode_element;
    Mat filter_mask;
    Mat preset_mask;
    Mat default_mask;
    float default_mask_area;
    int cell_size;
};

TrackerCSRTImpl::TrackerCSRTImpl(const TrackerCSRT::Params &parameters) :
    params(parameters)
{
    // nothing
}

void TrackerCSRTImpl::setInitialMask(InputArray mask)
{
    preset_mask = mask.getMat();
}

bool TrackerCSRTImpl::check_mask_area(const Mat &mat, const double obj_area)
{
    double threshold = 0.05;
    double mask_area = sum(mat)[0];
    if(mask_area < threshold*obj_area) {
        return false;
    }
    return true;
}

Mat TrackerCSRTImpl::calculate_response(const Mat &image, const std::vector<Mat> filter)
{
    Mat patch = get_subwindow(image, object_center, cvFloor(current_scale_factor * template_size.width),
        cvFloor(current_scale_factor * template_size.height));
    resize(patch, patch, rescaled_template_size, 0, 0, INTER_CUBIC);

    std::vector<Mat> ftrs = get_features(patch, yf.size());
    std::vector<Mat> Ffeatures = fourier_transform_features(ftrs);
    Mat resp, res;
    if(params.use_channel_weights){
        res = Mat::zeros(Ffeatures[0].size(), CV_32FC2);
        Mat resp_ch;
        Mat mul_mat;
        for(size_t i = 0; i < Ffeatures.size(); ++i) {
            mulSpectrums(Ffeatures[i], filter[i], resp_ch, 0, true);
            res += (resp_ch * filter_weights[i]);
        }
        idft(res, res, DFT_SCALE | DFT_REAL_OUTPUT);
    } else {
        res = Mat::zeros(Ffeatures[0].size(), CV_32FC2);
        Mat resp_ch;
        for(size_t i = 0; i < Ffeatures.size(); ++i) {
            mulSpectrums(Ffeatures[i], filter[i], resp_ch, 0, true);
            res = res + resp_ch;
        }
        idft(res, res, DFT_SCALE | DFT_REAL_OUTPUT);
    }
    return res;
}

void TrackerCSRTImpl::update_csr_filter(const Mat &image, const Mat &mask)
{
    Mat patch = get_subwindow(image, object_center, cvFloor(current_scale_factor * template_size.width),
        cvFloor(current_scale_factor * template_size.height));
    resize(patch, patch, rescaled_template_size, 0, 0, INTER_CUBIC);

    std::vector<Mat> ftrs = get_features(patch, yf.size());
    std::vector<Mat> Fftrs = fourier_transform_features(ftrs);
    std::vector<Mat> new_csr_filter = create_csr_filter(Fftrs, yf, mask);
    //calculate per channel weights
    if(params.use_channel_weights) {
        Mat current_resp;
        double max_val;
        float sum_weights = 0;
        std::vector<float> new_filter_weights = std::vector<float>(new_csr_filter.size());
        for(size_t i = 0; i < new_csr_filter.size(); ++i) {
            mulSpectrums(Fftrs[i], new_csr_filter[i], current_resp, 0, true);
            idft(current_resp, current_resp, DFT_SCALE | DFT_REAL_OUTPUT);
            minMaxLoc(current_resp, NULL, &max_val, NULL, NULL);
            sum_weights += static_cast<float>(max_val);
            new_filter_weights[i] = static_cast<float>(max_val);
        }
        //update filter weights with new values
        float updated_sum = 0;
        for(size_t i = 0; i < filter_weights.size(); ++i) {
            filter_weights[i] = filter_weights[i]*(1.0f - params.weights_lr) +
                params.weights_lr * (new_filter_weights[i] / sum_weights);
            updated_sum += filter_weights[i];
        }
        //normalize weights
        for(size_t i = 0; i < filter_weights.size(); ++i) {
            filter_weights[i] /= updated_sum;
        }
    }
    for(size_t i = 0; i < csr_filter.size(); ++i) {
        csr_filter[i] = (1.0f - params.filter_lr)*csr_filter[i] + params.filter_lr * new_csr_filter[i];
    }
    std::vector<Mat>().swap(ftrs);
    std::vector<Mat>().swap(Fftrs);
}


std::vector<Mat> TrackerCSRTImpl::get_features(const Mat &patch, const Size2i &feature_size)
{
    std::vector<Mat> features;
    if (params.use_hog) {
        std::vector<Mat> hog = get_features_hog(patch, cell_size);
        features.insert(features.end(), hog.begin(),
                hog.begin()+params.num_hog_channels_used);
    }
    // TODO: restore color_names feature mode
    // if (params.use_color_names) {
    //     std::vector<Mat> cn;
    //     cn = get_features_cn(patch, feature_size);
    //     features.insert(features.end(), cn.begin(), cn.end());
    // }
    if(params.use_gray) {
        Mat gray_m;
        cvtColor(patch, gray_m, COLOR_BGR2GRAY);
        resize(gray_m, gray_m, feature_size, 0, 0, INTER_CUBIC);
        gray_m.convertTo(gray_m, CV_32FC1, 1.0/255.0, -0.5);
        features.push_back(gray_m);
    }
    if(params.use_rgb) {
        std::vector<Mat> rgb_features = get_features_rgb(patch, feature_size);
        features.insert(features.end(), rgb_features.begin(), rgb_features.end());
    }

    for (size_t i = 0; i < features.size(); ++i) {
        features.at(i) = features.at(i).mul(window);
    }
    return features;
}

class ParallelCreateCSRFilter : public ParallelLoopBody {
public:
    ParallelCreateCSRFilter(
        const std::vector<cv::Mat> img_features_,
        const cv::Mat Y_,
        const cv::Mat P_,
        int admm_iterations_,
        std::vector<Mat> &result_filter_):
        result_filter(result_filter_)
    {
        this->img_features = img_features_;
        this->Y = Y_;
        this->P = P_;
        this->admm_iterations = admm_iterations_;
    }
    virtual void operator ()(const Range& range) const CV_OVERRIDE
    {
        for (int i = range.start; i < range.end; i++) {
            float mu = 5.0f;
            float beta = 3.0f;
            float mu_max = 20.0f;
            float lambda = mu / 100.0f;

            Mat F = img_features[i];

            Mat Sxy, Sxx;
            mulSpectrums(F, Y, Sxy, 0, true);
            mulSpectrums(F, F, Sxx, 0, true);

            Mat H;
            H = divide_complex_matrices(Sxy, (Sxx + lambda));
            idft(H, H, DFT_SCALE|DFT_REAL_OUTPUT);
            H = H.mul(P);
            dft(H, H, DFT_COMPLEX_OUTPUT);
            Mat L = Mat::zeros(H.size(), H.type()); //Lagrangian multiplier
            Mat G;
            for(int iteration = 0; iteration < admm_iterations; ++iteration) {
                G = divide_complex_matrices((Sxy + (mu * H) - L), (Sxx + mu));
                idft((mu * G) + L, H, DFT_SCALE | DFT_REAL_OUTPUT);
                float lm = 1.0f / (lambda+mu);
                H = H.mul(P*lm);
                dft(H, H, DFT_COMPLEX_OUTPUT);

                //Update variables for next iteration
                L = L + mu * (G - H);
                mu = min(mu_max, beta*mu);
            }
            result_filter[i] = H;
        }
    }

    ParallelCreateCSRFilter& operator=(const ParallelCreateCSRFilter &) {
        return *this;
    }

private:
    int admm_iterations;
    Mat Y;
    Mat P;
    std::vector<Mat> img_features;
    std::vector<Mat> &result_filter;
};


std::vector<Mat> TrackerCSRTImpl::create_csr_filter(
        const std::vector<cv::Mat> img_features,
        const cv::Mat Y,
        const cv::Mat P)
{
    std::vector<Mat> result_filter;
    result_filter.resize(img_features.size());
    ParallelCreateCSRFilter parallelCreateCSRFilter(img_features, Y, P,
            params.admm_iterations, result_filter);
    parallel_for_(Range(0, static_cast<int>(result_filter.size())), parallelCreateCSRFilter);

    return result_filter;
}

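// ---------------------------------------------------------------------------
// Note (summary added for readability, not part of the original sources):
// ParallelCreateCSRFilter above learns one constrained correlation filter per
// feature channel with ADMM, following @cite Lukezic_IJCV2018.  With
// Sxy = F .* conj(Y), Sxx = F .* conj(F) and P the spatial reliability mask,
// each iteration performs
//     G  = (Sxy + mu*H - L) / (Sxx + mu)                // unconstrained filter step
//     H  = DFT( IDFT(mu*G + L) .* P / (lambda + mu) )   // re-impose the spatial mask
//     L  = L + mu*(G - H)                               // Lagrange multiplier update
//     mu = min(mu_max, beta*mu)                         // increase the penalty
// create_csr_filter() simply runs this loop in parallel over the channels.
// ---------------------------------------------------------------------------
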
Mat TrackerCSRTImpl::get_location_prior(
        const Rect roi,
        const Size2f target_size,
        const Size img_sz)
{
    int x1 = cvRound(max(min(roi.x-1, img_sz.width-1), 0));
    int y1 = cvRound(max(min(roi.y-1, img_sz.height-1), 0));

    int x2 = cvRound(min(max(roi.width-1, 0), img_sz.width-1));
    int y2 = cvRound(min(max(roi.height-1, 0), img_sz.height-1));

    Size target_sz;
    target_sz.width = target_sz.height = cvFloor(min(target_size.width, target_size.height));

    double cx = x1 + (x2-x1)/2.;
    double cy = y1 + (y2-y1)/2.;
    double kernel_size_width = 1.0/(0.5*static_cast<double>(target_sz.width)*1.4142+1);
    double kernel_size_height = 1.0/(0.5*static_cast<double>(target_sz.height)*1.4142+1);

    cv::Mat kernel_weight = Mat::zeros(1 + cvFloor(y2 - y1), 1 + cvFloor(-(x1-cx) + (x2-cx)), CV_64FC1);
    for (int y = y1; y < y2+1; ++y){
        double * weightPtr = kernel_weight.ptr<double>(y);
        double tmp_y = std::pow((cy-y)*kernel_size_height, 2);
        for (int x = x1; x < x2+1; ++x){
            weightPtr[x] = kernel_epan(std::pow((cx-x)*kernel_size_width,2) + tmp_y);
        }
    }

    double max_val;
    cv::minMaxLoc(kernel_weight, NULL, &max_val, NULL, NULL);
    Mat fg_prior = kernel_weight / max_val;
    fg_prior.setTo(0.5, fg_prior < 0.5);
    fg_prior.setTo(0.9, fg_prior > 0.9);
    return fg_prior;
}

Mat TrackerCSRTImpl::segment_region(const Mat &image, const Size &target_size, float scale_factor)
{
    Rect valid_pixels;
    Mat patch = get_subwindow(image, object_center, cvFloor(scale_factor * template_size.width),
        cvFloor(scale_factor * template_size.height), &valid_pixels);
    Size2f scaled_target = Size2f(target_size.width * scale_factor,
            target_size.height * scale_factor);
    Mat fg_prior = get_location_prior(
            Rect(0,0, patch.size().width, patch.size().height),
            scaled_target, patch.size());

    std::vector<Mat> img_channels;
    split(patch, img_channels);
    std::pair<Mat, Mat> probs = Segment::computePosteriors2(img_channels, 0, 0, patch.cols, patch.rows,
            p_b, fg_prior, 1.0-fg_prior, hist_foreground, hist_background);

    Mat mask = Mat::zeros(probs.first.size(), probs.first.type());
    probs.first(valid_pixels).copyTo(mask(valid_pixels));
    double max_resp = get_max(mask);
    threshold(mask, mask, max_resp / 2.0, 1, THRESH_BINARY);
    mask.convertTo(mask, CV_32FC1, 1.0);
    return mask;
}


void TrackerCSRTImpl::extract_histograms(const Mat &image, cv::Rect region, Histogram &hf, Histogram &hb)
{
    // get coordinates of the region
    int x1 = std::min(std::max(0, region.x), image.cols-1);
    int y1 = std::min(std::max(0, region.y), image.rows-1);
    int x2 = std::min(std::max(0, region.x + region.width), image.cols-1);
    int y2 = std::min(std::max(0, region.y + region.height), image.rows-1);

    // calculate coordinates of the background region
    int offsetX = (x2-x1+1) / params.background_ratio;
    int offsetY = (y2-y1+1) / params.background_ratio;
    int outer_y1 = std::max(0, (int)(y1-offsetY));
    int outer_y2 = std::min(image.rows, (int)(y2+offsetY+1));
    int outer_x1 = std::max(0, (int)(x1-offsetX));
    int outer_x2 = std::min(image.cols, (int)(x2+offsetX+1));

    // calculate probability for the background
    p_b = 1.0 - ((x2-x1+1) * (y2-y1+1)) /
        ((double) (outer_x2-outer_x1+1) * (outer_y2-outer_y1+1));

    // split multi-channel image into the std::vector of matrices
    std::vector<Mat> img_channels(image.channels());
    split(image, img_channels);
    for(size_t k=0; k<img_channels.size(); k++) {
        img_channels.at(k).convertTo(img_channels.at(k), CV_8UC1);
    }

    hf.extractForegroundHistogram(img_channels, Mat(), false, x1, y1, x2, y2);
    hb.extractBackGroundHistogram(img_channels, x1, y1, x2, y2,
            outer_x1, outer_y1, outer_x2, outer_y2);
    std::vector<Mat>().swap(img_channels);
}
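// ---------------------------------------------------------------------------
// Note (worked number added for readability, not part of the original sources):
// with the default background_ratio = 2 the background region above extends the
// target box by half of its width/height on every side, so away from the image
// border
//     p_b = 1 - (w*h) / (2w * 2h) = 0.75
// i.e. roughly three quarters of the sampled area is treated as background.
// ---------------------------------------------------------------------------
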
void TrackerCSRTImpl::update_histograms(const Mat &image, const Rect &region)
{
    // create temporary histograms
    Histogram hf(image.channels(), params.histogram_bins);
    Histogram hb(image.channels(), params.histogram_bins);
    extract_histograms(image, region, hf, hb);

    // get histogram vectors from temporary histograms
    std::vector<double> hf_vect_new = hf.getHistogramVector();
    std::vector<double> hb_vect_new = hb.getHistogramVector();
    // get histogram vectors from learned histograms
    std::vector<double> hf_vect = hist_foreground.getHistogramVector();
    std::vector<double> hb_vect = hist_background.getHistogramVector();

    // update histograms - use learning rate
    for(size_t i=0; i<hf_vect.size(); i++) {
        hf_vect_new[i] = (1-params.histogram_lr)*hf_vect[i] +
            params.histogram_lr*hf_vect_new[i];
        hb_vect_new[i] = (1-params.histogram_lr)*hb_vect[i] +
            params.histogram_lr*hb_vect_new[i];
    }

    // set learned histograms
    hist_foreground.setHistogramVector(&hf_vect_new[0]);
    hist_background.setHistogramVector(&hb_vect_new[0]);

    std::vector<double>().swap(hf_vect);
    std::vector<double>().swap(hb_vect);
}

Point2f TrackerCSRTImpl::estimate_new_position(const Mat &image)
{

    Mat resp = calculate_response(image, csr_filter);

    Point max_loc;
    minMaxLoc(resp, NULL, &last_score, NULL, &max_loc);
    if (last_score < params.psr_threshold)
        return Point2f(-1,-1); // target "lost"

    // take into account also subpixel accuracy
    float col = ((float) max_loc.x) + subpixel_peak(resp, "horizontal", max_loc);
    float row = ((float) max_loc.y) + subpixel_peak(resp, "vertical", max_loc);
    if(row + 1 > (float)resp.rows / 2.0f) {
        row = row - resp.rows;
    }
    if(col + 1 > (float)resp.cols / 2.0f) {
        col = col - resp.cols;
    }
    // calculate x and y displacements
    Point2f new_center = object_center + Point2f(current_scale_factor * (1.0f / rescale_ratio) *cell_size*(col),
            current_scale_factor * (1.0f / rescale_ratio) *cell_size*(row));
    //sanity checks
    if(new_center.x < 0)
        new_center.x = 0;
    if(new_center.x >= image_size.width)
        new_center.x = static_cast<float>(image_size.width - 1);
    if(new_center.y < 0)
        new_center.y = 0;
    if(new_center.y >= image_size.height)
        new_center.y = static_cast<float>(image_size.height - 1);

    return new_center;
}

// *********************************************************************
// *                       Update API function                         *
// *********************************************************************
bool TrackerCSRTImpl::update(InputArray image_, Rect& boundingBox)
{
    Mat image;
    if(image_.channels() == 1)    //treat gray image as color image
        cvtColor(image_, image, COLOR_GRAY2BGR);
    else
        image = image_.getMat();

    object_center = estimate_new_position(image);
    if (object_center.x < 0 && object_center.y < 0)
        return false;

    current_scale_factor = dsst.getScale(image, object_center);
    //update bounding_box according to new scale and location
    bounding_box.x = object_center.x - current_scale_factor * original_target_size.width / 2.0f;
    bounding_box.y = object_center.y - current_scale_factor * original_target_size.height / 2.0f;
    bounding_box.width = current_scale_factor * original_target_size.width;
    bounding_box.height = current_scale_factor * original_target_size.height;

    //update tracker
    if(params.use_segmentation) {
        Mat hsv_img = bgr2hsv(image);
        update_histograms(hsv_img, bounding_box);
        filter_mask = segment_region(hsv_img, original_target_size, current_scale_factor);
        resize(filter_mask, filter_mask, yf.size(), 0, 0, INTER_NEAREST);
        if(check_mask_area(filter_mask, default_mask_area)) {
            dilate(filter_mask, filter_mask, erode_element);
        } else {
            filter_mask = default_mask;
        }
    } else {
        filter_mask = default_mask;
    }
    update_csr_filter(image, filter_mask);
    dsst.update(image, object_center);
    boundingBox = bounding_box;
    return true;
}

float TrackerCSRTImpl::getTrackingScore() const
{
    return static_cast<float>(last_score);
}

// *********************************************************************
// *                        Init API function                          *
// *********************************************************************
void TrackerCSRTImpl::init(InputArray image_, const Rect& boundingBox)
{
    Mat image;
    if(image_.channels() == 1)    //treat gray image as color image
        cvtColor(image_, image, COLOR_GRAY2BGR);
    else
        image = image_.getMat();

    current_scale_factor = 1.0;
    image_size = image.size();
    bounding_box = boundingBox;
    cell_size = cvFloor(std::min(4.0, std::max(1.0, static_cast<double>(
        cvCeil((bounding_box.width * bounding_box.height)/400.0)))));
    original_target_size = Size(bounding_box.size());

    template_size.width = static_cast<float>(cvFloor(original_target_size.width + params.padding *
            sqrt(original_target_size.width * original_target_size.height)));
    template_size.height = static_cast<float>(cvFloor(original_target_size.height + params.padding *
            sqrt(original_target_size.width * original_target_size.height)));
    template_size.width = template_size.height =
        (template_size.width + template_size.height) / 2.0f;
    rescale_ratio = sqrt((params.template_size * params.template_size) / (template_size.width * template_size.height));
    if(rescale_ratio > 1) {
        rescale_ratio = 1;
    }
    rescaled_template_size = Size2i(cvFloor(template_size.width * rescale_ratio),
            cvFloor(template_size.height * rescale_ratio));
    object_center = Point2f(static_cast<float>(boundingBox.x) + original_target_size.width / 2.0f,
            static_cast<float>(boundingBox.y) + original_target_size.height / 2.0f);

    yf = gaussian_shaped_labels(params.gsl_sigma,
            rescaled_template_size.width / cell_size, rescaled_template_size.height / cell_size);
    if(params.window_function.compare("hann") == 0) {
        window = get_hann_win(Size(yf.cols,yf.rows));
    } else if(params.window_function.compare("cheb") == 0) {
        window = get_chebyshev_win(Size(yf.cols,yf.rows), params.cheb_attenuation);
    } else if(params.window_function.compare("kaiser") == 0) {
        window = get_kaiser_win(Size(yf.cols,yf.rows), params.kaiser_alpha);
    } else {
        CV_Error(Error::StsBadArg, "Not a valid window function");
    }

    Size2i scaled_obj_size = Size2i(cvFloor(original_target_size.width * rescale_ratio / cell_size),
            cvFloor(original_target_size.height * rescale_ratio / cell_size));
    //set dummy mask and area
    int x0 = std::max((yf.size().width - scaled_obj_size.width)/2 - 1, 0);
    int y0 = std::max((yf.size().height - scaled_obj_size.height)/2 - 1, 0);
    default_mask = Mat::zeros(yf.size(), CV_32FC1);
    default_mask(Rect(x0,y0,scaled_obj_size.width, scaled_obj_size.height)) = 1.0f;
    default_mask_area = static_cast<float>(sum(default_mask)[0]);

    //initialize segmentation
    if(params.use_segmentation) {
        Mat hsv_img = bgr2hsv(image);
        hist_foreground = Histogram(hsv_img.channels(), params.histogram_bins);
        hist_background = Histogram(hsv_img.channels(), params.histogram_bins);
        extract_histograms(hsv_img, bounding_box, hist_foreground, hist_background);
        filter_mask = segment_region(hsv_img, original_target_size, current_scale_factor);
        //update calculated mask with preset mask
        if(preset_mask.data){
            Mat preset_mask_padded = Mat::zeros(filter_mask.size(), filter_mask.type());
            int sx = std::max((int)cvFloor(preset_mask_padded.cols / 2.0f - preset_mask.cols / 2.0f) - 1, 0);
            int sy = std::max((int)cvFloor(preset_mask_padded.rows / 2.0f - preset_mask.rows / 2.0f) - 1, 0);
            preset_mask.copyTo(preset_mask_padded(
                Rect(sx, sy, preset_mask.cols, preset_mask.rows)));
            filter_mask = filter_mask.mul(preset_mask_padded);
        }
        erode_element = getStructuringElement(MORPH_ELLIPSE, Size(3,3), Point(1,1));
        resize(filter_mask, filter_mask, yf.size(), 0, 0, INTER_NEAREST);
        if(check_mask_area(filter_mask, default_mask_area)) {
            dilate(filter_mask, filter_mask, erode_element);
        } else {
            filter_mask = default_mask;
        }

    } else {
        filter_mask = default_mask;
    }

    //initialize filter
    Mat patch = get_subwindow(image, object_center, cvFloor(current_scale_factor * template_size.width),
        cvFloor(current_scale_factor * template_size.height));
    resize(patch, patch, rescaled_template_size, 0, 0, INTER_CUBIC);
    std::vector<Mat> patch_ftrs = get_features(patch, yf.size());
    std::vector<Mat> Fftrs = fourier_transform_features(patch_ftrs);
    csr_filter = create_csr_filter(Fftrs, yf, filter_mask);

    if(params.use_channel_weights) {
        Mat current_resp;
        filter_weights = std::vector<float>(csr_filter.size());
        float chw_sum = 0;
        for (size_t i = 0; i < csr_filter.size(); ++i) {
            mulSpectrums(Fftrs[i], csr_filter[i], current_resp, 0, true);
            idft(current_resp, current_resp, DFT_SCALE | DFT_REAL_OUTPUT);
            double max_val;
            minMaxLoc(current_resp, NULL, &max_val, NULL, NULL);
            chw_sum += static_cast<float>(max_val);
            filter_weights[i] = static_cast<float>(max_val);
        }
        for (size_t i = 0; i < filter_weights.size(); ++i) {
            filter_weights[i] /= chw_sum;
        }
    }

    //initialize scale search
    dsst = DSST(image, bounding_box, template_size, params.number_of_scales, params.scale_step,
            params.scale_model_max_area, params.scale_sigma_factor, params.scale_lr);

    model = makePtr<TrackerCSRTModel>();
}

}}}  // cv::detail::tracking

//==============================================================================

namespace cv {

TrackerCSRT::Params::Params()
{
    use_channel_weights = true;
    use_segmentation = true;
    use_hog = true;
    use_color_names = true;
    use_gray = true;
    use_rgb = false;
    window_function = "hann";
    kaiser_alpha = 3.75f;
    cheb_attenuation = 45;
    padding = 3.0f;
    template_size = 200;
    gsl_sigma = 1.0f;
    hog_orientations = 9;
    hog_clip = 0.2f;
    num_hog_channels_used = 18;
    filter_lr = 0.02f;
    weights_lr = 0.02f;
    admm_iterations = 4;
    number_of_scales = 33;
    scale_sigma_factor = 0.250f;
    scale_model_max_area = 512.0f;
    scale_lr = 0.025f;
    scale_step = 1.020f;
    histogram_bins = 16;
    background_ratio = 2;
    histogram_lr = 0.04f;
    psr_threshold = 0.035f;
}


TrackerCSRT::TrackerCSRT() { }

TrackerCSRT::~TrackerCSRT() { }

Ptr<TrackerCSRT> TrackerCSRT::create(const TrackerCSRT::Params &parameters)
{
    return makePtr<cv::detail::tracking::TrackerCSRTImpl>(parameters);
}

} // cv::
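The defaults above can be overridden before the tracker is created; a small sketch (added for illustration, not part of the patch; the chosen values are arbitrary):

cv::TrackerCSRT::Params p;           // starts from the defaults set in Params::Params()
p.window_function = "kaiser";        // accepted values: "hann", "cheb", "kaiser"
p.kaiser_alpha    = 3.75f;
p.use_segmentation = false;          // skip the color-segmentation mask update
p.psr_threshold    = 0.05f;          // declare the target lost earlier than the default 0.035
cv::Ptr<cv::TrackerCSRT> tracker = cv::TrackerCSRT::create(p);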

203  modules/video/src/tracking/tracker_csrt_scale_estimation.cpp  Normal file
@@ -0,0 +1,203 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "../precomp.hpp"

#include "tracker_csrt_scale_estimation.hpp"
#include "tracker_csrt_utils.hpp"

//Discriminative Scale Space Tracking
namespace cv
{

class ParallelGetScaleFeatures : public ParallelLoopBody
{
public:
    ParallelGetScaleFeatures(
        Mat img_,
        Point2f pos_,
        Size2f base_target_sz_,
        float current_scale_,
        std::vector<float> &scale_factors_,
        Mat scale_window_,
        Size scale_model_sz_,
        int col_len_,
        Mat &result_)
    {
        this->img = img_;
        this->pos = pos_;
        this->base_target_sz = base_target_sz_;
        this->current_scale = current_scale_;
        this->scale_factors = scale_factors_;
        this->scale_window = scale_window_;
        this->scale_model_sz = scale_model_sz_;
        this->col_len = col_len_;
        this->result = result_;
    }
    virtual void operator ()(const Range& range) const CV_OVERRIDE
    {
        for (int s = range.start; s < range.end; s++) {
            Size patch_sz = Size(static_cast<int>(current_scale * scale_factors[s] * base_target_sz.width),
                    static_cast<int>(current_scale * scale_factors[s] * base_target_sz.height));
            Mat img_patch = get_subwindow(img, pos, patch_sz.width, patch_sz.height);
            img_patch.convertTo(img_patch, CV_32FC3);
            resize(img_patch, img_patch, Size(scale_model_sz.width, scale_model_sz.height), 0, 0, INTER_LINEAR);
            std::vector<Mat> hog;
            hog = get_features_hog(img_patch, 4);
            for (int i = 0; i < static_cast<int>(hog.size()); ++i) {
                hog[i] = hog[i].t();
                hog[i] = scale_window.at<float>(0,s) * hog[i].reshape(0, col_len);
                hog[i].copyTo(result(Rect(Point(s, i*col_len), hog[i].size())));
            }
        }
    }

    ParallelGetScaleFeatures& operator=(const ParallelGetScaleFeatures &) {
        return *this;
    }

private:
    Mat img;
    Point2f pos;
    Size2f base_target_sz;
    float current_scale;
    std::vector<float> scale_factors;
    Mat scale_window;
    Size scale_model_sz;
    int col_len;
    Mat result;
};


DSST::DSST(const Mat &image,
        Rect2f bounding_box,
        Size2f template_size,
        int numberOfScales,
        float scaleStep,
        float maxModelArea,
        float sigmaFactor,
        float scaleLearnRate):
    scales_count(numberOfScales), scale_step(scaleStep), max_model_area(maxModelArea),
    sigma_factor(sigmaFactor), learn_rate(scaleLearnRate)
{
    original_targ_sz = bounding_box.size();
    Point2f object_center = Point2f(
        bounding_box.x + static_cast<float>(original_targ_sz.width) / 2.f,
        bounding_box.y + static_cast<float>(original_targ_sz.height) / 2.f
    );

    current_scale_factor = 1.0;
    if(scales_count % 2 == 0)
        scales_count++;

    scale_sigma = static_cast<float>(sqrt(scales_count) * sigma_factor);

    min_scale_factor = static_cast<float>(pow(scale_step,
            cvCeil(log(max(5.0 / template_size.width, 5.0 / template_size.height)) / log(scale_step))));
    max_scale_factor = static_cast<float>(pow(scale_step,
            cvFloor(log(min((float)image.rows / (float)bounding_box.width,
            (float)image.cols / (float)bounding_box.height)) / log(scale_step))));
    ys = Mat(1, scales_count, CV_32FC1);
    float ss, sf;
    for(int i = 0; i < ys.cols; ++i) {
        ss = (float)(i+1) - cvCeil((float)scales_count / 2.0f);
        ys.at<float>(0,i) = static_cast<float>(exp(-0.5 * pow(ss,2) / pow(scale_sigma,2)));
        sf = static_cast<float>(i + 1);
        scale_factors.push_back(pow(scale_step, cvCeil((float)scales_count / 2.0f) - sf));
    }

    scale_window = get_hann_win(Size(scales_count, 1));

    float scale_model_factor = 1.0;
    if(template_size.width * template_size.height * pow(scale_model_factor, 2) > max_model_area)
    {
        scale_model_factor = sqrt(max_model_area /
                (template_size.width * template_size.height));
    }
    scale_model_sz = Size(cvFloor(template_size.width * scale_model_factor),
            cvFloor(template_size.height * scale_model_factor));

    Mat scale_resp = get_scale_features(image, object_center, original_targ_sz, current_scale_factor);

    Mat ysf_row = Mat(ys.size(), CV_32FC2);
    dft(ys, ysf_row, DFT_ROWS | DFT_COMPLEX_OUTPUT, 0);
    ysf = repeat(ysf_row, scale_resp.rows, 1);
    Mat Fscale_resp;
    dft(scale_resp, Fscale_resp, DFT_ROWS | DFT_COMPLEX_OUTPUT);
    mulSpectrums(ysf, Fscale_resp, sf_num, 0, true);
    Mat sf_den_all;
    mulSpectrums(Fscale_resp, Fscale_resp, sf_den_all, 0, true);
    reduce(sf_den_all, sf_den, 0, REDUCE_SUM, -1);
}

DSST::~DSST()
{
}

Mat DSST::get_scale_features(Mat img, Point2f pos, Size2f base_target_sz, float current_scale)
{
    Mat result;
    int col_len = 0;
    Size patch_sz = Size(cvFloor(current_scale * scale_factors[0] * base_target_sz.width),
            cvFloor(current_scale * scale_factors[0] * base_target_sz.height));
    Mat img_patch = get_subwindow(img, pos, patch_sz.width, patch_sz.height);
    img_patch.convertTo(img_patch, CV_32FC3);
    resize(img_patch, img_patch, Size(scale_model_sz.width, scale_model_sz.height), 0, 0, INTER_LINEAR);
    std::vector<Mat> hog;
    hog = get_features_hog(img_patch, 4);
    result = Mat(Size((int)scale_factors.size(), hog[0].cols * hog[0].rows * (int)hog.size()), CV_32F);
    col_len = hog[0].cols * hog[0].rows;
    for (int i = 0; i < static_cast<int>(hog.size()); ++i) {
        hog[i] = hog[i].t();
        hog[i] = scale_window.at<float>(0,0) * hog[i].reshape(0, col_len);
        hog[i].copyTo(result(Rect(Point(0, i*col_len), hog[i].size())));
    }

    ParallelGetScaleFeatures parallelGetScaleFeatures(img, pos, base_target_sz,
            current_scale, scale_factors, scale_window, scale_model_sz, col_len, result);
    parallel_for_(Range(1, static_cast<int>(scale_factors.size())), parallelGetScaleFeatures);
    return result;
}

void DSST::update(const Mat &image, const Point2f object_center)
{
    Mat scale_features = get_scale_features(image, object_center, original_targ_sz, current_scale_factor);
    Mat Fscale_features;
    dft(scale_features, Fscale_features, DFT_ROWS | DFT_COMPLEX_OUTPUT);
    Mat new_sf_num;
    Mat new_sf_den;
    Mat new_sf_den_all;
    mulSpectrums(ysf, Fscale_features, new_sf_num, DFT_ROWS, true);
    Mat sf_den_all;
    mulSpectrums(Fscale_features, Fscale_features, new_sf_den_all, DFT_ROWS, true);
    reduce(new_sf_den_all, new_sf_den, 0, REDUCE_SUM, -1);

    sf_num = (1 - learn_rate) * sf_num + learn_rate * new_sf_num;
    sf_den = (1 - learn_rate) * sf_den + learn_rate * new_sf_den;
}

float DSST::getScale(const Mat &image, const Point2f object_center)
{
    Mat scale_features = get_scale_features(image, object_center, original_targ_sz, current_scale_factor);

    Mat Fscale_features;
    dft(scale_features, Fscale_features, DFT_ROWS | DFT_COMPLEX_OUTPUT);

    mulSpectrums(Fscale_features, sf_num, Fscale_features, 0, false);
    Mat scale_resp;
    reduce(Fscale_features, scale_resp, 0, REDUCE_SUM, -1);
    scale_resp = divide_complex_matrices(scale_resp, sf_den + 0.01f);
    idft(scale_resp, scale_resp, DFT_REAL_OUTPUT|DFT_SCALE);
    Point max_loc;
    minMaxLoc(scale_resp, NULL, NULL, NULL, &max_loc);

    current_scale_factor *= scale_factors[max_loc.x];
    if(current_scale_factor < min_scale_factor)
        current_scale_factor = min_scale_factor;
    else if(current_scale_factor > max_scale_factor)
        current_scale_factor = max_scale_factor;

    return current_scale_factor;
}
} /* namespace cv */
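A worked example of the scale grid built in the DSST constructor above (a standalone sketch added for illustration, not part of the patch; it reproduces the scale_factors loop with the tracker defaults number_of_scales = 33 and scale_step = 1.02):

#include <cmath>
#include <cstdio>

int main()
{
    const int   scales_count = 33;     // default number_of_scales
    const float scale_step   = 1.02f;  // default scale_step
    for (int i = 0; i < scales_count; ++i)
    {
        float sf     = static_cast<float>(i + 1);
        float factor = std::pow(scale_step, std::ceil(scales_count / 2.0f) - sf);
        std::printf("scale %2d: %.4f\n", i, factor);   // runs from ~1.373 down to ~0.728
    }
    return 0;
}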

46  modules/video/src/tracking/tracker_csrt_scale_estimation.hpp  Normal file
@@ -0,0 +1,46 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_TRACKER_CSRT_SCALE_ESTIMATION
#define OPENCV_TRACKER_CSRT_SCALE_ESTIMATION

#include "opencv2/core/mat.hpp"

namespace cv
{

class DSST {
public:
    DSST() {}
    DSST(const Mat &image, Rect2f bounding_box, Size2f template_size, int numberOfScales,
            float scaleStep, float maxModelArea, float sigmaFactor, float scaleLearnRate);
    ~DSST();
    void update(const Mat &image, const Point2f objectCenter);
    float getScale(const Mat &image, const Point2f objectCenter);
private:
    Mat get_scale_features(Mat img, Point2f pos, Size2f base_target_sz, float current_scale);

    Size scale_model_sz;
    Mat ys;
    Mat ysf;
    Mat scale_window;
    std::vector<float> scale_factors;
    Mat sf_num;
    Mat sf_den;
    float scale_sigma;
    float min_scale_factor;
    float max_scale_factor;
    float current_scale_factor;
    int scales_count;
    float scale_step;
    float max_model_area;
    float sigma_factor;
    float learn_rate;

    Size original_targ_sz;
};

} /* namespace cv */

#endif

450  modules/video/src/tracking/tracker_csrt_segmentation.cpp  Normal file
@ -0,0 +1,450 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include "../precomp.hpp"
|
||||
|
||||
#include "tracker_csrt_segmentation.hpp"
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
//-------------------- HISTOGRAM CLASS --------------------
|
||||
namespace cv
|
||||
{
|
||||
|
||||
Histogram::Histogram(int numDimensions, int numBinsPerDimension)
|
||||
{
|
||||
m_numBinsPerDim = numBinsPerDimension;
|
||||
m_numDim = numDimensions;
|
||||
p_size = cvFloor(std::pow(m_numBinsPerDim, m_numDim));
|
||||
p_bins.resize(p_size, 0);
|
||||
p_dimIdCoef.resize(m_numDim, 1);
|
||||
for (int i = 0; i < m_numDim-1; ++i)
|
||||
p_dimIdCoef[i] = static_cast<int>(std::pow(numBinsPerDimension, m_numDim - 1 - i));
|
||||
|
||||
}
|
||||
|
||||
void Histogram::extractForegroundHistogram(std::vector<cv::Mat> & imgChannels,
|
||||
cv::Mat weights, bool useMatWeights, int x1, int y1, int x2, int y2)
|
||||
{
|
||||
//just for code clarity
|
||||
cv::Mat & img = imgChannels[0];
|
||||
|
||||
if (!useMatWeights){
|
||||
//weights are epanechnikov distr. with peek at the center of the image;
|
||||
double cx = x1 + (x2-x1)/2.;
|
||||
double cy = y1 + (y2-y1)/2.;
|
||||
double kernelSize_width = 1.0/(0.5*static_cast<double>(x2-x1)*1.4142+1); //sqrt(2)
|
||||
double kernelSize_height = 1.0/(0.5*static_cast<double>(y2-y1)*1.4142+1);
|
||||
|
||||
cv::Mat kernelWeight(img.rows, img.cols, CV_64FC1);
|
||||
for (int y = y1; y < y2+1; ++y){
|
||||
double * weightPtr = kernelWeight.ptr<double>(y);
|
||||
double tmp_y = std::pow((cy-y)*kernelSize_height, 2);
|
||||
for (int x = x1; x < x2+1; ++x){
|
||||
weightPtr[x] = kernelProfile_Epanechnikov(std::pow((cx-x)*kernelSize_width,2) + tmp_y);
|
||||
}
|
||||
}
|
||||
weights = kernelWeight;
|
||||
}
|
||||
//extract pixel values and compute histogram
|
||||
double rangePerBinInverse = static_cast<double>(m_numBinsPerDim)/256.0; // 1 / (imgRange/numBinsPerDim)
|
||||
double sum = 0;
|
||||
for (int y = y1; y < y2+1; ++y){
|
||||
std::vector<const uchar *> dataPtr(m_numDim);
|
||||
for (int dim = 0; dim < m_numDim; ++dim)
|
||||
dataPtr[dim] = imgChannels[dim].ptr<uchar>(y);
|
||||
const double * weightPtr = weights.ptr<double>(y);
|
||||
|
||||
for (int x = x1; x < x2+1; ++x){
|
||||
int id = 0;
|
||||
for (int dim = 0; dim < m_numDim; ++dim){
|
||||
id += p_dimIdCoef[dim]*cvFloor(rangePerBinInverse*dataPtr[dim][x]);
|
||||
}
|
||||
p_bins[id] += weightPtr[x];
|
||||
sum += weightPtr[x];
|
||||
}
|
||||
}
|
||||
//normalize
|
||||
sum = 1./sum;
|
||||
for(int i = 0; i < p_size; ++i)
|
||||
p_bins[i] *= sum;
|
||||
}
|
||||
|
||||
void Histogram::extractBackGroundHistogram(
|
||||
std::vector<cv::Mat> & imgChannels,
|
||||
int x1, int y1, int x2, int y2,
|
||||
int outer_x1, int outer_y1, int outer_x2, int outer_y2)
|
||||
{
|
||||
//extract pixel values and compute histogram
|
||||
double rangePerBinInverse = static_cast<double>(m_numBinsPerDim)/256.0; // 1 / (imgRange/numBinsPerDim)
|
||||
double sum = 0;
|
||||
for (int y = outer_y1; y < outer_y2; ++y){
|
||||
|
||||
std::vector<const uchar *> dataPtr(m_numDim);
|
||||
for (int dim = 0; dim < m_numDim; ++dim)
|
||||
dataPtr[dim] = imgChannels[dim].ptr<uchar>(y);
|
||||
|
||||
for (int x = outer_x1; x < outer_x2; ++x){
|
||||
if (x >= x1 && x <= x2 && y >= y1 && y <= y2)
|
||||
continue;
|
||||
|
||||
int id = 0;
|
||||
for (int dim = 0; dim < m_numDim; ++dim){
|
||||
id += p_dimIdCoef[dim]*cvFloor(rangePerBinInverse*dataPtr[dim][x]);
|
||||
}
|
||||
p_bins[id] += 1.0;
|
||||
sum += 1.0;
|
||||
}
|
||||
}
|
||||
//normalize
|
||||
sum = 1./sum;
|
||||
for(int i = 0; i < p_size; ++i)
|
||||
p_bins[i] *= sum;
|
||||
}
|
||||
|
||||
cv::Mat Histogram::backProject(std::vector<cv::Mat> & imgChannels)
|
||||
{
|
||||
//just for code clarity
|
||||
cv::Mat & img = imgChannels[0];
|
||||
|
||||
cv::Mat backProject(img.rows, img.cols, CV_64FC1);
|
||||
double rangePerBinInverse = static_cast<double>(m_numBinsPerDim)/256.0; // 1 / (imgRange/numBinsPerDim)
|
||||
|
||||
for (int y = 0; y < img.rows; ++y){
|
||||
double * backProjectPtr = backProject.ptr<double>(y);
|
||||
std::vector<const uchar *> dataPtr(m_numDim);
|
||||
for (int dim = 0; dim < m_numDim; ++dim)
|
||||
dataPtr[dim] = imgChannels[dim].ptr<uchar>(y);
|
||||
|
||||
for (int x = 0; x < img.cols; ++x){
|
||||
int id = 0;
|
||||
for (int dim = 0; dim < m_numDim; ++dim){
|
||||
id += p_dimIdCoef[dim]*cvFloor(rangePerBinInverse*dataPtr[dim][x]);
|
||||
}
|
||||
backProjectPtr[x] = p_bins[id];
|
||||
}
|
||||
}
|
||||
return backProject;
|
||||
}
|
||||
|
||||
// add new methods
|
||||
std::vector<double> Histogram::getHistogramVector() {
|
||||
return p_bins;
|
||||
}
|
||||
|
||||
void Histogram::setHistogramVector(double *vector) {
|
||||
for (size_t i=0; i<p_bins.size(); i++) {
|
||||
p_bins[i] = vector[i];
|
||||
}
|
||||
}
|
||||
|
||||
//-------------------- SEGMENT CLASS --------------------
|
||||
std::pair<cv::Mat, cv::Mat> Segment::computePosteriors(
|
||||
std::vector<cv::Mat> &imgChannels,
|
||||
int x1, int y1, int x2, int y2,
|
||||
cv::Mat weights, cv::Mat fgPrior, cv::Mat bgPrior,
|
||||
const Histogram &fgHistPrior, int numBinsPerChannel)
|
||||
{
|
||||
//preprocess and normalize all data
|
||||
CV_Assert(imgChannels.size() > 0);
|
||||
|
||||
//fit target to the image
|
||||
x1 = std::min(std::max(x1, 0), imgChannels[0].cols-1);
|
||||
y1 = std::min(std::max(y1, 0), imgChannels[0].rows-1);
|
||||
x2 = std::max(std::min(x2, imgChannels[0].cols-1), 0);
|
||||
y2 = std::max(std::min(y2, imgChannels[0].rows-1), 0);
|
||||
|
||||
//enlarge bbox by 1/3 of its size for background area
|
||||
int offsetX = (x2-x1)/3;
|
||||
int offsetY = (y2-y1)/3;
|
||||
int outer_y1 = std::max(0, (int)(y1-offsetY));
|
||||
int outer_y2 = std::min(imgChannels[0].rows, (int)(y2+offsetY+1));
|
||||
int outer_x1 = std::max(0, (int)(x1-offsetX));
|
||||
int outer_x2 = std::min(imgChannels[0].cols, (int)(x2+offsetX+1));
|
||||
|
||||
//extract histogram from original data -> more pixels better representation of distr. by histograms
|
||||
Histogram hist_target =
|
||||
(fgHistPrior.m_numBinsPerDim == numBinsPerChannel && (size_t)fgHistPrior.m_numDim == imgChannels.size())
|
||||
? fgHistPrior : Histogram(static_cast<int>(imgChannels.size()), numBinsPerChannel);
|
||||
Histogram hist_background(static_cast<int>(imgChannels.size()), numBinsPerChannel);
|
||||
if (weights.cols == 0)
|
||||
hist_target.extractForegroundHistogram(imgChannels, cv::Mat(), false, x1, y1, x2, y2);
|
||||
else
|
||||
hist_target.extractForegroundHistogram(imgChannels, weights, true, x1, y1, x2, y2);
|
||||
hist_background.extractBackGroundHistogram(imgChannels, x1, y1, x2, y2,
|
||||
outer_x1, outer_y1, outer_x2, outer_y2);
|
||||
|
||||
//compute resize factor so that the max area is 1000 (=avg. size ~ 32x32)
|
||||
double factor = sqrt(1000.0/((x2-x1)*(y2-y1)));
|
||||
if (factor > 1)
|
||||
factor = 1.0;
|
||||
cv::Size newSize(cvFloor((x2-x1)*factor), cvFloor((y2-y1)*factor));
|
||||
|
||||
//rescale input data
|
||||
cv::Rect roiRect_inner = cv::Rect(x1, y1, x2-x1, y2-y1);
|
||||
std::vector<cv::Mat> imgChannelsROI_inner(imgChannels.size());
|
||||
for (size_t i = 0; i < imgChannels.size(); ++i)
|
||||
cv::resize(imgChannels[i](roiRect_inner), imgChannelsROI_inner[i], newSize);
|
||||
|
||||
//initialize priors if there is no external source and rescale
|
||||
cv::Mat fgPriorScaled;
|
||||
if (fgPrior.cols == 0)
|
||||
fgPriorScaled = 0.5*cv::Mat::ones(newSize, CV_64FC1);
|
||||
else
|
||||
cv::resize(fgPrior(roiRect_inner), fgPriorScaled, newSize);
|
||||
cv::Mat bgPriorScaled;
|
||||
if (bgPrior.cols == 0)
|
||||
bgPriorScaled = 0.5*cv::Mat::ones(newSize, CV_64FC1);
|
||||
else
|
||||
cv::resize(bgPrior(roiRect_inner), bgPriorScaled, newSize);
|
||||
|
||||
//backproject pixels likelihood
|
||||
cv::Mat foregroundLikelihood = hist_target.backProject(imgChannelsROI_inner).mul(fgPriorScaled);
|
||||
cv::Mat backgroundLikelihood = hist_background.backProject(imgChannelsROI_inner).mul(bgPriorScaled);
|
||||
|
||||
double p_b = std::sqrt((std::pow(outer_x2-outer_x1, 2) + std::pow(outer_y2-outer_y1, 2)) /
|
||||
(std::pow(x2-x1, 2) + std::pow(y2-y1, 2))) ;
|
||||
double p_o = 1./(p_b + 1);
|
||||
|
||||
//convert likelihoods to posterior prob. (Bayes rule)
|
||||
cv::Mat prob_o(newSize, foregroundLikelihood.type());
|
||||
prob_o = p_o*foregroundLikelihood / (p_o*foregroundLikelihood + p_b*backgroundLikelihood);
|
||||
cv::Mat prob_b = 1.0 - prob_o;
|
||||
|
||||
std::pair<cv::Mat, cv::Mat> sizedProbs = getRegularizedSegmentation(prob_o, prob_b, fgPriorScaled, bgPriorScaled);
|
||||
|
||||
//resize probs to original size
|
||||
std::pair<cv::Mat, cv::Mat> probs;
|
||||
cv::resize(sizedProbs.first, probs.first, cv::Size(roiRect_inner.width, roiRect_inner.height));
|
||||
cv::resize(sizedProbs.second, probs.second, cv::Size(roiRect_inner.width, roiRect_inner.height));
|
||||
|
||||
return probs;
|
||||
}
|
||||
|
||||
std::pair<cv::Mat, cv::Mat> Segment::computePosteriors2(
|
||||
std::vector<cv::Mat> &imgChannels, int x1, int y1, int x2, int y2, double p_b,
|
||||
cv::Mat fgPrior, cv::Mat bgPrior, Histogram hist_target, Histogram hist_background)
|
||||
{
|
||||
//preprocess and normalize all data
|
||||
CV_Assert(imgChannels.size() > 0);
|
||||
|
||||
//fit target to the image
|
||||
x1 = std::min(std::max(x1, 0), imgChannels[0].cols-1);
|
||||
y1 = std::min(std::max(y1, 0), imgChannels[0].rows-1);
|
||||
x2 = std::max(std::min(x2, imgChannels[0].cols-1), 0);
|
||||
y2 = std::max(std::min(y2, imgChannels[0].rows-1), 0);
|
||||
|
||||
// calculate width and height of the region
|
||||
int w = x2 - x1 + 1;
|
||||
int h = y2 - y1 + 1;
|
||||
w = std::min(std::max(w, 1), imgChannels[0].cols);
|
||||
h = std::min(std::max(h, 1), imgChannels[0].rows);
|
||||
|
||||
//double p_o = 1./(p_b + 1);
|
||||
double p_o = 1. - p_b;
|
||||
|
||||
//compute resize factor so that the max area is 1000 (=avg. size ~ 32x32)
|
||||
double factor = sqrt(1000.0/(w*h));
|
||||
if (factor > 1)
|
||||
factor = 1.0;
|
||||
cv::Size newSize(cvFloor(w*factor), cvFloor(h*factor));
|
||||
|
||||
//rescale input data
|
||||
cv::Rect roiRect_inner = cv::Rect(x1, y1, w, h);
|
||||
std::vector<cv::Mat> imgChannelsROI_inner(imgChannels.size());
|
||||
for (size_t i = 0; i < imgChannels.size(); ++i)
|
||||
cv::resize(imgChannels[i](roiRect_inner), imgChannelsROI_inner[i], newSize);
|
||||
|
||||
//initialize priors if there is no external source and rescale
|
||||
cv::Mat fgPriorScaled;
|
||||
if (fgPrior.cols == 0)
|
||||
fgPriorScaled = 0.5*cv::Mat::ones(newSize, CV_64FC1);
|
||||
else
|
||||
cv::resize(fgPrior(roiRect_inner), fgPriorScaled, newSize);
|
||||
cv::Mat bgPriorScaled;
|
||||
if (bgPrior.cols == 0)
|
||||
bgPriorScaled = 0.5*cv::Mat::ones(newSize, CV_64FC1);
|
||||
else
|
||||
cv::resize(bgPrior(roiRect_inner), bgPriorScaled, newSize);
|
||||
|
||||
//backproject pixels likelihood
|
||||
cv::Mat foregroundLikelihood = hist_target.backProject(imgChannelsROI_inner).mul(fgPriorScaled);
|
||||
cv::Mat backgroundLikelihood = hist_background.backProject(imgChannelsROI_inner).mul(bgPriorScaled);
|
||||
|
||||
//convert likelihoods to posterior prob. (Bayes rule)
|
||||
cv::Mat prob_o(newSize, foregroundLikelihood.type());
|
||||
prob_o = p_o*foregroundLikelihood / (p_o*foregroundLikelihood + p_b*backgroundLikelihood);
|
||||
cv::Mat prob_b = 1.0 - prob_o;
|
||||
|
||||
std::pair<cv::Mat, cv::Mat> sizedProbs = getRegularizedSegmentation(prob_o, prob_b,
|
||||
fgPriorScaled, bgPriorScaled);
|
||||
//std::pair<cv::Mat, cv::Mat> sizedProbs = std::pair<cv::Mat, cv::Mat>(prob_o, prob_b);
|
||||
|
||||
//resize probs to original size
|
||||
std::pair<cv::Mat, cv::Mat> probs;
|
||||
cv::resize(sizedProbs.first, probs.first, cv::Size(roiRect_inner.width, roiRect_inner.height));
|
||||
cv::resize(sizedProbs.second, probs.second, cv::Size(roiRect_inner.width, roiRect_inner.height));
|
||||
|
||||
return probs;
|
||||
}
|
||||
|
||||
std::pair<cv::Mat, cv::Mat> Segment::computePosteriors2(std::vector<cv::Mat> &imgChannels,
|
||||
cv::Mat fgPrior, cv::Mat bgPrior, Histogram hist_target, Histogram hist_background)
|
||||
{
|
||||
//preprocess and normalize all data
|
||||
CV_Assert(imgChannels.size() > 0);
|
||||
|
||||
//fit target to the image
|
||||
int x1 = 0;
|
||||
int y1 = 0;
|
||||
int x2 = imgChannels[0].cols-1;
|
||||
int y2 = imgChannels[0].rows-1;
|
||||
|
||||
//compute resize factor so that we control the max area ~32^2
|
||||
double factor = sqrt(1000./((x2-x1)*(y2-y1)));
|
||||
//double factor = 1;
|
||||
if (factor > 1)
|
||||
factor = 1.0;
|
||||
cv::Size newSize(cvFloor((x2-x1)*factor), cvFloor((y2-y1)*factor));
|
||||
|
||||
//rescale input data
|
||||
cv::Rect roiRect_inner = cv::Rect(x1, y1, x2-x1+1, y2-y1+1);
|
||||
std::vector<cv::Mat> imgChannelsROI_inner(imgChannels.size());
|
||||
for (size_t i = 0; i < imgChannels.size(); ++i)
|
||||
cv::resize(imgChannels[i](roiRect_inner), imgChannelsROI_inner[i], newSize);
|
||||
|
||||
//initialize priors if there is no external source and rescale
|
||||
cv::Mat fgPriorScaled;
|
    if (fgPrior.cols == 0)
        fgPriorScaled = 0.5*cv::Mat::ones(newSize, CV_64FC1);
    else
        cv::resize(fgPrior(roiRect_inner), fgPriorScaled, newSize);

    cv::Mat bgPriorScaled;
    if (bgPrior.cols == 0)
        bgPriorScaled = 0.5*cv::Mat::ones(newSize, CV_64FC1);
    else
        cv::resize(bgPrior(roiRect_inner), bgPriorScaled, newSize);

    // backproject the pixel likelihoods
    cv::Mat foregroundLikelihood = hist_target.backProject(imgChannelsROI_inner).mul(fgPriorScaled);
    cv::Mat backgroundLikelihood = hist_background.backProject(imgChannelsROI_inner).mul(bgPriorScaled);

    // prior for the posterior, relative to the number of pixels in bg and fg
    double p_b = 5./3.;
    double p_o = 1./(p_b + 1);

    // convert likelihoods to posterior probabilities (Bayes rule)
    cv::Mat prob_o(newSize, foregroundLikelihood.type());
    prob_o = p_o*foregroundLikelihood / (p_o*foregroundLikelihood + p_b*backgroundLikelihood);
    cv::Mat prob_b = 1.0 - prob_o;

    std::pair<cv::Mat, cv::Mat> sizedProbs = getRegularizedSegmentation(prob_o, prob_b, fgPriorScaled, bgPriorScaled);

    // resize the probabilities back to the original size
    std::pair<cv::Mat, cv::Mat> probs;
    cv::resize(sizedProbs.first, probs.first, cv::Size(roiRect_inner.width, roiRect_inner.height));
    cv::resize(sizedProbs.second, probs.second, cv::Size(roiRect_inner.width, roiRect_inner.height));

    return probs;
}

std::pair<cv::Mat, cv::Mat> Segment::getRegularizedSegmentation(
        cv::Mat &prob_o, cv::Mat &prob_b, cv::Mat & prior_o, cv::Mat & prior_b)
{
    int hsize = cvFloor(std::max(1.0, (double)cvFloor(static_cast<double>(prob_b.cols)*3./50. + 0.5)));
    int lambdaSize = hsize*2+1;

    // compute the Gaussian kernel
    cv::Mat lambda(lambdaSize, lambdaSize, CV_64FC1);
    double std2 = std::pow(hsize/3.0, 2);
    double sumLambda = 0.0;
    for (int y = -hsize; y < hsize + 1; ++y){
        double * lambdaPtr = lambda.ptr<double>(y+hsize);
        double tmp_y = y*y;
        for (int x = -hsize; x < hsize +1; ++x){
            double tmp_gauss = gaussian(x*x, tmp_y, std2);
            lambdaPtr[x+hsize] = tmp_gauss;
            sumLambda += tmp_gauss;
        }
    }
    sumLambda -= lambda.at<double>(hsize, hsize);
    // set the center of the kernel to 0
    lambda.at<double>(hsize, hsize) = 0.0;
    sumLambda = 1.0/sumLambda;
    // normalize the kernel so that it sums to 1
    lambda = lambda*sumLambda;

    // create the lambda2 kernel
    cv::Mat lambda2 = lambda.clone();
    lambda2.at<double>(hsize, hsize) = 1.0;

    double terminateThr = 1e-1;
    double logLike = std::numeric_limits<double>::max();
    int maxIter = 50;

    // return values
    cv::Mat Qsum_o(prior_o.rows, prior_o.cols, prior_o.type());
    cv::Mat Qsum_b(prior_o.rows, prior_o.cols, prior_o.type());

    // temporaries used by the algorithm
    cv::Mat Si_o(prior_o.rows, prior_o.cols, prior_o.type());
    cv::Mat Si_b(prior_o.rows, prior_o.cols, prior_o.type());
    cv::Mat Ssum_o(prior_o.rows, prior_o.cols, prior_o.type());
    cv::Mat Ssum_b(prior_o.rows, prior_o.cols, prior_o.type());
    cv::Mat Qi_o(prior_o.rows, prior_o.cols, prior_o.type());
    cv::Mat Qi_b(prior_o.rows, prior_o.cols, prior_o.type());
    cv::Mat logQo(prior_o.rows, prior_o.cols, prior_o.type());
    cv::Mat logQb(prior_o.rows, prior_o.cols, prior_o.type());

    int i;
    for (i = 0; i < maxIter; ++i){
        // follows the equations from Kristan et al., ACCV 2014,
        // "A graphical model for rapid obstacle image-map estimation from unmanned surface vehicles"
        cv::Mat P_Io = prior_o.mul(prob_o) + std::numeric_limits<double>::epsilon();
        cv::Mat P_Ib = prior_b.mul(prob_b) + std::numeric_limits<double>::epsilon();

        cv::filter2D(prior_o, Si_o, -1, lambda, cv::Point(-1, -1), 0, cv::BORDER_REFLECT);
        cv::filter2D(prior_b, Si_b, -1, lambda, cv::Point(-1, -1), 0, cv::BORDER_REFLECT);
        Si_o = Si_o.mul(prior_o);
        Si_b = Si_b.mul(prior_b);
        cv::Mat normSi = 1.0/(Si_o + Si_b);
        Si_o = Si_o.mul(normSi);
        Si_b = Si_b.mul(normSi);
        cv::filter2D(Si_o, Ssum_o, -1, lambda2, cv::Point(-1, -1), 0, cv::BORDER_REFLECT);
        cv::filter2D(Si_b, Ssum_b, -1, lambda2, cv::Point(-1, -1), 0, cv::BORDER_REFLECT);

        cv::filter2D(P_Io, Qi_o, -1, lambda, cv::Point(-1, -1), 0, cv::BORDER_REFLECT);
        cv::filter2D(P_Ib, Qi_b, -1, lambda, cv::Point(-1, -1), 0, cv::BORDER_REFLECT);
        Qi_o = Qi_o.mul(P_Io);
        Qi_b = Qi_b.mul(P_Ib);
        cv::Mat normQi = 1.0/(Qi_o + Qi_b);
        Qi_o = Qi_o.mul(normQi);
        Qi_b = Qi_b.mul(normQi);
        cv::filter2D(Qi_o, Qsum_o, -1, lambda2, cv::Point(-1, -1), 0, cv::BORDER_REFLECT);
        cv::filter2D(Qi_b, Qsum_b, -1, lambda2, cv::Point(-1, -1), 0, cv::BORDER_REFLECT);

        prior_o = (Qsum_o + Ssum_o)*0.25;
        prior_b = (Qsum_b + Ssum_b)*0.25;
        cv::Mat normPI = 1.0/(prior_o + prior_b);
        prior_o = prior_o.mul(normPI);
        prior_b = prior_b.mul(normPI);

        // converged?
        cv::log(Qsum_o, logQo);
        cv::log(Qsum_b, logQb);
        cv::Scalar mean = cv::sum(logQo+logQb);
        double logLikeNew = -mean.val[0]/(2*Qsum_o.rows*Qsum_o.cols);
        if (std::abs(logLike - logLikeNew) < terminateThr)
            break;
        logLike = logLikeNew;
    }
    return std::pair<cv::Mat, cv::Mat>(Qsum_o, Qsum_b);
}

} //cv namespace
//---------------------------------------------------------------------------------------------------------------------
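As a quick numeric illustration of the Bayes step above (the values are made up, not from the patch): with p_o = 3/8 and p_b = 5/3, a pixel with foreground likelihood 0.8 and background likelihood 0.2 gets prob_o = 0.375*0.8 / (0.375*0.8 + 1.667*0.2), which is about 0.47. In other words, the fixed priors deliberately weight the posterior towards the larger background region, so only pixels that match the foreground histogram much better than the background one end up above 0.5.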
61
modules/video/src/tracking/tracker_csrt_segmentation.hpp
Normal file
@ -0,0 +1,61 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_TRACKER_CSRT_SEGMENTATION
#define OPENCV_TRACKER_CSRT_SEGMENTATION

#include "opencv2/core/mat.hpp"

namespace cv
{
class Histogram
{
public:
    int m_numBinsPerDim;
    int m_numDim;

    Histogram() : m_numBinsPerDim(0), m_numDim(0) {}
    Histogram(int numDimensions, int numBinsPerDimension = 8);
    void extractForegroundHistogram(std::vector<cv::Mat> & imgChannels,
            cv::Mat weights, bool useMatWeights, int x1, int y1, int x2, int y2);
    void extractBackGroundHistogram(std::vector<cv::Mat> & imgChannels,
            int x1, int y1, int x2, int y2, int outer_x1, int outer_y1,
            int outer_x2, int outer_y2);
    cv::Mat backProject(std::vector<cv::Mat> & imgChannels);
    std::vector<double> getHistogramVector();
    void setHistogramVector(double *vector);

private:
    int p_size;
    std::vector<double> p_bins;
    std::vector<int> p_dimIdCoef;

    inline double kernelProfile_Epanechnikov(double x)
        { return (x <= 1) ? (2.0/CV_PI)*(1-x) : 0; }
};


class Segment
{
public:
    static std::pair<cv::Mat, cv::Mat> computePosteriors(std::vector<cv::Mat> & imgChannels,
            int x1, int y1, int x2, int y2, cv::Mat weights, cv::Mat fgPrior,
            cv::Mat bgPrior, const Histogram &fgHistPrior, int numBinsPerChannel = 16);
    static std::pair<cv::Mat, cv::Mat> computePosteriors2(std::vector<cv::Mat> & imgChannels,
            int x1, int y1, int x2, int y2, double p_b, cv::Mat fgPrior,
            cv::Mat bgPrior, Histogram hist_target, Histogram hist_background);
    static std::pair<cv::Mat, cv::Mat> computePosteriors2(std::vector<cv::Mat> &imgChannels,
            cv::Mat fgPrior, cv::Mat bgPrior, Histogram hist_target, Histogram hist_background);

private:
    static std::pair<cv::Mat, cv::Mat> getRegularizedSegmentation(cv::Mat & prob_o,
            cv::Mat & prob_b, cv::Mat &prior_o, cv::Mat &prior_b);

    inline static double gaussian(double x2, double y2, double std2){
        return exp(-(x2 + y2)/(2*std2))/(2*CV_PI*std2);
    }
};

}//cv namespace
#endif
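To make the relationship between Histogram and Segment concrete, here is a rough usage sketch. It is illustrative only and not code from this patch; the function name, the HSV patch, the rectangle coordinates and the bin counts are assumptions.

// Illustrative sketch: build fg/bg colour histograms for a target rectangle
// inside a patch and turn them into a per-pixel foreground probability map.
#include <opencv2/core.hpp>
#include "tracker_csrt_segmentation.hpp"   // private module header, assumed visible

static cv::Mat foreground_probability(const cv::Mat &patch_hsv,
                                      int x1, int y1, int x2, int y2)
{
    std::vector<cv::Mat> channels;
    cv::split(patch_hsv, channels);

    cv::Histogram hist_fg(3, 16), hist_bg(3, 16);   // 3 channels, 16 bins each
    hist_fg.extractForegroundHistogram(channels, cv::Mat(), false, x1, y1, x2, y2);
    hist_bg.extractBackGroundHistogram(channels, x1, y1, x2, y2,
                                       0, 0, patch_hsv.cols, patch_hsv.rows);

    // Empty priors fall back to a flat 0.5 prior inside computePosteriors2.
    std::pair<cv::Mat, cv::Mat> probs = cv::Segment::computePosteriors2(
            channels, cv::Mat(), cv::Mat(), hist_fg, hist_bg);
    return probs.first;   // foreground posterior per pixel
}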
563
modules/video/src/tracking/tracker_csrt_utils.cpp
Normal file
@ -0,0 +1,563 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "../precomp.hpp"

#include "tracker_csrt_utils.hpp"

namespace cv {

Mat circshift(Mat matrix, int dx, int dy)
{
    Mat matrix_out = matrix.clone();
    int idx_y = 0;
    int idx_x = 0;
    for(int i=0; i<matrix.rows; i++) {
        for(int j=0; j<matrix.cols; j++) {
            idx_y = modul(i+dy+1, matrix.rows);
            idx_x = modul(j+dx+1, matrix.cols);
            matrix_out.at<float>(idx_y, idx_x) = matrix.at<float>(i,j);
        }
    }
    return matrix_out;
}

Mat gaussian_shaped_labels(const float sigma, const int w, const int h)
{
    // create a 2D Gaussian peak, convert it to the Fourier domain and store it in yf
    Mat y = Mat::zeros(h, w, CV_32F);
    float w2 = static_cast<float>(cvFloor(w / 2));
    float h2 = static_cast<float>(cvFloor(h / 2));

    // calculate the value for each pixel separately
    for(int i=0; i<y.rows; i++) {
        for(int j=0; j<y.cols; j++) {
            y.at<float>(i,j) = (float)exp((-0.5 / pow(sigma, 2)) * (pow((i+1-h2), 2) + pow((j+1-w2), 2)));
        }
    }
    // wrap around with a circular shift
    y = circshift(y, -cvFloor(y.cols / 2), -cvFloor(y.rows / 2));
    Mat yf;
    dft(y, yf, DFT_COMPLEX_OUTPUT);
    return yf;
}

std::vector<Mat> fourier_transform_features(const std::vector<Mat> &M)
{
    std::vector<Mat> out(M.size());
    Mat channel;
    // iterate over the channels and convert them to the Fourier domain
    for(size_t k = 0; k < M.size(); k++) {
        M[k].convertTo(channel, CV_32F);
        dft(channel, channel, DFT_COMPLEX_OUTPUT);
        out[k] = (channel);
    }
    return out;
}

Mat divide_complex_matrices(const Mat &A, const Mat &B)
{
    std::vector<Mat> va, vb;
    split(A, va);
    split(B, vb);

    Mat a = va.at(0);
    Mat b = va.at(1);
    Mat c = vb.at(0);
    Mat d = vb.at(1);

    Mat div = c.mul(c) + d.mul(d);
    Mat real_part = (a.mul(c) + b.mul(d));
    Mat im_part = (b.mul(c) - a.mul(d));
    divide(real_part, div, real_part);
    divide(im_part, div, im_part);

    std::vector<Mat> tmp(2);
    tmp[0] = real_part;
    tmp[1] = im_part;
    Mat res;
    merge(tmp, res);
    return res;
}

Mat get_subwindow(
    const Mat &image,
    const Point2f center,
    const int w,
    const int h,
    Rect *valid_pixels)
{
    int startx = cvFloor(center.x) + 1 - (cvFloor(w/2));
    int starty = cvFloor(center.y) + 1 - (cvFloor(h/2));
    Rect roi(startx, starty, w, h);
    int padding_left = 0, padding_right = 0, padding_top = 0, padding_bottom = 0;
    if(roi.x < 0) {
        padding_left = -roi.x;
        roi.x = 0;
    }
    if(roi.y < 0) {
        padding_top = -roi.y;
        roi.y = 0;
    }
    roi.width -= padding_left;
    roi.height -= padding_top;
    if(roi.x + roi.width >= image.cols) {
        padding_right = roi.x + roi.width - image.cols;
        roi.width = image.cols - roi.x;
    }
    if(roi.y + roi.height >= image.rows) {
        padding_bottom = roi.y + roi.height - image.rows;
        roi.height = image.rows - roi.y;
    }
    Mat subwin = image(roi).clone();
    copyMakeBorder(subwin, subwin, padding_top, padding_bottom, padding_left, padding_right, BORDER_REPLICATE);

    if(valid_pixels != NULL) {
        *valid_pixels = Rect(padding_left, padding_top, roi.width, roi.height);
    }
    return subwin;
}

float subpixel_peak(const Mat &response, const std::string &s, const Point2f &p)
{
    int i_p0, i_p_l, i_p_r;     // indexes in response
    float p0, p_l, p_r;         // values in response

    if(s.compare("vertical") == 0) {
        // neighbouring rows
        i_p0 = cvRound(p.y);
        i_p_l = modul(cvRound(p.y) - 1, response.rows);
        i_p_r = modul(cvRound(p.y) + 1, response.rows);
        int px = static_cast<int>(p.x);
        p0 = response.at<float>(i_p0, px);
        p_l = response.at<float>(i_p_l, px);
        p_r = response.at<float>(i_p_r, px);
    } else if(s.compare("horizontal") == 0) {
        // neighbouring cols
        i_p0 = cvRound(p.x);
        i_p_l = modul(cvRound(p.x) - 1, response.cols);
        i_p_r = modul(cvRound(p.x) + 1, response.cols);
        int py = static_cast<int>(p.y);
        p0 = response.at<float>(py, i_p0);
        p_l = response.at<float>(py, i_p_l);
        p_r = response.at<float>(py, i_p_r);
    } else {
        std::cout << "Warning: unknown subpixel peak direction!" << std::endl;
        return 0;
    }
    float delta = 0.5f * (p_r - p_l) / (2*p0 - p_r - p_l);
    if(!std::isfinite(delta)) {
        delta = 0;
    }

    return delta;
}

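subpixel_peak fits a parabola through the peak and its two neighbours and returns the offset of the parabola's vertex. A quick check with made-up values: for p_l = 0.2, p0 = 1.0 and p_r = 0.6 the offset is 0.5*(0.6 - 0.2)/(2*1.0 - 0.6 - 0.2) = 0.2/1.2, roughly 0.17 of a cell towards the larger neighbour, which is the sub-pixel refinement the tracker applies to the response map maximum.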
inline float chebpoly(const int n, const float x)
{
    float res;
    if (fabs(x) <= 1)
        res = cos(n*acos(x));
    else
        res = cosh(n*acosh(x));
    return res;
}

static Mat chebwin(int N, const float atten)
{
    Mat out(N, 1, CV_32FC1);
    int nn, i;
    float M, n, sum = 0, max = 0;
    float tg = static_cast<float>(pow(10, atten/20.0f));  /* 1/r term [2], 10^gamma [2] */
    float x0 = cosh((1.0f/(N-1))*acosh(tg));
    M = (N-1)/2.0f;
    if(N%2==0)
        M = M + 0.5f;  /* handle even length windows */
    for(nn=0; nn<(N/2+1); nn++) {
        n = nn-M;
        sum = 0;
        for(i=1; i<=M; i++){
            sum += chebpoly(N-1, x0*static_cast<float>(cos(CV_PI*i/N))) *
                    static_cast<float>(cos(2.0f*n*CV_PI*i/N));
        }
        out.at<float>(nn,0) = tg + 2*sum;
        out.at<float>(N-nn-1,0) = out.at<float>(nn,0);
        if(out.at<float>(nn,0) > max)
            max = out.at<float>(nn,0);
    }
    for(nn=0; nn<N; nn++)
        out.at<float>(nn,0) /= max;  /* normalize everything */

    return out;
}


static double modified_bessel(int order, double x)
{
    // sum m=0:inf 1/(m! * Gamma(m + order + 1)) * (x/2)^(2m + order)
    const double eps = 1e-13;
    double result = 0;
    double m = 0;
    double gamma = 1.0;
    for(int i = 2; i <= order; ++i)
        gamma *= i;
    double term = pow(x,order) / (pow(2,order) * gamma);

    while(term > eps * result) {
        result += term;
        // calculate the next term of the series
        ++m;
        term *= (x*x) / (4*m*(m+order));
    }
    return result;
}

Mat get_hann_win(Size sz)
{
    Mat hann_rows = Mat::ones(sz.height, 1, CV_32F);
    Mat hann_cols = Mat::ones(1, sz.width, CV_32F);
    int NN = sz.height - 1;
    if(NN != 0) {
        for (int i = 0; i < hann_rows.rows; ++i) {
            hann_rows.at<float>(i,0) = (float)(1.0/2.0 * (1.0 - cos(2*CV_PI*i/NN)));
        }
    }
    NN = sz.width - 1;
    if(NN != 0) {
        for (int i = 0; i < hann_cols.cols; ++i) {
            hann_cols.at<float>(0,i) = (float)(1.0/2.0 * (1.0 - cos(2*CV_PI*i/NN)));
        }
    }
    return hann_rows * hann_cols;
}

Mat get_kaiser_win(Size sz, float alpha)
{
    Mat kaiser_rows = Mat::ones(sz.height, 1, CV_32F);
    Mat kaiser_cols = Mat::ones(1, sz.width, CV_32F);

    int N = sz.height - 1;
    double shape = alpha;
    double den = 1.0 / modified_bessel(0, shape);

    for(int n = 0; n <= N; ++n) {
        double K = (2.0 * n * 1.0/N) - 1.0;
        double x = sqrt(1.0 - (K * K));
        kaiser_rows.at<float>(n,0) = static_cast<float>(modified_bessel(0, shape * x) * den);
    }

    N = sz.width - 1;
    for(int n = 0; n <= N; ++n) {
        double K = (2.0 * n * 1.0/N) - 1.0;
        double x = sqrt(1.0 - (K * K));
        kaiser_cols.at<float>(0,n) = static_cast<float>(modified_bessel(0, shape * x) * den);
    }

    return kaiser_rows * kaiser_cols;
}

Mat get_chebyshev_win(Size sz, float attenuation)
{
    Mat cheb_rows = chebwin(sz.height, attenuation);
    Mat cheb_cols = chebwin(sz.width, attenuation).t();
    return cheb_rows * cheb_cols;
}

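All three window helpers build a separable 2D window as the outer product of a column profile and a row profile, so the returned Mat has exactly the requested size. A small hedged usage sketch follows; the feature channel and its size are illustrative assumptions, not part of the patch.

// Illustrative only: weight a CV_32F feature channel by a Hann window of the
// same size, the usual way correlation-filter trackers suppress boundary effects.
#include <opencv2/core.hpp>
#include "tracker_csrt_utils.hpp"   // private module header, assumed visible

static cv::Mat window_a_channel()
{
    cv::Mat feature = cv::Mat::ones(48, 64, CV_32F);      // placeholder channel
    cv::Mat window  = cv::get_hann_win(cv::Size(64, 48)); // 48x64, CV_32F
    return feature.mul(window);                           // element-wise weighting
}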
static void computeHOG32D(const Mat &imageM, Mat &featM, const int sbin, const int pad_x, const int pad_y)
{
    const int dimHOG = 32;
    CV_Assert(pad_x >= 0);
    CV_Assert(pad_y >= 0);
    CV_Assert(imageM.channels() == 3);
    CV_Assert(imageM.depth() == CV_64F);

    // epsilon to avoid division by zero
    const double eps = 0.0001;
    // number of orientations
    const int numOrient = 18;
    // unit vectors to compute gradient orientation
    const double uu[9] = {1.000, 0.9397, 0.7660, 0.5000, 0.1736, -0.1736, -0.5000, -0.7660, -0.9397};
    const double vv[9] = {0.000, 0.3420, 0.6428, 0.8660, 0.9848, 0.9848, 0.8660, 0.6428, 0.3420};

    // image size
    const Size imageSize = imageM.size();
    // block size
    // int bW = cvRound((double)imageSize.width/(double)sbin);
    // int bH = cvRound((double)imageSize.height/(double)sbin);
    int bW = cvFloor((double)imageSize.width/(double)sbin);
    int bH = cvFloor((double)imageSize.height/(double)sbin);
    const Size blockSize(bW, bH);
    // size of the HOG feature map
    int oW = max(blockSize.width-2, 0) + 2*pad_x;
    int oH = max(blockSize.height-2, 0) + 2*pad_y;
    Size outSize = Size(oW, oH);
    // size of the visible region
    const Size visible = blockSize*sbin;

    // initialize the histogram, norm and output feature matrices
    Mat histM = Mat::zeros(Size(blockSize.width*numOrient, blockSize.height), CV_64F);
    Mat normM = Mat::zeros(Size(blockSize.width, blockSize.height), CV_64F);
    featM = Mat::zeros(Size(outSize.width*dimHOG, outSize.height), CV_64F);

    // get the stride of each matrix
    const size_t imStride = imageM.step1();
    const size_t histStride = histM.step1();
    const size_t normStride = normM.step1();
    const size_t featStride = featM.step1();

    // calculate the zero offset
    const double* im = imageM.ptr<double>(0);
    double* const hist = histM.ptr<double>(0);
    double* const norm = normM.ptr<double>(0);
    double* const feat = featM.ptr<double>(0);

    for (int y = 1; y < visible.height - 1; y++)
    {
        for (int x = 1; x < visible.width - 1; x++)
        {
            // OpenCV uses an interleaved format: BGR-BGR-BGR
            const double* s = im + 3*min(x, imageM.cols-2) + min(y, imageM.rows-2)*imStride;

            // blue image channel
            double dyb = *(s+imStride) - *(s-imStride);
            double dxb = *(s+3) - *(s-3);
            double vb = dxb*dxb + dyb*dyb;

            // green image channel
            s += 1;
            double dyg = *(s+imStride) - *(s-imStride);
            double dxg = *(s+3) - *(s-3);
            double vg = dxg*dxg + dyg*dyg;

            // red image channel
            s += 1;
            double dy = *(s+imStride) - *(s-imStride);
            double dx = *(s+3) - *(s-3);
            double v = dx*dx + dy*dy;

            // pick the channel with the strongest gradient
            if (vg > v) { v = vg; dx = dxg; dy = dyg; }
            if (vb > v) { v = vb; dx = dxb; dy = dyb; }

            // snap to one of the 18 orientations
            double best_dot = 0;
            int best_o = 0;
            for (int o = 0; o < (int)numOrient/2; o++)
            {
                double dot = uu[o]*dx + vv[o]*dy;
                if (dot > best_dot)
                {
                    best_dot = dot;
                    best_o = o;
                }
                else if (-dot > best_dot)
                {
                    best_dot = -dot;
                    best_o = o + (int)(numOrient/2);
                }
            }

            // add to the 4 histograms around the pixel using bilinear interpolation
            double yp = ((double)y+0.5)/(double)sbin - 0.5;
            double xp = ((double)x+0.5)/(double)sbin - 0.5;
            int iyp = (int)cvFloor(yp);
            int ixp = (int)cvFloor(xp);
            double vy0 = yp - iyp;
            double vx0 = xp - ixp;
            double vy1 = 1.0 - vy0;
            double vx1 = 1.0 - vx0;
            v = sqrt(v);

            // fill the value into the 4 neighborhood cells
            if (iyp >= 0 && ixp >= 0)
                *(hist + iyp*histStride + ixp*numOrient + best_o) += vy1*vx1*v;

            if (iyp >= 0 && ixp+1 < blockSize.width)
                *(hist + iyp*histStride + (ixp+1)*numOrient + best_o) += vx0*vy1*v;

            if (iyp+1 < blockSize.height && ixp >= 0)
                *(hist + (iyp+1)*histStride + ixp*numOrient + best_o) += vy0*vx1*v;

            if (iyp+1 < blockSize.height && ixp+1 < blockSize.width)
                *(hist + (iyp+1)*histStride + (ixp+1)*numOrient + best_o) += vy0*vx0*v;

        } // for x
    } // for y

    // compute the energy in each block by summing over orientations
    for (int y = 0; y < blockSize.height; y++)
    {
        const double* src = hist + y*histStride;
        double* dst = norm + y*normStride;
        double const* const dst_end = dst + blockSize.width;
        // for each cell
        while (dst < dst_end)
        {
            *dst = 0;
            for (int o = 0; o < (int)(numOrient/2); o++)
            {
                *dst += (*src + *(src + numOrient/2))*
                        (*src + *(src + numOrient/2));
                src++;
            }
            dst++;
            src += numOrient/2;
        }
    }

    // compute the features
    for (int y = pad_y; y < outSize.height - pad_y; y++)
    {
        for (int x = pad_x; x < outSize.width - pad_x; x++)
        {
            double* dst = feat + y*featStride + x*dimHOG;
            double* p, n1, n2, n3, n4;
            const double* src;

            p = norm + (y - pad_y + 1)*normStride + (x - pad_x + 1);
            n1 = 1.0f / sqrt(*p + *(p + 1) + *(p + normStride) + *(p + normStride + 1) + eps);
            p = norm + (y - pad_y)*normStride + (x - pad_x + 1);
            n2 = 1.0f / sqrt(*p + *(p + 1) + *(p + normStride) + *(p + normStride + 1) + eps);
            p = norm + (y - pad_y + 1)*normStride + x - pad_x;
            n3 = 1.0f / sqrt(*p + *(p + 1) + *(p + normStride) + *(p + normStride + 1) + eps);
            p = norm + (y - pad_y)*normStride + x - pad_x;
            n4 = 1.0f / sqrt(*p + *(p + 1) + *(p + normStride) + *(p + normStride + 1) + eps);

            double t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0;

            // contrast-sensitive features
            src = hist + (y - pad_y + 1)*histStride + (x - pad_x + 1)*numOrient;
            for (int o = 0; o < numOrient; o++)
            {
                double val = *src;
                double h1 = min(val*n1, 0.2);
                double h2 = min(val*n2, 0.2);
                double h3 = min(val*n3, 0.2);
                double h4 = min(val*n4, 0.2);
                *(dst++) = 0.5 * (h1 + h2 + h3 + h4);

                src++;
                t1 += h1;
                t2 += h2;
                t3 += h3;
                t4 += h4;
            }

            // contrast-insensitive features
            src = hist + (y - pad_y + 1)*histStride + (x - pad_x + 1)*numOrient;
            for (int o = 0; o < numOrient/2; o++)
            {
                double sum = *src + *(src + numOrient/2);
                double h1 = min(sum * n1, 0.2);
                double h2 = min(sum * n2, 0.2);
                double h3 = min(sum * n3, 0.2);
                double h4 = min(sum * n4, 0.2);
                *(dst++) = 0.5 * (h1 + h2 + h3 + h4);
                src++;
            }

            // texture features
            *(dst++) = 0.2357 * t1;
            *(dst++) = 0.2357 * t2;
            *(dst++) = 0.2357 * t3;
            *(dst++) = 0.2357 * t4;
            // truncation feature
            *dst = 0;
        } // for x
    } // for y
    // truncation features on the padded border
    for (int m = 0; m < featM.rows; m++)
    {
        for (int n = 0; n < featM.cols; n += dimHOG)
        {
            if (m > pad_y - 1 && m < featM.rows - pad_y && n > pad_x*dimHOG - 1 && n < featM.cols - pad_x*dimHOG)
                continue;

            featM.at<double>(m, n + dimHOG - 1) = 1;
        } // for n
    } // for m
}

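The 32 values per HOG cell break down as 18 contrast-sensitive orientation bins, 9 contrast-insensitive bins, 4 texture (gradient-energy) features and 1 truncation feature, i.e. 18 + 9 + 4 + 1 = 32, which is why dimHOG is fixed to 32 above.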
std::vector<Mat> get_features_hog(const Mat &im, const int bin_size)
{
    Mat hogmatrix;
    Mat im_;
    im.convertTo(im_, CV_64FC3, 1.0/255.0);
    computeHOG32D(im_, hogmatrix, bin_size, 1, 1);
    hogmatrix.convertTo(hogmatrix, CV_32F);
    Size hog_size = im.size();
    hog_size.width /= bin_size;
    hog_size.height /= bin_size;
    Mat hogc(hog_size, CV_32FC(32), hogmatrix.data);
    std::vector<Mat> features;
    split(hogc, features);
    return features;
}

// std::vector<Mat> get_features_cn(const Mat &ppatch_data, const Size &output_size) {
//     Mat patch_data = ppatch_data.clone();
//     Vec3b & pixel = patch_data.at<Vec3b>(0,0);
//     unsigned index;

//     Mat cnFeatures = Mat::zeros(patch_data.rows, patch_data.cols, CV_32FC(10));

//     for(int i=0; i<patch_data.rows; i++){
//         for(int j=0; j<patch_data.cols; j++){
//             pixel = patch_data.at<Vec3b>(i,j);
//             index = (unsigned)(cvFloor((float)pixel[2]/8) + 32*cvFloor((float)pixel[1]/8) + 32*32*cvFloor((float)pixel[0]/8));

//             //copy the values
//             for(int k=0; k<10; k++){
//                 cnFeatures.at<Vec<float,10> >(i,j)[k] = (float)ColorNames[index][k];
//             }
//         }
//     }
//     std::vector<Mat> result;
//     split(cnFeatures, result);
//     for (size_t i = 0; i < result.size(); i++) {
//         if (output_size.width > 0 && output_size.height > 0) {
//             resize(result.at(i), result.at(i), output_size, INTER_CUBIC);
//         }
//     }
//     return result;
// }

std::vector<Mat> get_features_rgb(const Mat &patch, const Size &output_size)
{
    std::vector<Mat> channels;
    split(patch, channels);
    for(size_t k=0; k<channels.size(); k++) {
        channels[k].convertTo(channels[k], CV_32F, 1.0/255.0, -0.5);
        channels[k] = channels[k] - mean(channels[k])[0];
        resize(channels[k], channels[k], output_size, INTER_CUBIC);
    }
    return channels;
}

double get_max(const Mat &m)
{
    double val;
    minMaxLoc(m, NULL, &val, NULL, NULL);
    return val;
}

double get_min(const Mat &m)
{
    double val;
    minMaxLoc(m, &val, NULL, NULL, NULL);
    return val;
}

Mat bgr2hsv(const Mat &img)
{
    Mat hsv_img;
    cvtColor(img, hsv_img, COLOR_BGR2HSV);
    std::vector<Mat> hsv_img_channels;
    split(hsv_img, hsv_img_channels);
    hsv_img_channels.at(0).convertTo(hsv_img_channels.at(0), CV_8UC1, 255.0 / 180.0);
    merge(hsv_img_channels, hsv_img);
    return hsv_img;
}

} //cv namespace
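A hedged usage sketch of the HOG helper above; the image path and the cell size of 4 are placeholders for illustration, not part of the patch.

// Illustrative only: 32-channel HOG features for a BGR patch with 4x4 cells.
#include <opencv2/imgcodecs.hpp>
#include <vector>
#include "tracker_csrt_utils.hpp"   // private module header, assumed visible

static void hog_example()
{
    cv::Mat patch = cv::imread("patch.png");                   // placeholder CV_8UC3 input
    std::vector<cv::Mat> hog = cv::get_features_hog(patch, 4);
    // hog.size() == 32; each channel is (patch.rows/4) x (patch.cols/4), CV_32F
}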
54
modules/video/src/tracking/tracker_csrt_utils.hpp
Normal file
@ -0,0 +1,54 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_TRACKER_CSRT_UTILS
#define OPENCV_TRACKER_CSRT_UTILS

#include <fstream>
#include <iostream>
#include <vector>
#include <algorithm>
#include <iterator>

#include "opencv2/core/mat.hpp"

namespace cv
{

inline int modul(int a, int b)
{
    // modulo that also handles negative inputs; the result is always in [0, b)
    return ((a % b) + b) % b;
}

inline double kernel_epan(double x)
{
    return (x <= 1) ? (2.0/3.14)*(1-x) : 0;
}

Mat circshift(Mat matrix, int dx, int dy);
Mat gaussian_shaped_labels(const float sigma, const int w, const int h);
std::vector<Mat> fourier_transform_features(const std::vector<Mat> &M);
Mat divide_complex_matrices(const Mat &A, const Mat &B);
Mat get_subwindow(const Mat &image, const Point2f center,
        const int w, const int h, Rect *valid_pixels = NULL);

float subpixel_peak(const Mat &response, const std::string &s, const Point2f &p);
double get_max(const Mat &m);
double get_min(const Mat &m);

Mat get_hann_win(Size sz);
Mat get_kaiser_win(Size sz, float alpha);
Mat get_chebyshev_win(Size sz, float attenuation);

std::vector<Mat> get_features_rgb(const Mat &patch, const Size &output_size);
std::vector<Mat> get_features_hog(const Mat &im, const int bin_size);
// std::vector<Mat> get_features_cn(const Mat &im, const Size &output_size);

Mat bgr2hsv(const Mat &img);

} //cv namespace

#endif
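A quick illustration of why modul is defined this way (values made up): plain C++ gives -1 % 10 == -1, whereas modul(-1, 10) == 9 and modul(12, 10) == 2, so circshift and subpixel_peak always receive a valid, wrapped index even when the shifted coordinate falls outside the matrix.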
@ -38,12 +38,24 @@ TEST_P(DistanceAndOverlap, MIL)
    test.run(numFramesLimit);
}

TEST_P(DistanceAndOverlap, CSRT)
{
    TrackerTest<Tracker, Rect> test(TrackerCSRT::create(), dataset, 22, .7f, NoTransform);
    test.run();
}

TEST_P(DistanceAndOverlap, Shifted_Data_MIL)
{
    TrackerTest<Tracker, Rect> test(TrackerMIL::create(), dataset, 30, .6f, CenterShiftLeft);
    test.run(numFramesLimit);
}

TEST_P(DistanceAndOverlap, Shifted_Data_CSRT)
{
    TrackerTest<Tracker, Rect> test(TrackerCSRT::create(), dataset, 13, .69f, CenterShiftLeft);
    test.run();
}

/***************************************************************************************/
// Tests with scaled initial window

@ -53,6 +65,12 @@ TEST_P(DistanceAndOverlap, Scaled_Data_MIL)
    test.run(numFramesLimit);
}

TEST_P(DistanceAndOverlap, Scaled_Data_CSRT)
{
    TrackerTest<Tracker, Rect> test(TrackerCSRT::create(), dataset, 22, 0.69f, Scale_1_1, 1);
    test.run();
}

TEST_P(DistanceAndOverlap, GOTURN)
{
    std::string model = cvtest::findDataFile("dnn/gsoc2016-goturn/goturn.prototxt");
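For context, a minimal usage sketch of the tracker exercised by these tests. The video path, the initial box and the exact include location are assumptions for illustration, not part of this patch.

#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/videoio.hpp>
#include <opencv2/video/tracking.hpp>   // assumed header for TrackerCSRT after the move

int main()
{
    cv::VideoCapture cap("video.mp4");            // placeholder input sequence
    cv::Mat frame;
    cap >> frame;

    cv::TrackerCSRT::Params params;               // defaults are reasonable
    cv::Ptr<cv::TrackerCSRT> tracker = cv::TrackerCSRT::create(params);

    cv::Rect bbox(100, 100, 80, 60);              // placeholder initial box
    tracker->init(frame, bbox);

    while (cap.read(frame))
    {
        if (tracker->update(frame, bbox))         // returns false once the target is lost
            cv::rectangle(frame, bbox, cv::Scalar(0, 255, 0), 2);
    }
    return 0;
}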