refactored gpu::matchTemplate (converted it into Algorithm)

2025-01-18 06:03:15 +08:00 · 2013-04-30 17:27:06 +04:00 · 2013-04-30 17:27:06 +04:00 · de56163f97
commit de56163f97
parent 1fcc8074bd
5 changed files with 392 additions and 156 deletions
--- a/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
+++ b/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
@ -424,20 +424,24 @@ CV_EXPORTS void meanShiftSegmentation(InputArray src, OutputArray dst, int sp, i

 /////////////////////////// Match Template ////////////////////////////

-struct CV_EXPORTS MatchTemplateBuf
+//! computes the proximity map for the raster template and the image where the template is searched for
+class CV_EXPORTS TemplateMatching : public Algorithm
 {
-    Size user_block_size;
-    GpuMat imagef, templf;
-    std::vector<GpuMat> images;
-    std::vector<GpuMat> image_sums;
-    std::vector<GpuMat> image_sqsums;
+public:
+    virtual void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null()) = 0;
 };

-//! computes the proximity map for the raster template and the image where the template is searched for
-CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream &stream = Stream::Null());
+CV_EXPORTS Ptr<TemplateMatching> createTemplateMatching(int srcType, int method, Size user_block_size = Size());

-//! computes the proximity map for the raster template and the image where the template is searched for
-CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, MatchTemplateBuf &buf, Stream& stream = Stream::Null());
+// obsolete
+
+__OPENCV_GPUIMGPROC_DEPR_BEFORE__ void matchTemplate(InputArray image, InputArray templ, OutputArray result,
+                                                     int method, Stream& stream = Stream::Null()) __OPENCV_GPUIMGPROC_DEPR_AFTER__;
+
+inline void matchTemplate(InputArray image, InputArray templ, OutputArray result, int method, Stream& stream)
+{
+    gpu::createTemplateMatching(image.type(), method)->match(image, templ, result, stream);
+}

 ////////////////////////// Bilateral Filter ///////////////////////////

--- a/modules/gpuimgproc/perf/perf_match_template.cpp
+++ b/modules/gpuimgproc/perf/perf_match_template.cpp
@ -76,7 +76,9 @@ PERF_TEST_P(Sz_TemplateSz_Cn_Method, MatchTemplate8U,
        const cv::gpu::GpuMat d_templ(templ);
        cv::gpu::GpuMat dst;

-        TEST_CYCLE() cv::gpu::matchTemplate(d_image, d_templ, dst, method);
+        cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(image.type(), method);
+
+        TEST_CYCLE() alg->match(d_image, d_templ, dst);

        GPU_SANITY_CHECK(dst, 1e-5, ERROR_RELATIVE);
    }
@ -116,7 +118,9 @@ PERF_TEST_P(Sz_TemplateSz_Cn_Method, MatchTemplate32F,
        const cv::gpu::GpuMat d_templ(templ);
        cv::gpu::GpuMat dst;

-        TEST_CYCLE() cv::gpu::matchTemplate(d_image, d_templ, dst, method);
+        cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(image.type(), method);
+
+        TEST_CYCLE() alg->match(d_image, d_templ, dst);

        GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
    }
--- a/modules/gpuimgproc/src/match_template.cpp
+++ b/modules/gpuimgproc/src/match_template.cpp
@ -47,7 +47,7 @@ using namespace cv::gpu;

 #if !defined (HAVE_CUDA) || !defined (HAVE_OPENCV_GPUARITHM) || defined (CUDA_DISABLER)

-void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
+Ptr<gpu::TemplateMatching> cv::gpu::createTemplateMatching(int, int, Size) { throw_no_cuda(); return Ptr<gpu::TemplateMatching>(); }

 #else

@ -137,11 +137,8 @@ namespace cv { namespace gpu { namespace cudev
    }
 }}}

-using namespace ::cv::gpu::cudev::match_template;
-
 namespace
 {
-
    // Evaluates optimal template's area threshold. If
    // template's area is less  than the threshold, we use naive match
    // template version, otherwise FFT-based (if available)
@ -149,135 +146,317 @@ namespace
    {
        switch (method)
        {
-        case cv::TM_CCORR:
+        case TM_CCORR:
            if (depth == CV_32F) return 250;
            if (depth == CV_8U) return 300;
            break;
-        case cv::TM_SQDIFF:
+
+        case TM_SQDIFF:
            if (depth == CV_8U) return 300;
            break;
        }
-        CV_Error(cv::Error::StsBadArg, "getTemplateThreshold: unsupported match template mode");
+
+        CV_Error(Error::StsBadArg, "unsupported match template mode");
        return 0;
    }

+    ///////////////////////////////////////////////////////////////
+    // CCORR_32F

-    void matchTemplate_CCORR_32F(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    class Match_CCORR_32F : public TemplateMatching
    {
-        result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
-        if (templ.size().area() < getTemplateThreshold(cv::TM_CCORR, CV_32F))
+    public:
+        explicit Match_CCORR_32F(Size user_block_size);
+
+        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
+
+    private:
+        Ptr<gpu::Convolution> conv_;
+        GpuMat result_;
+    };
+
+    Match_CCORR_32F::Match_CCORR_32F(Size user_block_size)
+    {
+        conv_ = gpu::createConvolution(user_block_size);
+    }
+
+    void Match_CCORR_32F::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& _stream)
+    {
+        using namespace cv::gpu::cudev::match_template;
+
+        GpuMat image = _image.getGpuMat();
+        GpuMat templ = _templ.getGpuMat();
+
+        CV_Assert( image.depth() == CV_32F );
+        CV_Assert( image.type() == templ.type() );
+        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32FC1);
+        GpuMat result = _result.getGpuMat();
+
+        if (templ.size().area() < getTemplateThreshold(TM_CCORR, CV_32F))
        {
-            matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
+            matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), stream);
            return;
        }

-        Ptr<gpu::Convolution> conv = gpu::createConvolution(buf.user_block_size);
-
        if (image.channels() == 1)
        {
-            conv->convolve(image.reshape(1), templ.reshape(1), result, true, stream);
+            conv_->convolve(image.reshape(1), templ.reshape(1), result, true, _stream);
        }
        else
        {
-            GpuMat result_;
-            conv->convolve(image.reshape(1), templ.reshape(1), result_, true, stream);
-            extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream));
+            conv_->convolve(image.reshape(1), templ.reshape(1), result_, true, _stream);
+            extractFirstChannel_32F(result_, result, image.channels(), stream);
        }
    }

+    ///////////////////////////////////////////////////////////////
+    // CCORR_8U

-    void matchTemplate_CCORR_8U(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    class Match_CCORR_8U : public TemplateMatching
    {
-        if (templ.size().area() < getTemplateThreshold(cv::TM_CCORR, CV_8U))
+    public:
+        explicit Match_CCORR_8U(Size user_block_size) : match32F_(user_block_size)
        {
-            result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+        }
+
+        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
+
+    private:
+        GpuMat imagef_, templf_;
+        Match_CCORR_32F match32F_;
+    };
+
+    void Match_CCORR_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
+    {
+        using namespace cv::gpu::cudev::match_template;
+
+        GpuMat image = _image.getGpuMat();
+        GpuMat templ = _templ.getGpuMat();
+
+        CV_Assert( image.depth() == CV_8U );
+        CV_Assert( image.type() == templ.type() );
+        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
+
+        if (templ.size().area() < getTemplateThreshold(TM_CCORR, CV_8U))
+        {
+            _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32FC1);
+            GpuMat result = _result.getGpuMat();
+
            matchTemplateNaive_CCORR_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
            return;
        }

-        image.convertTo(buf.imagef, CV_32F, stream);
-        templ.convertTo(buf.templf, CV_32F, stream);
+        image.convertTo(imagef_, CV_32F, stream);
+        templ.convertTo(templf_, CV_32F, stream);

-        matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream);
+        match32F_.match(imagef_, templf_, _result, stream);
    }

+    ///////////////////////////////////////////////////////////////
+    // CCORR_NORMED_8U

-    void matchTemplate_CCORR_NORMED_8U(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    class Match_CCORR_NORMED_8U : public TemplateMatching
    {
-        matchTemplate_CCORR_8U(image, templ, result, buf, stream);
+    public:
+        explicit Match_CCORR_NORMED_8U(Size user_block_size) : match_CCORR_(user_block_size)
+        {
+        }

-        buf.image_sqsums.resize(1);
-        gpu::sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream);
+        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());

-        unsigned long long templ_sqsum = (unsigned long long)gpu::sqrSum(templ.reshape(1))[0];
-        normalize_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
+    private:
+        Match_CCORR_8U match_CCORR_;
+        GpuMat image_sqsums_;
+        GpuMat intBuffer_;
+    };
+
+    void Match_CCORR_NORMED_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
+    {
+        using namespace cv::gpu::cudev::match_template;
+
+        GpuMat image = _image.getGpuMat();
+        GpuMat templ = _templ.getGpuMat();
+
+        CV_Assert( image.depth() == CV_8U );
+        CV_Assert( image.type() == templ.type() );
+        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
+
+        match_CCORR_.match(image, templ, _result, stream);
+        GpuMat result = _result.getGpuMat();
+
+        gpu::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
+
+        unsigned long long templ_sqsum = (unsigned long long) gpu::sqrSum(templ.reshape(1))[0];
+
+        normalize_8U(templ.cols, templ.rows, image_sqsums_, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
    }

+    ///////////////////////////////////////////////////////////////
+    // SQDIFF_32F

-    void matchTemplate_SQDIFF_32F(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    class Match_SQDIFF_32F : public TemplateMatching
    {
-        (void)buf;
-        result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+    public:
+        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
+    };
+
+    void Match_SQDIFF_32F::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
+    {
+        using namespace cv::gpu::cudev::match_template;
+
+        GpuMat image = _image.getGpuMat();
+        GpuMat templ = _templ.getGpuMat();
+
+        CV_Assert( image.depth() == CV_32F );
+        CV_Assert( image.type() == templ.type() );
+        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
+
+        _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32FC1);
+        GpuMat result = _result.getGpuMat();
+
        matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
    }

+    ///////////////////////////////////////////////////////////////
+    // SQDIFF_8U

-    void matchTemplate_SQDIFF_8U(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    class Match_SQDIFF_8U : public TemplateMatching
    {
-        if (templ.size().area() < getTemplateThreshold(cv::TM_SQDIFF, CV_8U))
+    public:
+        explicit Match_SQDIFF_8U(Size user_block_size) : match_CCORR_(user_block_size)
        {
-            result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+        }
+
+        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
+
+    private:
+        GpuMat image_sqsums_;
+        GpuMat intBuffer_;
+        Match_CCORR_8U match_CCORR_;
+    };
+
+    void Match_SQDIFF_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
+    {
+        using namespace cv::gpu::cudev::match_template;
+
+        GpuMat image = _image.getGpuMat();
+        GpuMat templ = _templ.getGpuMat();
+
+        CV_Assert( image.depth() == CV_8U );
+        CV_Assert( image.type() == templ.type() );
+        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
+
+        if (templ.size().area() < getTemplateThreshold(TM_SQDIFF, CV_8U))
+        {
+            _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32FC1);
+            GpuMat result = _result.getGpuMat();
+
            matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
            return;
        }

-        buf.image_sqsums.resize(1);
-        gpu::sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream);
+        gpu::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);

-        unsigned long long templ_sqsum = (unsigned long long)gpu::sqrSum(templ.reshape(1))[0];
+        unsigned long long templ_sqsum = (unsigned long long) gpu::sqrSum(templ.reshape(1))[0];

-        matchTemplate_CCORR_8U(image, templ, result, buf, stream);
-        matchTemplatePrepared_SQDIFF_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
+        match_CCORR_.match(image, templ, _result, stream);
+        GpuMat result = _result.getGpuMat();
+
+        matchTemplatePrepared_SQDIFF_8U(templ.cols, templ.rows, image_sqsums_, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
    }

+    ///////////////////////////////////////////////////////////////
+    // SQDIFF_NORMED_8U

-    void matchTemplate_SQDIFF_NORMED_8U(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    class Match_SQDIFF_NORMED_8U : public TemplateMatching
    {
-        buf.image_sqsums.resize(1);
-        gpu::sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream);
+    public:
+        explicit Match_SQDIFF_NORMED_8U(Size user_block_size) : match_CCORR_(user_block_size)
+        {
+        }

-        unsigned long long templ_sqsum = (unsigned long long)gpu::sqrSum(templ.reshape(1))[0];
+        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());

-        matchTemplate_CCORR_8U(image, templ, result, buf, stream);
-        matchTemplatePrepared_SQDIFF_NORMED_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
+    private:
+        GpuMat image_sqsums_;
+        GpuMat intBuffer_;
+        Match_CCORR_8U match_CCORR_;
+    };
+
+    void Match_SQDIFF_NORMED_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
+    {
+        using namespace cv::gpu::cudev::match_template;
+
+        GpuMat image = _image.getGpuMat();
+        GpuMat templ = _templ.getGpuMat();
+
+        CV_Assert( image.depth() == CV_8U );
+        CV_Assert( image.type() == templ.type() );
+        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
+
+        gpu::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
+
+        unsigned long long templ_sqsum = (unsigned long long) gpu::sqrSum(templ.reshape(1))[0];
+
+        match_CCORR_.match(image, templ, _result, stream);
+        GpuMat result = _result.getGpuMat();
+
+        matchTemplatePrepared_SQDIFF_NORMED_8U(templ.cols, templ.rows, image_sqsums_, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
    }

+    ///////////////////////////////////////////////////////////////
+    // CCOFF_8U

-    void matchTemplate_CCOFF_8U(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    class Match_CCOEFF_8U : public TemplateMatching
    {
-        matchTemplate_CCORR_8U(image, templ, result, buf, stream);
+    public:
+        explicit Match_CCOEFF_8U(Size user_block_size) : match_CCORR_(user_block_size)
+        {
+        }
+
+        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
+
+    private:
+        GpuMat intBuffer_;
+        std::vector<GpuMat> images_;
+        std::vector<GpuMat> image_sums_;
+        Match_CCORR_8U match_CCORR_;
+    };
+
+    void Match_CCOEFF_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
+    {
+        using namespace cv::gpu::cudev::match_template;
+
+        GpuMat image = _image.getGpuMat();
+        GpuMat templ = _templ.getGpuMat();
+
+        CV_Assert( image.depth() == CV_8U );
+        CV_Assert( image.type() == templ.type() );
+        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
+
+        match_CCORR_.match(image, templ, _result, stream);
+        GpuMat result = _result.getGpuMat();

        if (image.channels() == 1)
        {
-            buf.image_sums.resize(1);
-            gpu::integral(image, buf.image_sums[0], stream);
+            image_sums_.resize(1);
+            gpu::integral(image, image_sums_[0], intBuffer_, stream);

-            unsigned int templ_sum = (unsigned int)gpu::sum(templ)[0];
-            matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, buf.image_sums[0], templ_sum, result, StreamAccessor::getStream(stream));
+            unsigned int templ_sum = (unsigned int) gpu::sum(templ)[0];
+
+            matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, image_sums_[0], templ_sum, result, StreamAccessor::getStream(stream));
        }
        else
        {
-            gpu::split(image, buf.images);
-            buf.image_sums.resize(buf.images.size());
+            gpu::split(image, images_);
+
+            image_sums_.resize(images_.size());
            for (int i = 0; i < image.channels(); ++i)
-                gpu::integral(buf.images[i], buf.image_sums[i], stream);
+                gpu::integral(images_[i], image_sums_[i], intBuffer_, stream);

            Scalar templ_sum = gpu::sum(templ);

@ -285,60 +464,91 @@ namespace
            {
            case 2:
                matchTemplatePrepared_CCOFF_8UC2(
-                        templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1],
-                        (unsigned int)templ_sum[0], (unsigned int)templ_sum[1],
+                        templ.cols, templ.rows, image_sums_[0], image_sums_[1],
+                        (unsigned int) templ_sum[0], (unsigned int) templ_sum[1],
                        result, StreamAccessor::getStream(stream));
                break;
            case 3:
                matchTemplatePrepared_CCOFF_8UC3(
-                        templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1], buf.image_sums[2],
-                        (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],
+                        templ.cols, templ.rows, image_sums_[0], image_sums_[1], image_sums_[2],
+                        (unsigned int) templ_sum[0], (unsigned int) templ_sum[1], (unsigned int) templ_sum[2],
                        result, StreamAccessor::getStream(stream));
                break;
            case 4:
                matchTemplatePrepared_CCOFF_8UC4(
-                        templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1], buf.image_sums[2], buf.image_sums[3],
-                        (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],
-                        (unsigned int)templ_sum[3], result, StreamAccessor::getStream(stream));
+                        templ.cols, templ.rows, image_sums_[0], image_sums_[1], image_sums_[2], image_sums_[3],
+                        (unsigned int) templ_sum[0], (unsigned int) templ_sum[1], (unsigned int) templ_sum[2], (unsigned int) templ_sum[3],
+                        result, StreamAccessor::getStream(stream));
                break;
            default:
-                CV_Error(cv::Error::StsBadArg, "matchTemplate: unsupported number of channels");
+                CV_Error(Error::StsBadArg, "unsupported number of channels");
            }
        }
    }

+    ///////////////////////////////////////////////////////////////
+    // CCOFF_NORMED_8U

-    void matchTemplate_CCOFF_NORMED_8U(
-            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
+    class Match_CCOEFF_NORMED_8U : public TemplateMatching
    {
-        image.convertTo(buf.imagef, CV_32F, stream);
-        templ.convertTo(buf.templf, CV_32F, stream);
+    public:
+        explicit Match_CCOEFF_NORMED_8U(Size user_block_size) : match_CCORR_32F_(user_block_size)
+        {
+        }

-        matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream);
+        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
+
+    private:
+        GpuMat imagef_, templf_;
+        Match_CCORR_32F match_CCORR_32F_;
+        GpuMat intBuffer_;
+        std::vector<GpuMat> images_;
+        std::vector<GpuMat> image_sums_;
+        std::vector<GpuMat> image_sqsums_;
+    };
+
+    void Match_CCOEFF_NORMED_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
+    {
+        using namespace cv::gpu::cudev::match_template;
+
+        GpuMat image = _image.getGpuMat();
+        GpuMat templ = _templ.getGpuMat();
+
+        CV_Assert( image.depth() == CV_8U );
+        CV_Assert( image.type() == templ.type() );
+        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
+
+        image.convertTo(imagef_, CV_32F, stream);
+        templ.convertTo(templf_, CV_32F, stream);
+
+        match_CCORR_32F_.match(imagef_, templf_, _result, stream);
+        GpuMat result = _result.getGpuMat();

        if (image.channels() == 1)
        {
-            buf.image_sums.resize(1);
-            gpu::integral(image, buf.image_sums[0], stream);
-            buf.image_sqsums.resize(1);
-            gpu::sqrIntegral(image, buf.image_sqsums[0], stream);
+            image_sums_.resize(1);
+            gpu::integral(image, image_sums_[0], intBuffer_, stream);

-            unsigned int templ_sum = (unsigned int)gpu::sum(templ)[0];
-            unsigned long long templ_sqsum = (unsigned long long)gpu::sqrSum(templ)[0];
+            image_sqsums_.resize(1);
+            gpu::sqrIntegral(image, image_sqsums_[0], intBuffer_, stream);
+
+            unsigned int templ_sum = (unsigned int) gpu::sum(templ)[0];
+            unsigned long long templ_sqsum = (unsigned long long) gpu::sqrSum(templ)[0];

            matchTemplatePrepared_CCOFF_NORMED_8U(
-                    templ.cols, templ.rows, buf.image_sums[0], buf.image_sqsums[0],
+                    templ.cols, templ.rows, image_sums_[0], image_sqsums_[0],
                    templ_sum, templ_sqsum, result, StreamAccessor::getStream(stream));
        }
        else
        {
-            gpu::split(image, buf.images);
-            buf.image_sums.resize(buf.images.size());
-            buf.image_sqsums.resize(buf.images.size());
+            gpu::split(image, images_);
+
+            image_sums_.resize(images_.size());
+            image_sqsums_.resize(images_.size());
            for (int i = 0; i < image.channels(); ++i)
            {
-                gpu::integral(buf.images[i], buf.image_sums[i], stream);
-                gpu::sqrIntegral(buf.images[i], buf.image_sqsums[i], stream);
+                gpu::integral(images_[i], image_sums_[i], intBuffer_, stream);
+                gpu::sqrIntegral(images_[i], image_sqsums_[i], intBuffer_, stream);
            }

            Scalar templ_sum = gpu::sum(templ);
@ -349,8 +559,8 @@ namespace
            case 2:
                matchTemplatePrepared_CCOFF_NORMED_8UC2(
                        templ.cols, templ.rows,
-                        buf.image_sums[0], buf.image_sqsums[0],
-                        buf.image_sums[1], buf.image_sqsums[1],
+                        image_sums_[0], image_sqsums_[0],
+                        image_sums_[1], image_sqsums_[1],
                        (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
                        (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
                        result, StreamAccessor::getStream(stream));
@ -358,9 +568,9 @@ namespace
            case 3:
                matchTemplatePrepared_CCOFF_NORMED_8UC3(
                        templ.cols, templ.rows,
-                        buf.image_sums[0], buf.image_sqsums[0],
-                        buf.image_sums[1], buf.image_sqsums[1],
-                        buf.image_sums[2], buf.image_sqsums[2],
+                        image_sums_[0], image_sqsums_[0],
+                        image_sums_[1], image_sqsums_[1],
+                        image_sums_[2], image_sqsums_[2],
                        (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
                        (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
                        (unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
@ -369,10 +579,10 @@ namespace
            case 4:
                matchTemplatePrepared_CCOFF_NORMED_8UC4(
                        templ.cols, templ.rows,
-                        buf.image_sums[0], buf.image_sqsums[0],
-                        buf.image_sums[1], buf.image_sqsums[1],
-                        buf.image_sums[2], buf.image_sqsums[2],
-                        buf.image_sums[3], buf.image_sqsums[3],
+                        image_sums_[0], image_sqsums_[0],
+                        image_sums_[1], image_sqsums_[1],
+                        image_sums_[2], image_sqsums_[2],
+                        image_sums_[3], image_sqsums_[3],
                        (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
                        (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
                        (unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
@ -380,46 +590,60 @@ namespace
                        result, StreamAccessor::getStream(stream));
                break;
            default:
-                CV_Error(cv::Error::StsBadArg, "matchTemplate: unsupported number of channels");
+                CV_Error(Error::StsBadArg, "unsupported number of channels");
            }
        }
    }
 }

-
-void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream)
+Ptr<gpu::TemplateMatching> cv::gpu::createTemplateMatching(int srcType, int method, Size user_block_size)
 {
-    MatchTemplateBuf buf;
-    matchTemplate(image, templ, result, method, buf, stream);
-}
+    const int sdepth = CV_MAT_DEPTH(srcType);

+    CV_Assert( sdepth == CV_8U || sdepth == CV_32F );

-void cv::gpu::matchTemplate(
-        const GpuMat& image, const GpuMat& templ, GpuMat& result, int method,
-        MatchTemplateBuf &buf, Stream& stream)
-{
-    CV_Assert(image.type() == templ.type());
-    CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
-
-    typedef void (*Caller)(const GpuMat&, const GpuMat&, GpuMat&, MatchTemplateBuf&, Stream& stream);
-
-    static const Caller callers8U[] = { ::matchTemplate_SQDIFF_8U, ::matchTemplate_SQDIFF_NORMED_8U,
-                                        ::matchTemplate_CCORR_8U, ::matchTemplate_CCORR_NORMED_8U,
-                                        ::matchTemplate_CCOFF_8U, ::matchTemplate_CCOFF_NORMED_8U };
-    static const Caller callers32F[] = { ::matchTemplate_SQDIFF_32F, 0,
-                                         ::matchTemplate_CCORR_32F, 0, 0, 0 };
-
-    const Caller* callers = 0;
-    switch (image.depth())
+    if (sdepth == CV_32F)
    {
-        case CV_8U: callers = callers8U; break;
-        case CV_32F: callers = callers32F; break;
-        default: CV_Error(cv::Error::StsBadArg, "matchTemplate: unsupported data type");
-    }
+        switch (method)
+        {
+        case TM_SQDIFF:
+            return new Match_SQDIFF_32F;

-    Caller caller = callers[method];
-    CV_Assert(caller);
-    caller(image, templ, result, buf, stream);
+        case TM_CCORR:
+            return new Match_CCORR_32F(user_block_size);
+
+        default:
+            CV_Error( Error::StsBadFlag, "Unsopported method" );
+            return Ptr<gpu::TemplateMatching>();
+        }
+    }
+    else
+    {
+        switch (method)
+        {
+        case TM_SQDIFF:
+            return new Match_SQDIFF_8U(user_block_size);
+
+        case TM_SQDIFF_NORMED:
+            return new Match_SQDIFF_NORMED_8U(user_block_size);
+
+        case TM_CCORR:
+            return new Match_CCORR_8U(user_block_size);
+
+        case TM_CCORR_NORMED:
+            return new Match_CCORR_NORMED_8U(user_block_size);
+
+        case TM_CCOEFF:
+            return new Match_CCOEFF_8U(user_block_size);
+
+        case TM_CCOEFF_NORMED:
+            return new Match_CCOEFF_NORMED_8U(user_block_size);
+
+        default:
+            CV_Error( Error::StsBadFlag, "Unsopported method" );
+            return Ptr<gpu::TemplateMatching>();
+        }
+    }
 }

 #endif
--- a/modules/gpuimgproc/test/test_match_template.cpp
+++ b/modules/gpuimgproc/test/test_match_template.cpp
@ -82,8 +82,10 @@ GPU_TEST_P(MatchTemplate8U, Accuracy)
    cv::Mat image = randomMat(size, CV_MAKETYPE(CV_8U, cn));
    cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_8U, cn));

+    cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(image.type(), method);
+
    cv::gpu::GpuMat dst;
-    cv::gpu::matchTemplate(loadMat(image), loadMat(templ), dst, method);
+    alg->match(loadMat(image), loadMat(templ), dst);

    cv::Mat dst_gold;
    cv::matchTemplate(image, templ, dst_gold, method);
@ -128,8 +130,10 @@ GPU_TEST_P(MatchTemplate32F, Regression)
    cv::Mat image = randomMat(size, CV_MAKETYPE(CV_32F, cn));
    cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_32F, cn));

+    cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(image.type(), method);
+
    cv::gpu::GpuMat dst;
-    cv::gpu::matchTemplate(loadMat(image), loadMat(templ), dst, method);
+    alg->match(loadMat(image), loadMat(templ), dst);

    cv::Mat dst_gold;
    cv::matchTemplate(image, templ, dst_gold, method);
@ -169,8 +173,10 @@ GPU_TEST_P(MatchTemplateBlackSource, Accuracy)
    cv::Mat pattern = readImage("matchtemplate/cat.png");
    ASSERT_FALSE(pattern.empty());

+    cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(image.type(), method);
+
    cv::gpu::GpuMat d_dst;
-    cv::gpu::matchTemplate(loadMat(image), loadMat(pattern), d_dst, method);
+    alg->match(loadMat(image), loadMat(pattern), d_dst);

    cv::Mat dst(d_dst);

@ -214,8 +220,10 @@ GPU_TEST_P(MatchTemplate_CCOEF_NORMED, Accuracy)
    cv::Mat pattern = readImage(patternName);
    ASSERT_FALSE(pattern.empty());

+    cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(image.type(), cv::TM_CCOEFF_NORMED);
+
    cv::gpu::GpuMat d_dst;
-    cv::gpu::matchTemplate(loadMat(image), loadMat(pattern), d_dst, cv::TM_CCOEFF_NORMED);
+    alg->match(loadMat(image), loadMat(pattern), d_dst);

    cv::Mat dst(d_dst);

@ -263,8 +271,10 @@ GPU_TEST_P(MatchTemplate_CanFindBigTemplate, SQDIFF_NORMED)
    cv::Mat templ = readImage("matchtemplate/template.png");
    ASSERT_FALSE(templ.empty());

+    cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(scene.type(), cv::TM_SQDIFF_NORMED);
+
    cv::gpu::GpuMat d_result;
-    cv::gpu::matchTemplate(loadMat(scene), loadMat(templ), d_result, cv::TM_SQDIFF_NORMED);
+    alg->match(loadMat(scene), loadMat(templ), d_result);

    cv::Mat result(d_result);

@ -286,8 +296,10 @@ GPU_TEST_P(MatchTemplate_CanFindBigTemplate, SQDIFF)
    cv::Mat templ = readImage("matchtemplate/template.png");
    ASSERT_FALSE(templ.empty());

+    cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(scene.type(), cv::TM_SQDIFF);
+
    cv::gpu::GpuMat d_result;
-    cv::gpu::matchTemplate(loadMat(scene), loadMat(templ), d_result, cv::TM_SQDIFF);
+    alg->match(loadMat(scene), loadMat(templ), d_result);

    cv::Mat result(d_result);

--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@ -17,24 +17,16 @@
 using namespace std;
 using namespace cv;

-static void InitMatchTemplate()
-{
-    Mat src; gen(src, 500, 500, CV_32F, 0, 1);
-    Mat templ; gen(templ, 500, 500, CV_32F, 0, 1);
-    gpu::GpuMat d_src(src), d_templ(templ), d_dst;
-    gpu::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
-}
-

 TEST(matchTemplate)
 {
-    InitMatchTemplate();
-
    Mat src, templ, dst;
    gen(src, 3000, 3000, CV_32F, 0, 1);

    gpu::GpuMat d_src(src), d_templ, d_dst;

+    Ptr<gpu::TemplateMatching> alg = gpu::createTemplateMatching(src.type(), TM_CCORR);
+
    for (int templ_size = 5; templ_size < 200; templ_size *= 5)
    {
        SUBTEST << src.cols << 'x' << src.rows << ", 32FC1" << ", templ " << templ_size << 'x' << templ_size << ", CCORR";
@ -47,10 +39,10 @@ TEST(matchTemplate)
        CPU_OFF;

        d_templ.upload(templ);
-        gpu::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
+        alg->match(d_src, d_templ, d_dst);

        GPU_ON;
-        gpu::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
+        alg->match(d_src, d_templ, d_dst);
        GPU_OFF;
    }
 }