From e746b3e8ae5e982e2ae71b8167b4478bf722e82e Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Mon, 1 Aug 2011 08:15:31 +0000
Subject: [PATCH] added buffered versions of pyrDown and pyrUp

added stream support to downsample, upsample, pyrUp and pyrDown
---
 modules/gpu/include/opencv2/gpu/gpu.hpp | 415 +++++++------
 modules/gpu/src/cuda/imgproc.cu         |  64 +-
 modules/gpu/src/imgproc_gpu.cpp         |  90 ++-
 modules/gpu/test/test_features2d.cpp    | 775 ------------------------
 samples/gpu/performance/tests.cpp       |  48 ++
 5 files changed, 377 insertions(+), 1015 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 393611e7fe..0d7b3f8cef 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -441,6 +441,191 @@ namespace cv
         explicit Stream(Impl* impl);
     };
+
+
+    //////////////////////////////// Filter Engine ////////////////////////////////
+
+    /*!
+    The Base Class for 1D or Row-wise Filters
+
+    This is the base class for linear or non-linear filters that process 1D data.
+    In particular, such filters are used for the "horizontal" filtering parts in separable filters.
+    */
+    class CV_EXPORTS BaseRowFilter_GPU
+    {
+    public:
+        BaseRowFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}
+        virtual ~BaseRowFilter_GPU() {}
+        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
+        int ksize, anchor;
+    };
+
+    /*!
+    The Base Class for Column-wise Filters
+
+    This is the base class for linear or non-linear filters that process columns of 2D arrays.
+    Such filters are used for the "vertical" filtering parts in separable filters.
+    */
+    class CV_EXPORTS BaseColumnFilter_GPU
+    {
+    public:
+        BaseColumnFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}
+        virtual ~BaseColumnFilter_GPU() {}
+        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
+        int ksize, anchor;
+    };
+
+    /*!
+    The Base Class for Non-Separable 2D Filters.
+
+    This is the base class for linear or non-linear 2D filters.
+    */
+    class CV_EXPORTS BaseFilter_GPU
+    {
+    public:
+        BaseFilter_GPU(const Size& ksize_, const Point& anchor_) : ksize(ksize_), anchor(anchor_) {}
+        virtual ~BaseFilter_GPU() {}
+        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
+        Size ksize;
+        Point anchor;
+    };
+
+    /*!
+    The Base Class for Filter Engine.
+
+    The class can be used to apply an arbitrary filtering operation to an image.
+    It contains all the necessary intermediate buffers.
+    */
+    class CV_EXPORTS FilterEngine_GPU
+    {
+    public:
+        virtual ~FilterEngine_GPU() {}
+
+        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0;
+    };
+
+    //! returns the non-separable filter engine with the specified filter
+    CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType);
+
+    //! returns the separable filter engine with the specified filters
+    CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
+        const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType);
+
+    //! returns horizontal 1D box filter
+    //! supports only CV_8UC1 source type and CV_32FC1 sum type
+    CV_EXPORTS Ptr<BaseRowFilter_GPU> getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor = -1);
+
+    //! returns vertical 1D box filter
+    //! supports only CV_8UC1 sum type and CV_32FC1 dst type
+    CV_EXPORTS Ptr<BaseColumnFilter_GPU> getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor = -1);
+
+    //! returns 2D box filter
+    //! supports CV_8UC1 and CV_8UC4 source type, dst type must be the same as source type
+    CV_EXPORTS Ptr<BaseFilter_GPU> getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1, -1));
+
+    //! returns box filter engine
+    CV_EXPORTS Ptr<FilterEngine_GPU> createBoxFilter_GPU(int srcType, int dstType, const Size& ksize,
+        const Point& anchor = Point(-1,-1));
+
+    //! returns 2D morphological filter
+    //! only MORPH_ERODE and MORPH_DILATE are supported
+    //! supports CV_8UC1 and CV_8UC4 types
+    //! kernel must have CV_8UC1 type, one row, and cols == ksize.width * ksize.height
+    CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize,
+        Point anchor=Point(-1,-1));
+
+    //! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.
+    CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel,
+        const Point& anchor = Point(-1,-1), int iterations = 1);
+
+    //! returns 2D filter with the specified kernel
+    //! supports CV_8UC1 and CV_8UC4 types
+    CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Size& ksize,
+        Point anchor = Point(-1, -1));
+
+    //! returns the non-separable linear filter engine
+    CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel,
+        const Point& anchor = Point(-1,-1));
+
+    //! returns the primitive row filter with the specified kernel.
+    //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source types.
+    //! there are two versions of the algorithm: NPP and OpenCV.
+    //! the NPP version is used when srcType == CV_8UC1 or srcType == CV_8UC4 and bufType == srcType,
+    //! otherwise the OpenCV version is used.
+    //! NPP supports only the BORDER_CONSTANT border type.
+    //! the OpenCV version supports only CV_32F buffer depth and
+    //! the BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
+    CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel,
+        int anchor = -1, int borderType = BORDER_CONSTANT);
+
+    //! returns the primitive column filter with the specified kernel.
+    //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst types.
+    //! there are two versions of the algorithm: NPP and OpenCV.
+    //! the NPP version is used when dstType == CV_8UC1 or dstType == CV_8UC4 and bufType == dstType,
+    //! otherwise the OpenCV version is used.
+    //! NPP supports only the BORDER_CONSTANT border type.
+    //! the OpenCV version supports only CV_32F buffer depth and
+    //! the BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
+    CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel,
+        int anchor = -1, int borderType = BORDER_CONSTANT);
+
+    //! returns the separable linear filter engine
+    CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
+        const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,
+        int columnBorderType = -1);
+
+    //! returns filter engine for the generalized Sobel operator
+    CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize,
+        int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
+
+    //! returns the Gaussian filter engine
+    CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0,
+        int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
+
+    //! returns maximum filter
+    CV_EXPORTS Ptr<BaseFilter_GPU> getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
+
+    //! returns minimum filter
+    CV_EXPORTS Ptr<BaseFilter_GPU> getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
+
+    //! smooths the image using the normalized box filter
+    //! supports CV_8UC1, CV_8UC4 types
+    CV_EXPORTS void boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null());
+
+    //! a synonym for normalized box filter
+    static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null()) { boxFilter(src, dst, -1, ksize, anchor, stream); }
+
+    //! erodes the image (applies the local minimum operator)
+    CV_EXPORTS void erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
+
+    //! dilates the image (applies the local maximum operator)
+    CV_EXPORTS void dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
+
+    //! applies an advanced morphological operation to the image
+    CV_EXPORTS void morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
+
+    //! applies non-separable 2D linear filter to the image
+    CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), Stream& stream = Stream::Null());
+
+    //! applies separable 2D linear filter to the image
+    CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
+        Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
+
+    //! applies generalized Sobel operator to the image
+    CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1,
+        int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
+
+    //! applies the vertical or horizontal Scharr operator to the image
+    CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1,
+        int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
+
+    //! smooths the image using a Gaussian filter
+    CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0,
+        int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
+
+    //! applies Laplacian operator to the image
+    //! supports only ksize = 1 and ksize = 3
+    CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, Stream& stream = Stream::Null());

     ////////////////////////////// Arithmetics ///////////////////////////////////
@@ -739,16 +924,54 @@
     CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method);

     //! downsamples image
-    CV_EXPORTS void downsample(const GpuMat& src, GpuMat& dst);
+    CV_EXPORTS void downsample(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());

     //! upsamples image
-    CV_EXPORTS void upsample(const GpuMat& src, GpuMat &dst);
+    CV_EXPORTS void upsample(const GpuMat& src, GpuMat &dst, Stream& stream = Stream::Null());

     //! smooths the source image and downsamples it
-    CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst);
+    CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
+
+    struct CV_EXPORTS PyrDownBuf;
+
+    CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, PyrDownBuf& buf, Stream& stream = Stream::Null());
+
+    struct CV_EXPORTS PyrDownBuf
+    {
+        PyrDownBuf() : image_type(-1) {}
+        PyrDownBuf(Size image_size, int image_type_) : image_type(-1) { create(image_size, image_type_); }
+        void create(Size image_size, int image_type_);
+
+    private:
+        friend void pyrDown(const GpuMat&, GpuMat&, PyrDownBuf&, Stream& stream);
+
+        static Mat ker;
+        GpuMat buf;
+        Ptr<FilterEngine_GPU> filter;
+        int image_type;
+    };

     //! upsamples the source image and then smooths it
-    CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst);
+    CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
+
+    struct CV_EXPORTS PyrUpBuf;
+
+    CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst, PyrUpBuf& buf, Stream& stream = Stream::Null());
+
+    struct CV_EXPORTS PyrUpBuf
+    {
+        PyrUpBuf() : image_type(-1) {}
+        PyrUpBuf(Size image_size, int image_type_) : image_type(-1) { create(image_size, image_type_); }
+        void create(Size image_size, int image_type_);
+
+    private:
+        friend void pyrUp(const GpuMat&, GpuMat&, PyrUpBuf&, Stream& stream);
+
+        static Mat ker;
+        GpuMat buf;
+        Ptr<FilterEngine_GPU> filter;
+        int image_type;
+    };

     //! performs linear blending of two images
     //! to avoid accuracy errors sum of weights shouldn't be very close to zero
@@ -835,190 +1058,6 @@
         int num_iters=100, float max_dist=8.0, int min_inlier_count=100, vector<int>* inliers=NULL);

-    //////////////////////////////// Filter Engine ////////////////////////////////
-
-    /*!
-    The Base Class for 1D or Row-wise Filters
-
-    This is the base class for linear or non-linear filters that process 1D data.
-    In particular, such filters are used for the "horizontal" filtering parts in separable filters.
-    */
-    class CV_EXPORTS BaseRowFilter_GPU
-    {
-    public:
-        BaseRowFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}
-        virtual ~BaseRowFilter_GPU() {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-        int ksize, anchor;
-    };
-
-    /*!
-    The Base Class for Column-wise Filters
-
-    This is the base class for linear or non-linear filters that process columns of 2D arrays.
-    Such filters are used for the "vertical" filtering parts in separable filters.
-    */
-    class CV_EXPORTS BaseColumnFilter_GPU
-    {
-    public:
-        BaseColumnFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}
-        virtual ~BaseColumnFilter_GPU() {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-        int ksize, anchor;
-    };
-
-    /*!
-    The Base Class for Non-Separable 2D Filters.
-
-    This is the base class for linear or non-linear 2D filters.
- */ - class CV_EXPORTS BaseFilter_GPU - { - public: - BaseFilter_GPU(const Size& ksize_, const Point& anchor_) : ksize(ksize_), anchor(anchor_) {} - virtual ~BaseFilter_GPU() {} - virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0; - Size ksize; - Point anchor; - }; - - /*! - The Base Class for Filter Engine. - - The class can be used to apply an arbitrary filtering operation to an image. - It contains all the necessary intermediate buffers. - */ - class CV_EXPORTS FilterEngine_GPU - { - public: - virtual ~FilterEngine_GPU() {} - - virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0; - }; - - //! returns the non-separable filter engine with the specified filter - CV_EXPORTS Ptr createFilter2D_GPU(const Ptr& filter2D, int srcType, int dstType); - - //! returns the separable filter engine with the specified filters - CV_EXPORTS Ptr createSeparableFilter_GPU(const Ptr& rowFilter, - const Ptr& columnFilter, int srcType, int bufType, int dstType); - - //! returns horizontal 1D box filter - //! supports only CV_8UC1 source type and CV_32FC1 sum type - CV_EXPORTS Ptr getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor = -1); - - //! returns vertical 1D box filter - //! supports only CV_8UC1 sum type and CV_32FC1 dst type - CV_EXPORTS Ptr getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor = -1); - - //! returns 2D box filter - //! supports CV_8UC1 and CV_8UC4 source type, dst type must be the same as source type - CV_EXPORTS Ptr getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1, -1)); - - //! returns box filter engine - CV_EXPORTS Ptr createBoxFilter_GPU(int srcType, int dstType, const Size& ksize, - const Point& anchor = Point(-1,-1)); - - //! returns 2D morphological filter - //! only MORPH_ERODE and MORPH_DILATE are supported - //! supports CV_8UC1 and CV_8UC4 types - //! kernel must have CV_8UC1 type, one rows and cols == ksize.width * ksize.height - CV_EXPORTS Ptr getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize, - Point anchor=Point(-1,-1)); - - //! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported. - CV_EXPORTS Ptr createMorphologyFilter_GPU(int op, int type, const Mat& kernel, - const Point& anchor = Point(-1,-1), int iterations = 1); - - //! returns 2D filter with the specified kernel - //! supports CV_8UC1 and CV_8UC4 types - CV_EXPORTS Ptr getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Size& ksize, - Point anchor = Point(-1, -1)); - - //! returns the non-separable linear filter engine - CV_EXPORTS Ptr createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, - const Point& anchor = Point(-1,-1)); - - //! returns the primitive row filter with the specified kernel. - //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source type. - //! there are two version of algorithm: NPP and OpenCV. - //! NPP calls when srcType == CV_8UC1 or srcType == CV_8UC4 and bufType == srcType, - //! otherwise calls OpenCV version. - //! NPP supports only BORDER_CONSTANT border type. - //! OpenCV version supports only CV_32F as buffer depth and - //! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types. - CV_EXPORTS Ptr getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, - int anchor = -1, int borderType = BORDER_CONSTANT); - - //! 
returns the primitive column filter with the specified kernel. - //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst type. - //! there are two version of algorithm: NPP and OpenCV. - //! NPP calls when dstType == CV_8UC1 or dstType == CV_8UC4 and bufType == dstType, - //! otherwise calls OpenCV version. - //! NPP supports only BORDER_CONSTANT border type. - //! OpenCV version supports only CV_32F as buffer depth and - //! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types. - CV_EXPORTS Ptr getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, - int anchor = -1, int borderType = BORDER_CONSTANT); - - //! returns the separable linear filter engine - CV_EXPORTS Ptr createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, - const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, - int columnBorderType = -1); - - //! returns filter engine for the generalized Sobel operator - CV_EXPORTS Ptr createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, - int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); - - //! returns the Gaussian filter engine - CV_EXPORTS Ptr createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, - int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); - - //! returns maximum filter - CV_EXPORTS Ptr getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1)); - - //! returns minimum filter - CV_EXPORTS Ptr getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1)); - - //! smooths the image using the normalized box filter - //! supports CV_8UC1, CV_8UC4 types - CV_EXPORTS void boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null()); - - //! a synonym for normalized box filter - static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null()) { boxFilter(src, dst, -1, ksize, anchor, stream); } - - //! erodes the image (applies the local minimum operator) - CV_EXPORTS void erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null()); - - //! dilates the image (applies the local maximum operator) - CV_EXPORTS void dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null()); - - //! applies an advanced morphological operation to the image - CV_EXPORTS void morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null()); - - //! applies non-separable 2D linear filter to the image - CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), Stream& stream = Stream::Null()); - - //! applies separable 2D linear filter to the image - CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, - Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null()); - - //! 
applies generalized Sobel operator to the image - CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1, - int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null()); - - //! applies the vertical or horizontal Scharr operator to the image - CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1, - int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null()); - - //! smooths the image using Gaussian filter. - CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0, - int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null()); - - //! applies Laplacian operator to the image - //! supports only ksize = 1 and ksize = 3 - CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, Stream& stream = Stream::Null()); - //////////////////////////////// Image Labeling //////////////////////////////// //!performs labeling via graph cuts diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu index bb86cc8efb..def579702c 100644 --- a/modules/gpu/src/cuda/imgproc.cu +++ b/modules/gpu/src/cuda/imgproc.cu @@ -908,29 +908,31 @@ namespace cv { namespace gpu { namespace imgproc template - void downsampleCaller(const DevMem2D src, DevMem2D dst) + void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y)); - downsampleKernel<<>>(DevMem2D_(src), DevMem2D_(dst)); + downsampleKernel<<>>(DevMem2D_(src), DevMem2D_(dst)); cudaSafeCall(cudaGetLastError()); - cudaSafeCall(cudaDeviceSynchronize()); + + if (stream == 0) + cudaSafeCall(cudaDeviceSynchronize()); } - template void downsampleCaller(const DevMem2D src, DevMem2D dst); - template void downsampleCaller(const DevMem2D src, DevMem2D dst); - template void downsampleCaller(const DevMem2D src, DevMem2D dst); - template void downsampleCaller(const DevMem2D src, DevMem2D dst); - template void downsampleCaller(const DevMem2D src, DevMem2D dst); - template void downsampleCaller(const DevMem2D src, DevMem2D dst); - template void downsampleCaller(const DevMem2D src, DevMem2D dst); - template void downsampleCaller(const DevMem2D src, DevMem2D dst); - template void downsampleCaller(const DevMem2D src, DevMem2D dst); - template void downsampleCaller(const DevMem2D src, DevMem2D dst); - template void downsampleCaller(const DevMem2D src, DevMem2D dst); - template void downsampleCaller(const DevMem2D src, DevMem2D dst); + template void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void downsampleCaller(const 
DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); ////////////////////////////////////////////////////////////////////////// @@ -952,29 +954,31 @@ namespace cv { namespace gpu { namespace imgproc template - void upsampleCaller(const DevMem2D src, DevMem2D dst) + void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y)); - upsampleKernel<<>>(DevMem2D_(src), DevMem2D_(dst)); + upsampleKernel<<>>(DevMem2D_(src), DevMem2D_(dst)); cudaSafeCall(cudaGetLastError()); - cudaSafeCall(cudaDeviceSynchronize()); + + if (stream == 0) + cudaSafeCall(cudaDeviceSynchronize()); } - template void upsampleCaller(const DevMem2D src, DevMem2D dst); - template void upsampleCaller(const DevMem2D src, DevMem2D dst); - template void upsampleCaller(const DevMem2D src, DevMem2D dst); - template void upsampleCaller(const DevMem2D src, DevMem2D dst); - template void upsampleCaller(const DevMem2D src, DevMem2D dst); - template void upsampleCaller(const DevMem2D src, DevMem2D dst); - template void upsampleCaller(const DevMem2D src, DevMem2D dst); - template void upsampleCaller(const DevMem2D src, DevMem2D dst); - template void upsampleCaller(const DevMem2D src, DevMem2D dst); - template void upsampleCaller(const DevMem2D src, DevMem2D dst); - template void upsampleCaller(const DevMem2D src, DevMem2D dst); - template void upsampleCaller(const DevMem2D src, DevMem2D dst); + template void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); + template void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); ////////////////////////////////////////////////////////////////////////// diff --git a/modules/gpu/src/imgproc_gpu.cpp b/modules/gpu/src/imgproc_gpu.cpp index 52e23d60b1..69ac7c9079 100644 --- a/modules/gpu/src/imgproc_gpu.cpp +++ b/modules/gpu/src/imgproc_gpu.cpp @@ -79,10 +79,14 @@ void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int) { throw_nogpu(); } void cv::gpu::ConvolveBuf::create(Size, Size) { throw_nogpu(); } void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_nogpu(); } void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&) { throw_nogpu(); } -void cv::gpu::downsample(const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::upsample(const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::pyrDown(const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::pyrUp(const GpuMat&, GpuMat&) { 
throw_nogpu(); } +void cv::gpu::downsample(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::upsample(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::pyrDown(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::PyrDownBuf::create(Size, int) { throw_nogpu(); } +void cv::gpu::pyrDown(const GpuMat&, GpuMat&, PyrDownBuf&, Stream&) { throw_nogpu(); } +void cv::gpu::pyrUp(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } +void cv::gpu::PyrUpBuf::create(Size, int) { throw_nogpu(); } +void cv::gpu::pyrUp(const GpuMat&, GpuMat&, PyrUpBuf&, Stream&) { throw_nogpu(); } @@ -1413,15 +1417,15 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, namespace cv { namespace gpu { namespace imgproc { template - void downsampleCaller(const DevMem2D src, DevMem2D dst); + void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); }}} -void cv::gpu::downsample(const GpuMat& src, GpuMat& dst) +void cv::gpu::downsample(const GpuMat& src, GpuMat& dst, Stream& stream) { CV_Assert(src.depth() < CV_64F && src.channels() <= 4); - typedef void (*Caller)(const DevMem2D, DevMem2D); + typedef void (*Caller)(const DevMem2D, DevMem2D, cudaStream_t stream); static const Caller callers[6][4] = {{imgproc::downsampleCaller, imgproc::downsampleCaller, imgproc::downsampleCaller, imgproc::downsampleCaller}, @@ -1437,7 +1441,7 @@ void cv::gpu::downsample(const GpuMat& src, GpuMat& dst) CV_Error(CV_StsUnsupportedFormat, "bad number of channels"); dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type()); - caller(src, dst.reshape(1)); + caller(src, dst.reshape(1), StreamAccessor::getStream(stream)); } @@ -1447,15 +1451,15 @@ void cv::gpu::downsample(const GpuMat& src, GpuMat& dst) namespace cv { namespace gpu { namespace imgproc { template - void upsampleCaller(const DevMem2D src, DevMem2D dst); + void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream); }}} -void cv::gpu::upsample(const GpuMat& src, GpuMat& dst) +void cv::gpu::upsample(const GpuMat& src, GpuMat& dst, Stream& stream) { CV_Assert(src.depth() < CV_64F && src.channels() <= 4); - typedef void (*Caller)(const DevMem2D, DevMem2D); + typedef void (*Caller)(const DevMem2D, DevMem2D, cudaStream_t stream); static const Caller callers[6][5] = {{imgproc::upsampleCaller, imgproc::upsampleCaller, imgproc::upsampleCaller, imgproc::upsampleCaller}, @@ -1471,31 +1475,73 @@ void cv::gpu::upsample(const GpuMat& src, GpuMat& dst) CV_Error(CV_StsUnsupportedFormat, "bad number of channels"); dst.create(src.rows*2, src.cols*2, src.type()); - caller(src, dst.reshape(1)); + caller(src, dst.reshape(1), StreamAccessor::getStream(stream)); } ////////////////////////////////////////////////////////////////////////////// // pyrDown -void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst) +void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream) { - Mat ker = getGaussianKernel(5, 0, std::max(CV_32F, src.depth())); - GpuMat buf; - sepFilter2D(src, buf, src.depth(), ker, ker); - downsample(buf, dst); + PyrDownBuf buf; + pyrDown(src, dst, buf, stream); +} + +cv::Mat cv::gpu::PyrDownBuf::ker; + +void cv::gpu::PyrDownBuf::create(Size image_size, int image_type_) +{ + if (ker.empty() || image_type_ != image_type) + ker = getGaussianKernel(5, 0, std::max(CV_32F, CV_MAT_DEPTH(image_type_))); + + ensureSizeIsEnough(image_size.height, image_size.width, image_type_, buf); + + if (filter.empty() || image_type_ != image_type) + { + image_type = image_type_; + filter = 
createSeparableLinearFilter_GPU(image_type, image_type, ker, ker); + } +} + +void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, PyrDownBuf& buf, Stream& stream) +{ + buf.create(src.size(), src.type()); + buf.filter->apply(src, buf.buf, Rect(0, 0, src.cols, src.rows), stream); + downsample(buf.buf, dst, stream); } ////////////////////////////////////////////////////////////////////////////// // pyrUp -void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst) +void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream) { - GpuMat buf; - upsample(src, buf); - Mat ker = getGaussianKernel(5, 0, std::max(CV_32F, src.depth())) * 2; - sepFilter2D(buf, dst, buf.depth(), ker, ker); + PyrUpBuf buf; + pyrUp(src, dst, buf, stream); +} + +cv::Mat cv::gpu::PyrUpBuf::ker; + +void cv::gpu::PyrUpBuf::create(Size image_size, int image_type_) +{ + if (ker.empty() || image_type_ != image_type) + ker = getGaussianKernel(5, 0, std::max(CV_32F, CV_MAT_DEPTH(image_type_))) * 2; + + ensureSizeIsEnough(image_size.height * 2, image_size.width * 2, image_type_, buf); + + if (filter.empty() || image_type_ != image_type) + { + image_type = image_type_; + filter = createSeparableLinearFilter_GPU(image_type, image_type, ker, ker); + } +} + +void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, PyrUpBuf& buf, Stream& stream) +{ + buf.create(src.size(), src.type()); + upsample(src, buf.buf, stream); + buf.filter->apply(buf.buf, dst, Rect(0, 0, buf.buf.cols, buf.buf.rows), stream); } #endif /* !defined (HAVE_CUDA) */ diff --git a/modules/gpu/test/test_features2d.cpp b/modules/gpu/test/test_features2d.cpp index 083a35bcfa..0479cc2fa4 100644 --- a/modules/gpu/test/test_features2d.cpp +++ b/modules/gpu/test/test_features2d.cpp @@ -560,778 +560,3 @@ INSTANTIATE_TEST_CASE_P(Features2D, BruteForceMatcher, testing::Combine( testing::Values(57, 64, 83, 128, 179, 256, 304))); #endif // HAVE_CUDA - - - - - - - - - - - - - - - - - -//struct CV_GpuBFMTest : CV_GpuTestBase -//{ -// void run_gpu_test(); -// -// void generateData(GpuMat& query, GpuMat& train, int dim, int depth); -// -// virtual void test(const GpuMat& query, const GpuMat& train, BruteForceMatcher_GPU_base& matcher) = 0; -// -// static const int queryDescCount = 300; // must be even number because we split train data in some cases in two -// static const int countFactor = 4; // do not change it -//}; -// -//void CV_GpuBFMTest::run_gpu_test() -//{ -// BruteForceMatcher_GPU_base::DistType dists[] = {BruteForceMatcher_GPU_base::L1Dist, BruteForceMatcher_GPU_base::L2Dist, BruteForceMatcher_GPU_base::HammingDist}; -// const char* dists_str[] = {"L1Dist", "L2Dist", "HammingDist"}; -// int dists_count = sizeof(dists) / sizeof(dists[0]); -// -// RNG rng = ts->get_rng(); -// -// int dims[] = {rng.uniform(30, 60), 64, rng.uniform(70, 110), 128, rng.uniform(130, 250), 256, rng.uniform(260, 350)}; -// int dims_count = sizeof(dims) / sizeof(dims[0]); -// -// for (int dist = 0; dist < dists_count; ++dist) -// { -// int depth_end = dists[dist] == BruteForceMatcher_GPU_base::HammingDist ? 
CV_32S : CV_32F; -// -// for (int depth = CV_8U; depth <= depth_end; ++depth) -// { -// for (int dim = 0; dim < dims_count; ++dim) -// { -// PRINT_ARGS("dist=%s depth=%s dim=%d", dists_str[dist], getTypeName(depth), dims[dim]); -// -// BruteForceMatcher_GPU_base matcher(dists[dist]); -// -// GpuMat query, train; -// generateData(query, train, dim, depth); -// -// test(query, train, matcher); -// } -// } -// } -//} -// -//void CV_GpuBFMTest::generateData(GpuMat& queryGPU, GpuMat& trainGPU, int dim, int depth) -//{ -// RNG& rng = ts->get_rng(); -// -// Mat queryBuf, trainBuf; -// -// // Generate query descriptors randomly. -// // Descriptor vector elements are integer values. -// queryBuf.create(queryDescCount, dim, CV_32SC1); -// rng.fill(queryBuf, RNG::UNIFORM, Scalar::all(0), Scalar(3)); -// queryBuf.convertTo(queryBuf, CV_32FC1); -// -// // Generate train decriptors as follows: -// // copy each query descriptor to train set countFactor times -// // and perturb some one element of the copied descriptors in -// // in ascending order. General boundaries of the perturbation -// // are (0.f, 1.f). -// trainBuf.create(queryDescCount * countFactor, dim, CV_32FC1); -// float step = 1.f / countFactor; -// for (int qIdx = 0; qIdx < queryDescCount; qIdx++) -// { -// Mat queryDescriptor = queryBuf.row(qIdx); -// for (int c = 0; c < countFactor; c++) -// { -// int tIdx = qIdx * countFactor + c; -// Mat trainDescriptor = trainBuf.row(tIdx); -// queryDescriptor.copyTo(trainDescriptor); -// int elem = rng(dim); -// float diff = rng.uniform(step * c, step * (c + 1)); -// trainDescriptor.at(0, elem) += diff; -// } -// } -// -// Mat query, train; -// queryBuf.convertTo(query, depth); -// trainBuf.convertTo(train, depth); -// -// queryGPU.upload(query); -// trainGPU.upload(train); -//} -// -//#define GPU_BFM_TEST(test_name) -// struct CV_GpuBFM_ ##test_name ## _Test : CV_GpuBFMTest -// { -// void test(const GpuMat& query, const GpuMat& train, BruteForceMatcher_GPU_base& matcher); -// }; -// TEST(BruteForceMatcher, test_name) { CV_GpuBFM_ ##test_name ## _Test test; test.safe_run(); } -// void CV_GpuBFM_ ##test_name ## _Test::test(const GpuMat& query, const GpuMat& train, BruteForceMatcher_GPU_base& matcher) -// -///////////////////////////////////////////////////////////////////////////////////////////////////////// -//// match -// -//GPU_BFM_TEST(match) -//{ -// vector matches; -// -// matcher.match(query, train, matches); -// -// CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT); -// -// int badCount = 0; -// for (size_t i = 0; i < matches.size(); i++) -// { -// DMatch match = matches[i]; -// if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor) || (match.imgIdx != 0)) -// badCount++; -// } -// -// CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT); -//} -// -//GPU_BFM_TEST(match_add) -//{ -// vector matches; -// -// // make add() twice to test such case -// matcher.add(vector(1, train.rowRange(0, train.rows/2))); -// matcher.add(vector(1, train.rowRange(train.rows/2, train.rows))); -// -// // prepare masks (make first nearest match illegal) -// vector masks(2); -// for (int mi = 0; mi < 2; mi++) -// { -// masks[mi] = GpuMat(query.rows, train.rows/2, CV_8UC1, Scalar::all(1)); -// for (int di = 0; di < queryDescCount/2; di++) -// masks[mi].col(di * countFactor).setTo(Scalar::all(0)); -// } -// -// matcher.match(query, matches, masks); -// -// CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT); -// -// int badCount = 0; -// for (size_t i = 0; i < 
matches.size(); i++) -// { -// DMatch match = matches[i]; -// int shift = matcher.isMaskSupported() ? 1 : 0; -// { -// if (i < queryDescCount / 2) -// { -// if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + shift) || (match.imgIdx != 0)) -// badCount++; -// } -// else -// { -// if ((match.queryIdx != (int)i) || (match.trainIdx != ((int)i - queryDescCount / 2) * countFactor + shift) || (match.imgIdx != 1)) -// badCount++; -// } -// } -// } -// -// CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT); -//} -// -///////////////////////////////////////////////////////////////////////////////////////////////////////// -//// knnMatch -// -//GPU_BFM_TEST(knnMatch) -//{ -// const int knn = 3; -// -// vector< vector > matches; -// -// matcher.knnMatch(query, train, matches, knn); -// -// CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT); -// -// int badCount = 0; -// for (size_t i = 0; i < matches.size(); i++) -// { -// if ((int)matches[i].size() != knn) -// badCount++; -// else -// { -// int localBadCount = 0; -// for (int k = 0; k < knn; k++) -// { -// DMatch match = matches[i][k]; -// if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k) || (match.imgIdx != 0)) -// localBadCount++; -// } -// badCount += localBadCount > 0 ? 1 : 0; -// } -// } -// -// CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT); -//} -// -//GPU_BFM_TEST(knnMatch_add) -//{ -// const int knn = 2; -// vector > matches; -// -// // make add() twice to test such case -// matcher.add(vector(1,train.rowRange(0, train.rows / 2))); -// matcher.add(vector(1,train.rowRange(train.rows / 2, train.rows))); -// -// // prepare masks (make first nearest match illegal) -// vector masks(2); -// for (int mi = 0; mi < 2; mi++ ) -// { -// masks[mi] = GpuMat(query.rows, train.rows / 2, CV_8UC1, Scalar::all(1)); -// for (int di = 0; di < queryDescCount / 2; di++) -// masks[mi].col(di * countFactor).setTo(Scalar::all(0)); -// } -// -// matcher.knnMatch(query, matches, knn, masks); -// -// CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT); -// -// int badCount = 0; -// int shift = matcher.isMaskSupported() ? 1 : 0; -// for (size_t i = 0; i < matches.size(); i++) -// { -// if ((int)matches[i].size() != knn) -// badCount++; -// else -// { -// int localBadCount = 0; -// for (int k = 0; k < knn; k++) -// { -// DMatch match = matches[i][k]; -// { -// if (i < queryDescCount / 2) -// { -// if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k + shift) || (match.imgIdx != 0) ) -// localBadCount++; -// } -// else -// { -// if ((match.queryIdx != (int)i) || (match.trainIdx != ((int)i - queryDescCount / 2) * countFactor + k + shift) || (match.imgIdx != 1) ) -// localBadCount++; -// } -// } -// } -// badCount += localBadCount > 0 ? 
1 : 0; -// } -// } -// -// CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT); -//} -// -///////////////////////////////////////////////////////////////////////////////////////////////////////// -//// radiusMatch -// -//GPU_BFM_TEST(radiusMatch) -//{ -// CHECK_RETURN(support(GLOBAL_ATOMICS), TS::SKIPPED); -// -// const float radius = 1.f / countFactor; -// -// vector< vector > matches; -// -// matcher.radiusMatch(query, train, matches, radius); -// -// CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT); -// -// int badCount = 0; -// for (size_t i = 0; i < matches.size(); i++) -// { -// if ((int)matches[i].size() != 1) -// badCount++; -// else -// { -// DMatch match = matches[i][0]; -// if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i*countFactor) || (match.imgIdx != 0)) -// badCount++; -// } -// } -// -// CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT); -//} -// -//GPU_BFM_TEST(radiusMatch_add) -//{ -// CHECK_RETURN(support(GLOBAL_ATOMICS), TS::SKIPPED); -// -// int n = 3; -// const float radius = 1.f / countFactor * n; -// vector< vector > matches; -// -// // make add() twice to test such case -// matcher.add(vector(1,train.rowRange(0, train.rows / 2))); -// matcher.add(vector(1,train.rowRange(train.rows / 2, train.rows))); -// -// // prepare masks (make first nearest match illegal) -// vector masks(2); -// for (int mi = 0; mi < 2; mi++) -// { -// masks[mi] = GpuMat(query.rows, train.rows / 2, CV_8UC1, Scalar::all(1)); -// for (int di = 0; di < queryDescCount / 2; di++) -// masks[mi].col(di * countFactor).setTo(Scalar::all(0)); -// } -// -// matcher.radiusMatch(query, matches, radius, masks); -// -// CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT); -// -// int badCount = 0; -// int shift = matcher.isMaskSupported() ? 1 : 0; -// int needMatchCount = matcher.isMaskSupported() ? n-1 : n; -// for (size_t i = 0; i < matches.size(); i++) -// { -// if ((int)matches[i].size() != needMatchCount) -// badCount++; -// else -// { -// int localBadCount = 0; -// for (int k = 0; k < needMatchCount; k++) -// { -// DMatch match = matches[i][k]; -// { -// if (i < queryDescCount / 2) -// { -// if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k + shift) || (match.imgIdx != 0) ) -// localBadCount++; -// } -// else -// { -// if ((match.queryIdx != (int)i) || (match.trainIdx != ((int)i - queryDescCount / 2) * countFactor + k + shift) || (match.imgIdx != 1) ) -// localBadCount++; -// } -// } -// } -// badCount += localBadCount > 0 ? 
1 : 0; -// } -// } -// -// CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT); -//} -// -// -// -// -// -// -// -// -// -// -// -// -// -// -// -// -// -// -// -// -// -////struct CV_GpuBruteForceMatcherTest : CV_GpuTestBase -////{ -//// void run_gpu_test(); -//// -//// void emptyDataTest(); -//// void dataTest(int dim); -//// -//// void generateData(GpuMat& query, GpuMat& train, int dim); -//// -//// void matchTest(const GpuMat& query, const GpuMat& train); -//// void knnMatchTest(const GpuMat& query, const GpuMat& train); -//// void radiusMatchTest(const GpuMat& query, const GpuMat& train); -//// -//// BruteForceMatcher_GPU< L2 > dmatcher; -//// -//// static const int queryDescCount = 300; // must be even number because we split train data in some cases in two -//// static const int countFactor = 4; // do not change it -////}; -//// -////void CV_GpuBruteForceMatcherTest::emptyDataTest() -////{ -//// GpuMat queryDescriptors, trainDescriptors, mask; -//// vector trainDescriptorCollection, masks; -//// vector matches; -//// vector< vector > vmatches; -//// -//// try -//// { -//// dmatcher.match(queryDescriptors, trainDescriptors, matches, mask); -//// } -//// catch(...) -//// { -//// PRINTLN("match() on empty descriptors must not generate exception (1)"); -//// ts->set_failed_test_info(TS::FAIL_EXCEPTION); -//// } -//// -//// try -//// { -//// dmatcher.knnMatch(queryDescriptors, trainDescriptors, vmatches, 2, mask); -//// } -//// catch(...) -//// { -//// PRINTLN("knnMatch() on empty descriptors must not generate exception (1)"); -//// ts->set_failed_test_info(TS::FAIL_EXCEPTION); -//// } -//// -//// try -//// { -//// dmatcher.radiusMatch(queryDescriptors, trainDescriptors, vmatches, 10.f, mask); -//// } -//// catch(...) -//// { -//// PRINTLN("radiusMatch() on empty descriptors must not generate exception (1)"); -//// ts->set_failed_test_info(TS::FAIL_EXCEPTION); -//// } -//// -//// try -//// { -//// dmatcher.add(trainDescriptorCollection); -//// } -//// catch(...) -//// { -//// PRINTLN("add() on empty descriptors must not generate exception"); -//// ts->set_failed_test_info(TS::FAIL_EXCEPTION); -//// } -//// -//// try -//// { -//// dmatcher.match(queryDescriptors, matches, masks); -//// } -//// catch(...) -//// { -//// PRINTLN("match() on empty descriptors must not generate exception (2)"); -//// ts->set_failed_test_info(TS::FAIL_EXCEPTION); -//// } -//// -//// try -//// { -//// dmatcher.knnMatch(queryDescriptors, vmatches, 2, masks); -//// } -//// catch(...) -//// { -//// PRINTLN("knnMatch() on empty descriptors must not generate exception (2)"); -//// ts->set_failed_test_info(TS::FAIL_EXCEPTION); -//// } -//// -//// try -//// { -//// dmatcher.radiusMatch( queryDescriptors, vmatches, 10.f, masks ); -//// } -//// catch(...) -//// { -//// PRINTLN("radiusMatch() on empty descriptors must not generate exception (2)"); -//// ts->set_failed_test_info(TS::FAIL_EXCEPTION); -//// } -//// -////} -//// -////void CV_GpuBruteForceMatcherTest::generateData(GpuMat& queryGPU, GpuMat& trainGPU, int dim) -////{ -//// Mat query, train; -//// RNG& rng = ts->get_rng(); -//// -//// // Generate query descriptors randomly. -//// // Descriptor vector elements are integer values. 
-//// Mat buf(queryDescCount, dim, CV_32SC1); -//// rng.fill(buf, RNG::UNIFORM, Scalar::all(0), Scalar(3)); -//// buf.convertTo(query, CV_32FC1); -//// -//// // Generate train decriptors as follows: -//// // copy each query descriptor to train set countFactor times -//// // and perturb some one element of the copied descriptors in -//// // in ascending order. General boundaries of the perturbation -//// // are (0.f, 1.f). -//// train.create( query.rows*countFactor, query.cols, CV_32FC1 ); -//// float step = 1.f / countFactor; -//// for (int qIdx = 0; qIdx < query.rows; qIdx++) -//// { -//// Mat queryDescriptor = query.row(qIdx); -//// for (int c = 0; c < countFactor; c++) -//// { -//// int tIdx = qIdx * countFactor + c; -//// Mat trainDescriptor = train.row(tIdx); -//// queryDescriptor.copyTo(trainDescriptor); -//// int elem = rng(dim); -//// float diff = rng.uniform(step * c, step * (c + 1)); -//// trainDescriptor.at(0, elem) += diff; -//// } -//// } -//// -//// queryGPU.upload(query); -//// trainGPU.upload(train); -////} -//// -////void CV_GpuBruteForceMatcherTest::matchTest(const GpuMat& query, const GpuMat& train) -////{ -//// dmatcher.clear(); -//// -//// // test const version of match() -//// { -//// vector matches; -//// dmatcher.match(query, train, matches); -//// -//// CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT); -//// -//// int badCount = 0; -//// for (size_t i = 0; i < matches.size(); i++) -//// { -//// DMatch match = matches[i]; -//// if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor) || (match.imgIdx != 0)) -//// badCount++; -//// } -//// -//// CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT); -//// } -//// -//// // test version of match() with add() -//// { -//// vector matches; -//// -//// // make add() twice to test such case -//// dmatcher.add(vector(1, train.rowRange(0, train.rows/2))); -//// dmatcher.add(vector(1, train.rowRange(train.rows/2, train.rows))); -//// -//// // prepare masks (make first nearest match illegal) -//// vector masks(2); -//// for (int mi = 0; mi < 2; mi++) -//// { -//// masks[mi] = GpuMat(query.rows, train.rows/2, CV_8UC1, Scalar::all(1)); -//// for (int di = 0; di < queryDescCount/2; di++) -//// masks[mi].col(di * countFactor).setTo(Scalar::all(0)); -//// } -//// -//// dmatcher.match(query, matches, masks); -//// -//// CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT); -//// -//// int badCount = 0; -//// for (size_t i = 0; i < matches.size(); i++) -//// { -//// DMatch match = matches[i]; -//// int shift = dmatcher.isMaskSupported() ? 
1 : 0; -//// { -//// if (i < queryDescCount / 2) -//// { -//// if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + shift) || (match.imgIdx != 0)) -//// badCount++; -//// } -//// else -//// { -//// if ((match.queryIdx != (int)i) || (match.trainIdx != ((int)i - queryDescCount / 2) * countFactor + shift) || (match.imgIdx != 1)) -//// badCount++; -//// } -//// } -//// } -//// -//// CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT); -//// } -////} -//// -////void CV_GpuBruteForceMatcherTest::knnMatchTest(const GpuMat& query, const GpuMat& train) -////{ -//// dmatcher.clear(); -//// -//// // test const version of knnMatch() -//// { -//// const int knn = 3; -//// -//// vector< vector > matches; -//// dmatcher.knnMatch(query, train, matches, knn); -//// -//// CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT); -//// -//// int badCount = 0; -//// for (size_t i = 0; i < matches.size(); i++) -//// { -//// if ((int)matches[i].size() != knn) -//// badCount++; -//// else -//// { -//// int localBadCount = 0; -//// for (int k = 0; k < knn; k++) -//// { -//// DMatch match = matches[i][k]; -//// if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k) || (match.imgIdx != 0)) -//// localBadCount++; -//// } -//// badCount += localBadCount > 0 ? 1 : 0; -//// } -//// } -//// -//// CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT); -//// } -//// -//// // test version of knnMatch() with add() -//// { -//// const int knn = 2; -//// vector > matches; -//// -//// // make add() twice to test such case -//// dmatcher.add(vector(1,train.rowRange(0, train.rows / 2))); -//// dmatcher.add(vector(1,train.rowRange(train.rows / 2, train.rows))); -//// -//// // prepare masks (make first nearest match illegal) -//// vector masks(2); -//// for (int mi = 0; mi < 2; mi++ ) -//// { -//// masks[mi] = GpuMat(query.rows, train.rows / 2, CV_8UC1, Scalar::all(1)); -//// for (int di = 0; di < queryDescCount / 2; di++) -//// masks[mi].col(di * countFactor).setTo(Scalar::all(0)); -//// } -//// -//// dmatcher.knnMatch(query, matches, knn, masks); -//// -//// CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT); -//// -//// int badCount = 0; -//// int shift = dmatcher.isMaskSupported() ? 1 : 0; -//// for (size_t i = 0; i < matches.size(); i++) -//// { -//// if ((int)matches[i].size() != knn) -//// badCount++; -//// else -//// { -//// int localBadCount = 0; -//// for (int k = 0; k < knn; k++) -//// { -//// DMatch match = matches[i][k]; -//// { -//// if (i < queryDescCount / 2) -//// { -//// if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k + shift) || (match.imgIdx != 0) ) -//// localBadCount++; -//// } -//// else -//// { -//// if ((match.queryIdx != (int)i) || (match.trainIdx != ((int)i - queryDescCount / 2) * countFactor + k + shift) || (match.imgIdx != 1) ) -//// localBadCount++; -//// } -//// } -//// } -//// badCount += localBadCount > 0 ? 
1 : 0; -//// } -//// } -//// -//// CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT); -//// } -////} -//// -////void CV_GpuBruteForceMatcherTest::radiusMatchTest(const GpuMat& query, const GpuMat& train) -////{ -//// CHECK_RETURN(support(GLOBAL_ATOMICS), TS::SKIPPED); -//// -//// dmatcher.clear(); -//// -//// // test const version of match() -//// { -//// const float radius = 1.f / countFactor; -//// -//// vector< vector > matches; -//// dmatcher.radiusMatch(query, train, matches, radius); -//// -//// CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT); -//// -//// int badCount = 0; -//// for (size_t i = 0; i < matches.size(); i++) -//// { -//// if ((int)matches[i].size() != 1) -//// badCount++; -//// else -//// { -//// DMatch match = matches[i][0]; -//// if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i*countFactor) || (match.imgIdx != 0)) -//// badCount++; -//// } -//// } -//// -//// CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT); -//// } -//// -//// // test version of match() with add() -//// { -//// int n = 3; -//// const float radius = 1.f / countFactor * n; -//// vector< vector > matches; -//// -//// // make add() twice to test such case -//// dmatcher.add(vector(1,train.rowRange(0, train.rows / 2))); -//// dmatcher.add(vector(1,train.rowRange(train.rows / 2, train.rows))); -//// -//// // prepare masks (make first nearest match illegal) -//// vector masks(2); -//// for (int mi = 0; mi < 2; mi++) -//// { -//// masks[mi] = GpuMat(query.rows, train.rows / 2, CV_8UC1, Scalar::all(1)); -//// for (int di = 0; di < queryDescCount / 2; di++) -//// masks[mi].col(di * countFactor).setTo(Scalar::all(0)); -//// } -//// -//// dmatcher.radiusMatch(query, matches, radius, masks); -//// -//// CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT); -//// -//// int badCount = 0; -//// int shift = dmatcher.isMaskSupported() ? 1 : 0; -//// int needMatchCount = dmatcher.isMaskSupported() ? n-1 : n; -//// for (size_t i = 0; i < matches.size(); i++) -//// { -//// if ((int)matches[i].size() != needMatchCount) -//// badCount++; -//// else -//// { -//// int localBadCount = 0; -//// for (int k = 0; k < needMatchCount; k++) -//// { -//// DMatch match = matches[i][k]; -//// { -//// if (i < queryDescCount / 2) -//// { -//// if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k + shift) || (match.imgIdx != 0) ) -//// localBadCount++; -//// } -//// else -//// { -//// if ((match.queryIdx != (int)i) || (match.trainIdx != ((int)i - queryDescCount / 2) * countFactor + k + shift) || (match.imgIdx != 1) ) -//// localBadCount++; -//// } -//// } -//// } -//// badCount += localBadCount > 0 ? 
1 : 0;
-////            }
-////        }
-////
-////        CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT);
-////    }
-////}
-////
-////void CV_GpuBruteForceMatcherTest::dataTest(int dim)
-////{
-////    GpuMat query, train;
-////    generateData(query, train, dim);
-////
-////    matchTest(query, train);
-////    knnMatchTest(query, train);
-////    radiusMatchTest(query, train);
-////
-////    dmatcher.clear();
-////}
-////
-////void CV_GpuBruteForceMatcherTest::run_gpu_test()
-////{
-////    emptyDataTest();
-////
-////    dataTest(50);
-////    dataTest(64);
-////    dataTest(100);
-////    dataTest(128);
-////    dataTest(200);
-////    dataTest(256);
-////    dataTest(300);
-////}
-////
-////TEST(BruteForceMatcher, accuracy) { CV_GpuBruteForceMatcherTest test; test.safe_run(); }
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index 48def0cffe..630708b4ee 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -866,3 +866,51 @@ TEST(GaussianBlur)
         GPU_OFF;
     }
 }
+
+TEST(pyrDown)
+{
+    gpu::PyrDownBuf buf;
+
+    for (int size = 4000; size >= 1000; size -= 1000)
+    {
+        SUBTEST << "size " << size;
+
+        Mat src; gen(src, size, size, CV_16SC3, 0, 256);
+        Mat dst(Size(src.cols / 2, src.rows / 2), src.type());
+
+        CPU_ON;
+        pyrDown(src, dst);
+        CPU_OFF;
+
+        gpu::GpuMat d_src(src);
+        gpu::GpuMat d_dst(Size(src.cols / 2, src.rows / 2), src.type());
+
+        GPU_ON;
+        gpu::pyrDown(d_src, d_dst, buf);
+        GPU_OFF;
+    }
+}
+
+TEST(pyrUp)
+{
+    gpu::PyrUpBuf buf;
+
+    for (int size = 4000; size >= 1000; size -= 1000)
+    {
+        SUBTEST << "size " << size;
+
+        Mat src; gen(src, size, size, CV_16SC3, 0, 256);
+        Mat dst(Size(src.cols * 2, src.rows * 2), src.type());
+
+        CPU_ON;
+        pyrUp(src, dst);
+        CPU_OFF;
+
+        gpu::GpuMat d_src(src);
+        gpu::GpuMat d_dst(Size(src.cols * 2, src.rows * 2), src.type());
+
+        GPU_ON;
+        gpu::pyrUp(d_src, d_dst, buf);
+        GPU_OFF;
+    }
+}
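--
The buffered entry points above are meant for repeated calls: PyrDownBuf holds
the intermediate image and the separable Gaussian filter engine so they are
allocated once, and the Stream parameter lets the filtering and the downsample
kernel run asynchronously. A minimal usage sketch under those assumptions
follows; buildPyramid is a hypothetical helper written for illustration, not
something added by this patch.

    #include <vector>
    #include <opencv2/gpu/gpu.hpp>

    using namespace cv;

    // Build a levels-deep Gaussian pyramid, reusing one PyrDownBuf and
    // enqueueing all work on a single stream.
    void buildPyramid(const gpu::GpuMat& src, std::vector<gpu::GpuMat>& pyr, int levels)
    {
        gpu::Stream stream;
        gpu::PyrDownBuf buf(src.size(), src.type()); // buffer + filter created once

        pyr.resize(levels);
        pyr[0] = src;
        for (int i = 1; i < levels; ++i)
            gpu::pyrDown(pyr[i - 1], pyr[i], buf, stream); // asynchronous on the stream

        stream.waitForCompletion(); // wait for all levels before using them
    }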