diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp
index 899091b91e..4c17d51102 100644
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -299,7 +299,6 @@ void cv::gpu::DeviceInfo::query()
     multi_processor_count_ = prop.multiProcessorCount;
     majorVersion_ = prop.major;
     minorVersion_ = prop.minor;
-    sharedMemPerBlock_ = prop.sharedMemPerBlock;
 }
 
 void cv::gpu::DeviceInfo::queryMemory(size_t& free_memory, size_t& total_memory) const
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index f62b9a4f15..c7f60b0b6e 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -527,7 +527,6 @@ CV_EXPORTS void pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream
 
 //! compares elements of two arrays (c = a <cmpop> b)
 CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream = Stream::Null());
-CV_EXPORTS void compare(const GpuMat& a, Scalar sc, GpuMat& c, int cmpop, Stream& stream = Stream::Null());
 
 //! performs per-elements bit-wise inversion
 CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
@@ -1325,10 +1324,12 @@ protected:
 
 ////////////////////////////////// BruteForceMatcher //////////////////////////////////
 
-class CV_EXPORTS BFMatcher_GPU
+class CV_EXPORTS BruteForceMatcher_GPU_base
 {
 public:
-    explicit BFMatcher_GPU(int norm = cv::NORM_L2);
+    enum DistType {L1Dist = 0, L2Dist, HammingDist};
+
+    explicit BruteForceMatcher_GPU_base(DistType distType = L2Dist);
 
     // Add descriptors to train descriptor collection
     void add(const std::vector<GpuMat>& descCollection);
@@ -1470,7 +1471,7 @@ public:
     void radiusMatch(const GpuMat& query, std::vector< std::vector<DMatch> >& matches, float maxDistance,
         const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false);
 
-    int norm;
+    DistType distType;
 
 private:
     std::vector<GpuMat> trainDescCollection;
@@ -1480,24 +1481,24 @@ template <class Distance>
 class CV_EXPORTS BruteForceMatcher_GPU;
 
 template <typename T>
-class CV_EXPORTS BruteForceMatcher_GPU< L1<T> > : public BFMatcher_GPU
+class CV_EXPORTS BruteForceMatcher_GPU< L1<T> > : public BruteForceMatcher_GPU_base
 {
 public:
-    explicit BruteForceMatcher_GPU() : BFMatcher_GPU(NORM_L1) {}
-    explicit BruteForceMatcher_GPU(L1<T> /*d*/) : BFMatcher_GPU(NORM_L1) {}
+    explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(L1Dist) {}
+    explicit BruteForceMatcher_GPU(L1<T> /*d*/) : BruteForceMatcher_GPU_base(L1Dist) {}
 };
 template <typename T>
-class CV_EXPORTS BruteForceMatcher_GPU< L2<T> > : public BFMatcher_GPU
+class CV_EXPORTS BruteForceMatcher_GPU< L2<T> > : public BruteForceMatcher_GPU_base
 {
 public:
-    explicit BruteForceMatcher_GPU() : BFMatcher_GPU(NORM_L2) {}
-    explicit BruteForceMatcher_GPU(L2<T> /*d*/) : BFMatcher_GPU(NORM_L2) {}
+    explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(L2Dist) {}
+    explicit BruteForceMatcher_GPU(L2<T> /*d*/) : BruteForceMatcher_GPU_base(L2Dist) {}
 };
-template <> class CV_EXPORTS BruteForceMatcher_GPU< Hamming > : public BFMatcher_GPU
+template <> class CV_EXPORTS BruteForceMatcher_GPU< Hamming > : public BruteForceMatcher_GPU_base
 {
 public:
-    explicit BruteForceMatcher_GPU() : BFMatcher_GPU(NORM_HAMMING) {}
-    explicit BruteForceMatcher_GPU(Hamming /*d*/) : BFMatcher_GPU(NORM_HAMMING) {}
+    explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(HammingDist) {}
+    explicit BruteForceMatcher_GPU(Hamming /*d*/) : BruteForceMatcher_GPU_base(HammingDist) {}
 };
 
 ////////////////////////////////// CascadeClassifier_GPU //////////////////////////////////////////
@@ -1514,7 +1515,7 @@ public:
     void release();
 
     /* returns number of detected objects */
-    int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor = 1.2, int minNeighbors = 4, Size minSize = Size());
+    int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor = 1.1, int minNeighbors = 4, Size minSize = Size());
 
     bool findLargestObject;
     bool visualizeInPlace;
@@ -1522,14 +1523,12 @@ public:
     Size getClassifierSize() const;
 
 private:
+
     struct CascadeClassifierImpl;
     CascadeClassifierImpl* impl;
     struct HaarCascade;
     struct LbpCascade;
     friend class CascadeClassifier_GPU_LBP;
-
-public:
-    int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);
 };
 
 ////////////////////////////////// SURF //////////////////////////////////////////
@@ -1559,12 +1558,12 @@ public:
     int descriptorSize() const;
 
     //! upload host keypoints to device memory
-    static void uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMat& keypointsGPU);
+    void uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMat& keypointsGPU);
     //! download keypoints from device to host memory
-    static void downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints);
+    void downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints);
 
     //! download descriptors from device to host memory
-    static void downloadDescriptors(const GpuMat& descriptorsGPU, vector<float>& descriptors);
+    void downloadDescriptors(const GpuMat& descriptorsGPU, vector<float>& descriptors);
 
     //! finds the keypoints using fast hessian detector used in SURF
     //! supports CV_8UC1 images
@@ -1631,10 +1630,10 @@ public:
     void operator ()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints);
 
     //! download keypoints from device to host memory
-    static void downloadKeypoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints);
+    void downloadKeypoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints);
 
     //! convert keypoints to KeyPoint vector
-    static void convertKeypoints(const Mat& h_keypoints, std::vector<KeyPoint>& keypoints);
+    void convertKeypoints(const Mat& h_keypoints, std::vector<KeyPoint>& keypoints);
 
     //! release temporary buffer's memory
     void release();
@@ -1705,9 +1704,10 @@ public:
     void operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors);
 
     //! download keypoints from device to host memory
-    static void downloadKeyPoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints);
+    void downloadKeyPoints(GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints);
+
     //! convert keypoints to KeyPoint vector
-    static void convertKeyPoints(const Mat& d_keypoints, std::vector<KeyPoint>& keypoints);
+    void convertKeyPoints(Mat& d_keypoints, std::vector<KeyPoint>& keypoints);
 
     //! returns the descriptor size in bytes
     inline int descriptorSize() const { return kBytes; }
@@ -1855,28 +1855,62 @@ inline GoodFeaturesToTrackDetector_GPU::GoodFeaturesToTrackDetector_GPU(int maxC
 class CV_EXPORTS PyrLKOpticalFlow
 {
 public:
-    PyrLKOpticalFlow();
+    PyrLKOpticalFlow()
+    {
+        winSize = Size(21, 21);
+        maxLevel = 3;
+        iters = 30;
+        derivLambda = 0.5;
+        useInitialFlow = false;
+        minEigThreshold = 1e-4f;
+        getMinEigenVals = false;
+        isDeviceArch11_ = !DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
+    }
 
     void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
         GpuMat& status, GpuMat* err = 0);
 
     void dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err = 0);
 
-    void releaseMemory();
-
     Size winSize;
     int maxLevel;
     int iters;
+    double derivLambda;
     bool useInitialFlow;
+    float minEigThreshold;
+    bool getMinEigenVals;
+
+    void releaseMemory()
+    {
+        dx_calcBuf_.release();
+        dy_calcBuf_.release();
+
+        prevPyr_.clear();
+        nextPyr_.clear();
+
+        dx_buf_.release();
+        dy_buf_.release();
+
+        uPyr_.clear();
+        vPyr_.clear();
+    }
 
 private:
+    void calcSharrDeriv(const GpuMat& src, GpuMat& dx, GpuMat& dy);
+
+    void buildImagePyramid(const GpuMat& img0, vector<GpuMat>& pyr, bool withBorder);
+
+    GpuMat dx_calcBuf_;
+    GpuMat dy_calcBuf_;
+
     vector<GpuMat> prevPyr_;
     vector<GpuMat> nextPyr_;
 
-    GpuMat buf_;
+    GpuMat dx_buf_;
+    GpuMat dy_buf_;
 
-    GpuMat uPyr_[2];
-    GpuMat vPyr_[2];
+    vector<GpuMat> uPyr_;
+    vector<GpuMat> vPyr_;
 
     bool isDeviceArch11_;
 };
diff --git a/modules/gpu/perf/perf_core.cpp b/modules/gpu/perf/perf_core.cpp
index 86051699a3..0580a30f0f 100644
--- a/modules/gpu/perf/perf_core.cpp
+++ b/modules/gpu/perf/perf_core.cpp
@@ -841,49 +841,6 @@ PERF_TEST_P(Sz_Depth_Code, Core_CompareMat, Combine(GPU_TYPICAL_MAT_SIZES, ARITH
     }
 }
 
-//////////////////////////////////////////////////////////////////////
-// CompareScalar
-
-PERF_TEST_P(Sz_Depth_Code, Core_CompareScalar, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_MAT_DEPTH, ALL_CMP_CODES))
-{
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-    const int cmp_code = GET_PARAM(2);
-
-    cv::Mat src(size, depth);
-    fillRandom(src);
-
-    cv::Scalar s = cv::Scalar::all(100);
-
-    if (PERF_RUN_GPU())
-    {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
-
-        cv::gpu::compare(d_src, s, d_dst, cmp_code);
-
-        TEST_CYCLE()
-        {
-            cv::gpu::compare(d_src, s, d_dst, cmp_code);
-        }
-
-        GPU_SANITY_CHECK(d_dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        cv::compare(src, s, dst, cmp_code);
-
-        TEST_CYCLE()
-        {
-            cv::compare(src, s, dst, cmp_code);
-        }
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
 //////////////////////////////////////////////////////////////////////
 // BitwiseNot
 
diff --git a/modules/gpu/perf/perf_features2d.cpp b/modules/gpu/perf/perf_features2d.cpp
index 7b2be25634..66a1b16976 100644
--- a/modules/gpu/perf/perf_features2d.cpp
+++ b/modules/gpu/perf/perf_features2d.cpp
@@ -161,7 +161,8 @@ PERF_TEST_P(DescSize_Norm, Features2D_BFMatch, Combine(Values(64, 128, 256), Val
 
     if (PERF_RUN_GPU())
     {
-        cv::gpu::BFMatcher_GPU d_matcher(normType);
+        cv::gpu::BruteForceMatcher_GPU_base d_matcher(
+            cv::gpu::BruteForceMatcher_GPU_base::DistType((normType -2) / 2));
 
         cv::gpu::GpuMat d_query(query);
         cv::gpu::GpuMat d_train(train);
@@ -220,7 +221,8 @@ PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch, Combine(
 
     if (PERF_RUN_GPU())
     {
-        cv::gpu::BFMatcher_GPU d_matcher(normType);
+        cv::gpu::BruteForceMatcher_GPU_base d_matcher(
+            cv::gpu::BruteForceMatcher_GPU_base::DistType((normType -2) / 2));
 
         cv::gpu::GpuMat d_query(query);
         cv::gpu::GpuMat d_train(train);
@@ -273,7 +275,8 @@ PERF_TEST_P(DescSize_Norm, Features2D_BFRadiusMatch, Combine(Values(64, 128, 256
 
     if (PERF_RUN_GPU())
     {
-        cv::gpu::BFMatcher_GPU d_matcher(normType);
+        cv::gpu::BruteForceMatcher_GPU_base d_matcher(
+            cv::gpu::BruteForceMatcher_GPU_base::DistType((normType -2) / 2));
 
         cv::gpu::GpuMat d_query(query);
         cv::gpu::GpuMat d_train(train);
diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp
index 9b8a9c9b73..849fc3f089 100644
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
@@ -46,39 +46,39 @@ using namespace cv;
 using namespace cv::gpu;
 using namespace std;
 
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
+#if !defined (HAVE_CUDA)
 
-cv::gpu::BFMatcher_GPU::BFMatcher_GPU(int) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::add(const vector<GpuMat>&) { throw_nogpu(); }
-const vector<GpuMat>& cv::gpu::BFMatcher_GPU::getTrainDescriptors() const { throw_nogpu(); return trainDescCollection; }
-void cv::gpu::BFMatcher_GPU::clear() { throw_nogpu(); }
-bool cv::gpu::BFMatcher_GPU::empty() const { throw_nogpu(); return true; }
-bool cv::gpu::BFMatcher_GPU::isMaskSupported() const { throw_nogpu(); return true; }
-void cv::gpu::BFMatcher_GPU::matchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::matchDownload(const GpuMat&, const GpuMat&, vector<DMatch>&) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::matchConvert(const Mat&, const Mat&, vector<DMatch>&) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::match(const GpuMat&, const GpuMat&, vector<DMatch>&, const GpuMat&) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::makeGpuCollection(GpuMat&, GpuMat&, const vector<GpuMat>&) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::matchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::matchDownload(const GpuMat&, const GpuMat&, const GpuMat&, vector<DMatch>&) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::matchConvert(const Mat&, const Mat&, const Mat&, vector<DMatch>&) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::match(const GpuMat&, vector<DMatch>&, const vector<GpuMat>&) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::knnMatchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&, Stream&) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::knnMatchDownload(const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::knnMatchConvert(const Mat&, const Mat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::knnMatch(const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, int, const GpuMat&, bool) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::knnMatch2Collection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::knnMatch2Download(const GpuMat&, const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::knnMatch2Convert(const Mat&, const Mat&, const Mat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::knnMatch(const GpuMat&, vector< vector<DMatch> >&, int, const vector<GpuMat>&, bool) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&, Stream&) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat&, const Mat&, const Mat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::radiusMatch(const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, float, const GpuMat&, bool) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const vector<GpuMat>&, Stream&) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat&, const Mat&, const Mat&, const Mat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
-void cv::gpu::BFMatcher_GPU::radiusMatch(const GpuMat&, vector< vector<DMatch> >&, float, const vector<GpuMat>&, bool) { throw_nogpu(); }
+cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::add(const vector<GpuMat>&) { throw_nogpu(); }
+const vector<GpuMat>& cv::gpu::BruteForceMatcher_GPU_base::getTrainDescriptors() const { throw_nogpu(); return trainDescCollection; }
+void cv::gpu::BruteForceMatcher_GPU_base::clear() { throw_nogpu(); }
+bool cv::gpu::BruteForceMatcher_GPU_base::empty() const { throw_nogpu(); return true; }
+bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const { throw_nogpu(); return true; }
+void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, vector<DMatch>&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat&, const Mat&, vector<DMatch>&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, const GpuMat&, vector<DMatch>&, const GpuMat&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat&, GpuMat&, const vector<GpuMat>&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, const GpuMat&, vector<DMatch>&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat&, const Mat&, const Mat&, vector<DMatch>&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, vector<DMatch>&, const vector<GpuMat>&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat&, const Mat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, int, const GpuMat&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat&, const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Convert(const Mat&, const Mat&, const Mat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, vector< vector<DMatch> >&, int, const vector<GpuMat>&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat&, const Mat&, const Mat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, float, const GpuMat&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const vector<GpuMat>&, Stream&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat&, const Mat&, const Mat&, const Mat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, vector< vector<DMatch> >&, float, const vector<GpuMat>&, bool) { throw_nogpu(); }
 
 #else /* !defined (HAVE_CUDA) */
 
@@ -159,31 +159,31 @@ namespace cv { namespace gpu { namespace device
 ////////////////////////////////////////////////////////////////////
 // Train collection
 
-cv::gpu::BFMatcher_GPU::BFMatcher_GPU(int norm_) : norm(norm_)
+cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_)
 {
 }
 
-void cv::gpu::BFMatcher_GPU::add(const vector<GpuMat>& descCollection)
+void cv::gpu::BruteForceMatcher_GPU_base::add(const vector<GpuMat>& descCollection)
 {
     trainDescCollection.insert(trainDescCollection.end(), descCollection.begin(), descCollection.end());
 }
 
-const vector<GpuMat>& cv::gpu::BFMatcher_GPU::getTrainDescriptors() const
+const vector<GpuMat>& cv::gpu::BruteForceMatcher_GPU_base::getTrainDescriptors() const
 {
     return trainDescCollection;
 }
 
-void cv::gpu::BFMatcher_GPU::clear()
+void cv::gpu::BruteForceMatcher_GPU_base::clear()
 {
     trainDescCollection.clear();
 }
 
-bool cv::gpu::BFMatcher_GPU::empty() const
+bool cv::gpu::BruteForceMatcher_GPU_base::empty() const
 {
     return trainDescCollection.empty();
 }
 
-bool cv::gpu::BFMatcher_GPU::isMaskSupported() const
+bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const
 {
     return true;
 }
@@ -191,51 +191,47 @@ bool cv::gpu::BFMatcher_GPU::isMaskSupported() const
 ////////////////////////////////////////////////////////////////////
 // Match
 
-void cv::gpu::BFMatcher_GPU::matchSingle(const GpuMat& query, const GpuMat& train,
+void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const GpuMat& train,
     GpuMat& trainIdx, GpuMat& distance,
     const GpuMat& mask, Stream& stream)
 {
     if (query.empty() || train.empty())
         return;
 
-    using namespace cv::gpu::device::bf_match;
+    using namespace ::cv::gpu::device::bf_match;
 
     typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
                              const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                              int cc, cudaStream_t stream);
 
-    static const caller_t callersL1[] =
+    static const caller_t callers[3][6] =
     {
-        matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
-        matchL1_gpu<unsigned short>, matchL1_gpu<short>,
-        matchL1_gpu<int>, matchL1_gpu<float>
-    };
-    static const caller_t callersL2[] =
-    {
-        0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
-        0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
-        0/*matchL2_gpu<int>*/, matchL2_gpu<float>
-    };
-
-    static const caller_t callersHamming[] =
-    {
-        matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
-        matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
-        matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
+        {
+            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
+            matchL1_gpu<unsigned short>, matchL1_gpu<short>,
+            matchL1_gpu<int>, matchL1_gpu<float>
+        },
+        {
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
+            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
+            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
+        },
+        {
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
+            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
+            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
+        }
     };
 
     CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
     CV_Assert(train.cols == query.cols && train.type() == query.type());
-    CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING);
-
-    const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming;
 
     const int nQuery = query.rows;
 
     ensureSizeIsEnough(1, nQuery, CV_32S, trainIdx);
     ensureSizeIsEnough(1, nQuery, CV_32F, distance);
 
-    caller_t func = callers[query.depth()];
+    caller_t func = callers[distType][query.depth()];
     CV_Assert(func != 0);
 
     DeviceInfo info;
@@ -244,7 +240,7 @@ void cv::gpu::BFMatcher_GPU::matchSingle(const GpuMat& query, const GpuMat& trai
     func(query, train, mask, trainIdx, distance, cc, StreamAccessor::getStream(stream));
 }
 
-void cv::gpu::BFMatcher_GPU::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, vector<DMatch>& matches)
+void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, vector<DMatch>& matches)
 {
     if (trainIdx.empty() || distance.empty())
         return;
@@ -255,7 +251,7 @@ void cv::gpu::BFMatcher_GPU::matchDownload(const GpuMat& trainIdx, const GpuMat&
     matchConvert(trainIdxCPU, distanceCPU, matches);
 }
 
-void cv::gpu::BFMatcher_GPU::matchConvert(const Mat& trainIdx, const Mat& distance, vector<DMatch>& matches)
+void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, const Mat& distance, vector<DMatch>& matches)
 {
     if (trainIdx.empty() || distance.empty())
         return;
@@ -272,20 +268,20 @@ void cv::gpu::BFMatcher_GPU::matchConvert(const Mat& trainIdx, const Mat& distan
     const float* distance_ptr =  distance.ptr<float>();
     for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++distance_ptr)
     {
-        int train_idx = *trainIdx_ptr;
+        int _trainIdx = *trainIdx_ptr;
 
-        if (train_idx == -1)
+        if (_trainIdx == -1)
             continue;
 
-        float distance_local = *distance_ptr;
+        float _distance = *distance_ptr;
 
-        DMatch m(queryIdx, train_idx, 0, distance_local);
+        DMatch m(queryIdx, _trainIdx, 0, _distance);
 
         matches.push_back(m);
     }
 }
 
-void cv::gpu::BFMatcher_GPU::match(const GpuMat& query, const GpuMat& train,
+void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& query, const GpuMat& train,
     vector<DMatch>& matches, const GpuMat& mask)
 {
     GpuMat trainIdx, distance;
@@ -293,7 +289,7 @@ void cv::gpu::BFMatcher_GPU::match(const GpuMat& query, const GpuMat& train,
     matchDownload(trainIdx, distance, matches);
 }
 
-void cv::gpu::BFMatcher_GPU::makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection,
+void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection,
     const vector<GpuMat>& masks)
 {
     if (empty())
@@ -337,42 +333,39 @@ void cv::gpu::BFMatcher_GPU::makeGpuCollection(GpuMat& trainCollection, GpuMat&
     }
 }
 
-void cv::gpu::BFMatcher_GPU::matchCollection(const GpuMat& query, const GpuMat& trainCollection,
+void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, const GpuMat& trainCollection,
     GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
     const GpuMat& masks, Stream& stream)
 {
     if (query.empty() || trainCollection.empty())
         return;
 
-    using namespace cv::gpu::device::bf_match;
+    using namespace ::cv::gpu::device::bf_match;
 
     typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                              const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                              int cc, cudaStream_t stream);
 
-    static const caller_t callersL1[] =
+    static const caller_t callers[3][6] =
     {
-        matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
-        matchL1_gpu<unsigned short>, matchL1_gpu<short>,
-        matchL1_gpu<int>, matchL1_gpu<float>
-    };
-    static const caller_t callersL2[] =
-    {
-        0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
-        0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
-        0/*matchL2_gpu<int>*/, matchL2_gpu<float>
-    };
-    static const caller_t callersHamming[] =
-    {
-        matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
-        matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
-        matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
+        {
+            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
+            matchL1_gpu<unsigned short>, matchL1_gpu<short>,
+            matchL1_gpu<int>, matchL1_gpu<float>
+        },
+        {
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
+            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
+            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
+        },
+        {
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
+            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
+            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
+        }
     };
 
     CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
-    CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING);
-
-    const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming;
 
     const int nQuery = query.rows;
 
@@ -380,7 +373,7 @@ void cv::gpu::BFMatcher_GPU::matchCollection(const GpuMat& query, const GpuMat&
     ensureSizeIsEnough(1, nQuery, CV_32S, imgIdx);
     ensureSizeIsEnough(1, nQuery, CV_32F, distance);
 
-    caller_t func = callers[query.depth()];
+    caller_t func = callers[distType][query.depth()];
     CV_Assert(func != 0);
 
     DeviceInfo info;
@@ -389,7 +382,7 @@ void cv::gpu::BFMatcher_GPU::matchCollection(const GpuMat& query, const GpuMat&
     func(query, trainCollection, masks, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
 }
 
-void cv::gpu::BFMatcher_GPU::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector<DMatch>& matches)
+void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector<DMatch>& matches)
 {
     if (trainIdx.empty() || imgIdx.empty() || distance.empty())
         return;
@@ -401,7 +394,7 @@ void cv::gpu::BFMatcher_GPU::matchDownload(const GpuMat& trainIdx, const GpuMat&
     matchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, matches);
 }
 
-void cv::gpu::BFMatcher_GPU::matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector<DMatch>& matches)
+void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector<DMatch>& matches)
 {
     if (trainIdx.empty() || imgIdx.empty() || distance.empty())
         return;
@@ -420,22 +413,22 @@ void cv::gpu::BFMatcher_GPU::matchConvert(const Mat& trainIdx, const Mat& imgIdx
     const float* distance_ptr =  distance.ptr<float>();
     for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
     {
-        int _trainIdx = *trainIdx_ptr;
+        int trainIdx = *trainIdx_ptr;
 
-        if (_trainIdx == -1)
+        if (trainIdx == -1)
             continue;
 
-        int _imgIdx = *imgIdx_ptr;
+        int imgIdx = *imgIdx_ptr;
 
-        float _distance = *distance_ptr;
+        float distance = *distance_ptr;
 
-        DMatch m(queryIdx, _trainIdx, _imgIdx, _distance);
+        DMatch m(queryIdx, trainIdx, imgIdx, distance);
 
         matches.push_back(m);
     }
 }
 
-void cv::gpu::BFMatcher_GPU::match(const GpuMat& query, vector<DMatch>& matches, const vector<GpuMat>& masks)
+void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& query, vector<DMatch>& matches, const vector<GpuMat>& masks)
 {
     GpuMat trainCollection;
     GpuMat maskCollection;
@@ -451,43 +444,40 @@ void cv::gpu::BFMatcher_GPU::match(const GpuMat& query, vector<DMatch>& matches,
 ////////////////////////////////////////////////////////////////////
 // KnnMatch
 
-void cv::gpu::BFMatcher_GPU::knnMatchSingle(const GpuMat& query, const GpuMat& train,
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, const GpuMat& train,
     GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k,
     const GpuMat& mask, Stream& stream)
 {
     if (query.empty() || train.empty())
         return;
 
-    using namespace cv::gpu::device::bf_knnmatch;
+    using namespace ::cv::gpu::device::bf_knnmatch;
 
     typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
                              const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                              int cc, cudaStream_t stream);
 
-    static const caller_t callersL1[] =
+    static const caller_t callers[3][6] =
     {
-        matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
-        matchL1_gpu<unsigned short>, matchL1_gpu<short>,
-        matchL1_gpu<int>, matchL1_gpu<float>
-    };
-    static const caller_t callersL2[] =
-    {
-        0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
-        0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
-        0/*matchL2_gpu<int>*/, matchL2_gpu<float>
-    };
-    static const caller_t callersHamming[] =
-    {
-        matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
-        matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
-        matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
+        {
+            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
+            matchL1_gpu<unsigned short>, matchL1_gpu<short>,
+            matchL1_gpu<int>, matchL1_gpu<float>
+        },
+        {
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
+            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
+            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
+        },
+        {
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
+            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
+            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
+        }
     };
 
     CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
     CV_Assert(train.type() == query.type() && train.cols == query.cols);
-    CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING);
-
-    const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming;
 
     const int nQuery = query.rows;
     const int nTrain = train.rows;
@@ -509,7 +499,7 @@ void cv::gpu::BFMatcher_GPU::knnMatchSingle(const GpuMat& query, const GpuMat& t
     else
         trainIdx.setTo(Scalar::all(-1));
 
-    caller_t func = callers[query.depth()];
+    caller_t func = callers[distType][query.depth()];
     CV_Assert(func != 0);
 
     DeviceInfo info;
@@ -518,7 +508,7 @@ void cv::gpu::BFMatcher_GPU::knnMatchSingle(const GpuMat& query, const GpuMat& t
     func(query, train, k, mask, trainIdx, distance, allDist, cc, StreamAccessor::getStream(stream));
 }
 
-void cv::gpu::BFMatcher_GPU::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
     vector< vector<DMatch> >& matches, bool compactResult)
 {
     if (trainIdx.empty() || distance.empty())
@@ -530,7 +520,7 @@ void cv::gpu::BFMatcher_GPU::knnMatchDownload(const GpuMat& trainIdx, const GpuM
     knnMatchConvert(trainIdxCPU, distanceCPU, matches, compactResult);
 }
 
-void cv::gpu::BFMatcher_GPU::knnMatchConvert(const Mat& trainIdx, const Mat& distance,
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat& trainIdx, const Mat& distance,
     vector< vector<DMatch> >& matches, bool compactResult)
 {
     if (trainIdx.empty() || distance.empty())
@@ -558,13 +548,13 @@ void cv::gpu::BFMatcher_GPU::knnMatchConvert(const Mat& trainIdx, const Mat& dis
 
         for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr)
         {
-            int _trainIdx = *trainIdx_ptr;
+            int trainIdx = *trainIdx_ptr;
 
-            if (_trainIdx != -1)
+            if (trainIdx != -1)
             {
-                float _distance = *distance_ptr;
+                float distance = *distance_ptr;
 
-                DMatch m(queryIdx, _trainIdx, 0, _distance);
+                DMatch m(queryIdx, trainIdx, 0, distance);
 
                 curMatches.push_back(m);
             }
@@ -575,7 +565,7 @@ void cv::gpu::BFMatcher_GPU::knnMatchConvert(const Mat& trainIdx, const Mat& dis
     }
 }
 
-void cv::gpu::BFMatcher_GPU::knnMatch(const GpuMat& query, const GpuMat& train,
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& query, const GpuMat& train,
     vector< vector<DMatch> >& matches, int k, const GpuMat& mask, bool compactResult)
 {
     GpuMat trainIdx, distance, allDist;
@@ -583,42 +573,39 @@ void cv::gpu::BFMatcher_GPU::knnMatch(const GpuMat& query, const GpuMat& train,
     knnMatchDownload(trainIdx, distance, matches, compactResult);
 }
 
-void cv::gpu::BFMatcher_GPU::knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection,
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection,
     GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
     const GpuMat& maskCollection, Stream& stream)
 {
     if (query.empty() || trainCollection.empty())
         return;
 
-    using namespace cv::gpu::device::bf_knnmatch;
+    using namespace ::cv::gpu::device::bf_knnmatch;
 
     typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                              const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                              int cc, cudaStream_t stream);
 
-    static const caller_t callersL1[] =
+    static const caller_t callers[3][6] =
     {
-        match2L1_gpu<unsigned char>, 0/*match2L1_gpu<signed char>*/,
-        match2L1_gpu<unsigned short>, match2L1_gpu<short>,
-        match2L1_gpu<int>, match2L1_gpu<float>
-    };
-    static const caller_t callersL2[] =
-    {
-        0/*match2L2_gpu<unsigned char>*/, 0/*match2L2_gpu<signed char>*/,
-        0/*match2L2_gpu<unsigned short>*/, 0/*match2L2_gpu<short>*/,
-        0/*match2L2_gpu<int>*/, match2L2_gpu<float>
-    };
-    static const caller_t callersHamming[] =
-    {
-        match2Hamming_gpu<unsigned char>, 0/*match2Hamming_gpu<signed char>*/,
-        match2Hamming_gpu<unsigned short>, 0/*match2Hamming_gpu<short>*/,
-        match2Hamming_gpu<int>, 0/*match2Hamming_gpu<float>*/
+        {
+            match2L1_gpu<unsigned char>, 0/*match2L1_gpu<signed char>*/,
+            match2L1_gpu<unsigned short>, match2L1_gpu<short>,
+            match2L1_gpu<int>, match2L1_gpu<float>
+        },
+        {
+            0/*match2L2_gpu<unsigned char>*/, 0/*match2L2_gpu<signed char>*/,
+            0/*match2L2_gpu<unsigned short>*/, 0/*match2L2_gpu<short>*/,
+            0/*match2L2_gpu<int>*/, match2L2_gpu<float>
+        },
+        {
+            match2Hamming_gpu<unsigned char>, 0/*match2Hamming_gpu<signed char>*/,
+            match2Hamming_gpu<unsigned short>, 0/*match2Hamming_gpu<short>*/,
+            match2Hamming_gpu<int>, 0/*match2Hamming_gpu<float>*/
+        }
     };
 
     CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
-    CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING);
-
-    const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming;
 
     const int nQuery = query.rows;
 
@@ -631,7 +618,7 @@ void cv::gpu::BFMatcher_GPU::knnMatch2Collection(const GpuMat& query, const GpuM
     else
         trainIdx.setTo(Scalar::all(-1));
 
-    caller_t func = callers[query.depth()];
+    caller_t func = callers[distType][query.depth()];
     CV_Assert(func != 0);
 
     DeviceInfo info;
@@ -640,7 +627,7 @@ void cv::gpu::BFMatcher_GPU::knnMatch2Collection(const GpuMat& query, const GpuM
     func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
 }
 
-void cv::gpu::BFMatcher_GPU::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
     vector< vector<DMatch> >& matches, bool compactResult)
 {
     if (trainIdx.empty() || imgIdx.empty() || distance.empty())
@@ -653,7 +640,7 @@ void cv::gpu::BFMatcher_GPU::knnMatch2Download(const GpuMat& trainIdx, const Gpu
     knnMatch2Convert(trainIdxCPU, imgIdxCPU, distanceCPU, matches, compactResult);
 }
 
-void cv::gpu::BFMatcher_GPU::knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance,
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance,
     vector< vector<DMatch> >& matches, bool compactResult)
 {
     if (trainIdx.empty() || imgIdx.empty() || distance.empty())
@@ -680,15 +667,15 @@ void cv::gpu::BFMatcher_GPU::knnMatch2Convert(const Mat& trainIdx, const Mat& im
 
         for (int i = 0; i < 2; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
         {
-            int _trainIdx = *trainIdx_ptr;
+            int trainIdx = *trainIdx_ptr;
 
-            if (_trainIdx != -1)
+            if (trainIdx != -1)
             {
-                int _imgIdx = *imgIdx_ptr;
+                int imgIdx = *imgIdx_ptr;
 
-                float _distance = *distance_ptr;
+                float distance = *distance_ptr;
 
-                DMatch m(queryIdx, _trainIdx, _imgIdx, _distance);
+                DMatch m(queryIdx, trainIdx, imgIdx, distance);
 
                 curMatches.push_back(m);
             }
@@ -709,7 +696,7 @@ namespace
     };
 }
 
-void cv::gpu::BFMatcher_GPU::knnMatch(const GpuMat& query, vector< vector<DMatch> >& matches, int k,
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& query, vector< vector<DMatch> >& matches, int k,
     const vector<GpuMat>& masks, bool compactResult)
 {
     if (k == 2)
@@ -767,7 +754,7 @@ void cv::gpu::BFMatcher_GPU::knnMatch(const GpuMat& query, vector< vector<DMatch
 ////////////////////////////////////////////////////////////////////
 // RadiusMatch
 
-void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat& query, const GpuMat& train,
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query, const GpuMat& train,
     GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
     const GpuMat& mask, Stream& stream)
 {
@@ -780,23 +767,23 @@ void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat& query, const GpuMat
                              const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
                              int cc, cudaStream_t stream);
 
-    static const caller_t callersL1[] =
+    static const caller_t callers[3][6] =
     {
-        matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
-        matchL1_gpu<unsigned short>, matchL1_gpu<short>,
-        matchL1_gpu<int>, matchL1_gpu<float>
-    };
-    static const caller_t callersL2[] =
-    {
-        0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
-        0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
-        0/*matchL2_gpu<int>*/, matchL2_gpu<float>
-    };
-    static const caller_t callersHamming[] =
-    {
-        matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
-        matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
-        matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
+        {
+            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
+            matchL1_gpu<unsigned short>, matchL1_gpu<short>,
+            matchL1_gpu<int>, matchL1_gpu<float>
+        },
+        {
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
+            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
+            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
+        },
+        {
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
+            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
+            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
+        }
     };
 
     DeviceInfo info;
@@ -811,9 +798,6 @@ void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat& query, const GpuMat
     CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
     CV_Assert(train.type() == query.type() && train.cols == query.cols);
     CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size()));
-    CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING);
-
-    const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming;
 
     ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
     if (trainIdx.empty())
@@ -827,13 +811,13 @@ void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat& query, const GpuMat
     else
         nMatches.setTo(Scalar::all(0));
 
-    caller_t func = callers[query.depth()];
+    caller_t func = callers[distType][query.depth()];
     CV_Assert(func != 0);
 
     func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
 }
 
-void cv::gpu::BFMatcher_GPU::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches,
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches,
     vector< vector<DMatch> >& matches, bool compactResult)
 {
     if (trainIdx.empty() || distance.empty() || nMatches.empty())
@@ -846,7 +830,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchDownload(const GpuMat& trainIdx, const G
     radiusMatchConvert(trainIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult);
 }
 
-void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches,
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches,
     vector< vector<DMatch> >& matches, bool compactResult)
 {
     if (trainIdx.empty() || distance.empty() || nMatches.empty())
@@ -868,25 +852,25 @@ void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat& trainIdx, const Mat&
         const int* trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
         const float* distance_ptr = distance.ptr<float>(queryIdx);
 
-        const int nMatched = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
+        const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
 
-        if (nMatched == 0)
+        if (nMatches == 0)
         {
             if (!compactResult)
                 matches.push_back(vector<DMatch>());
             continue;
         }
 
-        matches.push_back(vector<DMatch>(nMatched));
+        matches.push_back(vector<DMatch>(nMatches));
         vector<DMatch>& curMatches = matches.back();
 
-        for (int i = 0; i < nMatched; ++i, ++trainIdx_ptr, ++distance_ptr)
+        for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++distance_ptr)
         {
-            int _trainIdx = *trainIdx_ptr;
+            int trainIdx = *trainIdx_ptr;
 
-            float _distance = *distance_ptr;
+            float distance = *distance_ptr;
 
-            DMatch m(queryIdx, _trainIdx, 0, _distance);
+            DMatch m(queryIdx, trainIdx, 0, distance);
 
             curMatches[i] = m;
         }
@@ -895,7 +879,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat& trainIdx, const Mat&
     }
 }
 
-void cv::gpu::BFMatcher_GPU::radiusMatch(const GpuMat& query, const GpuMat& train,
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& query, const GpuMat& train,
     vector< vector<DMatch> >& matches, float maxDistance, const GpuMat& mask, bool compactResult)
 {
     GpuMat trainIdx, distance, nMatches;
@@ -903,7 +887,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatch(const GpuMat& query, const GpuMat& trai
     radiusMatchDownload(trainIdx, distance, nMatches, matches, compactResult);
 }
 
-void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches,
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches,
     float maxDistance, const vector<GpuMat>& masks, Stream& stream)
 {
     if (query.empty() || empty())
@@ -915,23 +899,23 @@ void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat&
                              const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
                              int cc, cudaStream_t stream);
 
-    static const caller_t callersL1[] =
+    static const caller_t callers[3][6] =
     {
-        matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
-        matchL1_gpu<unsigned short>, matchL1_gpu<short>,
-        matchL1_gpu<int>, matchL1_gpu<float>
-    };
-    static const caller_t callersL2[] =
-    {
-        0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
-        0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
-        0/*matchL2_gpu<int>*/, matchL2_gpu<float>
-    };
-    static const caller_t callersHamming[] =
-    {
-        matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
-        matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
-        matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
+        {
+            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
+            matchL1_gpu<unsigned short>, matchL1_gpu<short>,
+            matchL1_gpu<int>, matchL1_gpu<float>
+        },
+        {
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
+            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
+            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
+        },
+        {
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
+            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
+            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
+        }
     };
 
     DeviceInfo info;
@@ -944,9 +928,6 @@ void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat&
 
     CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
     CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size() && trainIdx.size() == imgIdx.size()));
-    CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING);
-
-    const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming;
 
     ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
     if (trainIdx.empty())
@@ -961,7 +942,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat&
     else
         nMatches.setTo(Scalar::all(0));
 
-    caller_t func = callers[query.depth()];
+    caller_t func = callers[distType][query.depth()];
     CV_Assert(func != 0);
 
     vector<PtrStepSzb> trains_(trainDescCollection.begin(), trainDescCollection.end());
@@ -971,7 +952,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat&
         trainIdx, imgIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
 }
 
-void cv::gpu::BFMatcher_GPU::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,
     vector< vector<DMatch> >& matches, bool compactResult)
 {
     if (trainIdx.empty() || imgIdx.empty() || distance.empty() || nMatches.empty())
@@ -985,7 +966,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchDownload(const GpuMat& trainIdx, const G
     radiusMatchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult);
 }
 
-void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches,
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches,
     vector< vector<DMatch> >& matches, bool compactResult)
 {
     if (trainIdx.empty() || imgIdx.empty() || distance.empty() || nMatches.empty())
@@ -1009,9 +990,9 @@ void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat& trainIdx, const Mat&
         const int* imgIdx_ptr = imgIdx.ptr<int>(queryIdx);
         const float* distance_ptr = distance.ptr<float>(queryIdx);
 
-        const int nMatched = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
+        const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
 
-        if (nMatched == 0)
+        if (nMatches == 0)
         {
             if (!compactResult)
                 matches.push_back(vector<DMatch>());
@@ -1020,9 +1001,9 @@ void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat& trainIdx, const Mat&
 
         matches.push_back(vector<DMatch>());
         vector<DMatch>& curMatches = matches.back();
-        curMatches.reserve(nMatched);
+        curMatches.reserve(nMatches);
 
-        for (int i = 0; i < nMatched; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
+        for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
         {
             int _trainIdx = *trainIdx_ptr;
             int _imgIdx = *imgIdx_ptr;
@@ -1037,7 +1018,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat& trainIdx, const Mat&
     }
 }
 
-void cv::gpu::BFMatcher_GPU::radiusMatch(const GpuMat& query, vector< vector<DMatch> >& matches,
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& query, vector< vector<DMatch> >& matches,
     float maxDistance, const vector<GpuMat>& masks, bool compactResult)
 {
     GpuMat trainIdx, imgIdx, distance, nMatches;
diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp
index 07e174e5cf..f2aa1a4b79 100644
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -58,7 +58,6 @@ bool cv::gpu::CascadeClassifier_GPU::load(const string&)              { throw_no
 Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const        { throw_nogpu(); return Size();}
 void cv::gpu::CascadeClassifier_GPU::release()                        { throw_nogpu(); }
 int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat&, GpuMat&, double, int, Size)       {throw_nogpu(); return -1;}
-int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat&, GpuMat&, Size, Size, double, int) {throw_nogpu(); return -1;}
 
 #else
 
@@ -683,12 +682,6 @@ int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& image, GpuMa
     return impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, minSize, cv::Size());
 }
 
-int cv::gpu::CascadeClassifier_GPU::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize, double scaleFactor, int minNeighbors)
-{
-    CV_Assert( !this->empty());
-    return impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, minSize, maxObjectSize);
-}
-
 bool cv::gpu::CascadeClassifier_GPU::load(const string& filename)
 {
     release();
diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp
index 4f282b5051..3f5f55b99a 100644
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -64,7 +64,6 @@ void cv::gpu::sqrt(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::exp(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::log(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::compare(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
-void cv::gpu::compare(const GpuMat&, Scalar, GpuMat&, int, Stream&) { throw_nogpu(); }
 void cv::gpu::bitwise_not(const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::bitwise_or(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::bitwise_or(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); }
@@ -1435,46 +1434,6 @@ namespace
     }
 }
 
-void cv::gpu::compare(const GpuMat& src, Scalar sc, GpuMat& dst, int cmpop, Stream& stream)
-{
-    using namespace cv::gpu::device;
-
-    typedef void (*func_t)(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    static const func_t funcs[7][6] =
-    {
-        {compare_eq<unsigned char> , compare_gt<unsigned char> , compare_ge<unsigned char> , compare_lt<unsigned char> , compare_le<unsigned char> , compare_ne<unsigned char> },
-        {compare_eq<signed char>   , compare_gt<signed char>   , compare_ge<signed char>   , compare_lt<signed char>   , compare_le<signed char>   , compare_ne<signed char>   },
-        {compare_eq<unsigned short>, compare_gt<unsigned short>, compare_ge<unsigned short>, compare_lt<unsigned short>, compare_le<unsigned short>, compare_ne<unsigned short>},
-        {compare_eq<short>         , compare_gt<short>         , compare_ge<short>         , compare_lt<short>         , compare_le<short>         , compare_ne<short>         },
-        {compare_eq<int>           , compare_gt<int>           , compare_ge<int>           , compare_lt<int>           , compare_le<int>           , compare_ne<int>           },
-        {compare_eq<float>         , compare_gt<float>         , compare_ge<float>         , compare_lt<float>         , compare_le<float>         , compare_ne<float>         },
-        {compare_eq<double>        , compare_gt<double>        , compare_ge<double>        , compare_lt<double>        , compare_le<double>        , compare_ne<double>        }
-    };
-
-    typedef void (*cast_func_t)(Scalar& sc);
-    static const cast_func_t cast_func[] =
-    {
-        castScalar<unsigned char>, castScalar<signed char>, castScalar<unsigned short>, castScalar<short>, castScalar<int>, castScalar<float>, castScalar<double>
-    };
-
-    CV_Assert(src.depth() <= CV_64F);
-    CV_Assert(src.channels() <= 4);
-    CV_Assert(cmpop >= CMP_EQ && cmpop <= CMP_NE);
-
-    if (src.depth() == CV_64F)
-    {
-        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    dst.create(src.size(), CV_MAKE_TYPE(CV_8U, src.channels()));
-
-    cast_func[src.depth()](sc);
-
-    funcs[src.depth()][cmpop](src, src.channels(), sc.val, dst, StreamAccessor::getStream(stream));
-}
-
-
 //////////////////////////////////////////////////////////////////////////////
 // Unary bitwise logical operations
 
diff --git a/modules/gpu/src/hough.cpp b/modules/gpu/src/hough.cpp
index 6e2170d114..3df62cf5e9 100644
--- a/modules/gpu/src/hough.cpp
+++ b/modules/gpu/src/hough.cpp
@@ -119,7 +119,9 @@ void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, HoughLinesBuf& buf, f
     buf.accum.setTo(Scalar::all(0));
 
     DeviceInfo devInfo;
-    linesAccum_gpu(srcPoints, pointsCount, buf.accum, rho, theta, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20));
+    cudaDeviceProp prop;
+    cudaSafeCall(cudaGetDeviceProperties(&prop, devInfo.deviceID()));
+    linesAccum_gpu(srcPoints, pointsCount, buf.accum, rho, theta, prop.sharedMemPerBlock, devInfo.supports(FEATURE_SET_COMPUTE_20));
 
     ensureSizeIsEnough(2, maxLines, CV_32FC2, lines);
 
diff --git a/modules/gpu/src/orb.cpp b/modules/gpu/src/orb.cpp
index 299806c493..f9bc71c525 100644
--- a/modules/gpu/src/orb.cpp
+++ b/modules/gpu/src/orb.cpp
@@ -53,8 +53,8 @@ void cv::gpu::ORB_GPU::operator()(const GpuMat&, const GpuMat&, std::vector<KeyP
 void cv::gpu::ORB_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::ORB_GPU::operator()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::ORB_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::ORB_GPU::downloadKeyPoints(const GpuMat&, std::vector<KeyPoint>&) { throw_nogpu(); }
-void cv::gpu::ORB_GPU::convertKeyPoints(const Mat&, std::vector<KeyPoint>&) { throw_nogpu(); }
+void cv::gpu::ORB_GPU::downloadKeyPoints(GpuMat&, std::vector<KeyPoint>&) { throw_nogpu(); }
+void cv::gpu::ORB_GPU::convertKeyPoints(Mat&, std::vector<KeyPoint>&) { throw_nogpu(); }
 void cv::gpu::ORB_GPU::release() { throw_nogpu(); }
 void cv::gpu::ORB_GPU::buildScalePyramids(const GpuMat&, const GpuMat&) { throw_nogpu(); }
 void cv::gpu::ORB_GPU::computeKeyPointsPyramid() { throw_nogpu(); }
@@ -685,7 +685,7 @@ void cv::gpu::ORB_GPU::mergeKeyPoints(GpuMat& keypoints)
     }
 }
 
-void cv::gpu::ORB_GPU::downloadKeyPoints(const GpuMat &d_keypoints, std::vector<KeyPoint>& keypoints)
+void cv::gpu::ORB_GPU::downloadKeyPoints(GpuMat &d_keypoints, std::vector<KeyPoint>& keypoints)
 {
     if (d_keypoints.empty())
     {
@@ -698,7 +698,7 @@ void cv::gpu::ORB_GPU::downloadKeyPoints(const GpuMat &d_keypoints, std::vector<
     convertKeyPoints(h_keypoints, keypoints);
 }
 
-void cv::gpu::ORB_GPU::convertKeyPoints(const Mat &d_keypoints, std::vector<KeyPoint>& keypoints)
+void cv::gpu::ORB_GPU::convertKeyPoints(Mat &d_keypoints, std::vector<KeyPoint>& keypoints)
 {
     if (d_keypoints.empty())
     {
diff --git a/modules/gpu/src/pyrlk.cpp b/modules/gpu/src/pyrlk.cpp
index dc874e643e..d11bd0b0c3 100644
--- a/modules/gpu/src/pyrlk.cpp
+++ b/modules/gpu/src/pyrlk.cpp
@@ -48,10 +48,8 @@ using namespace cv::gpu;
 
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
 
-cv::gpu::PyrLKOpticalFlow::PyrLKOpticalFlow() { throw_nogpu(); }
 void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat*) { throw_nogpu(); }
 void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat*) { throw_nogpu(); }
-void cv::gpu::PyrLKOpticalFlow::releaseMemory() {}
 
 #else /* !defined (HAVE_CUDA) */
 
@@ -71,15 +69,6 @@ namespace cv { namespace gpu { namespace device
     }
 }}}
 
-cv::gpu::PyrLKOpticalFlow::PyrLKOpticalFlow()
-{
-    winSize = Size(21, 21);
-    maxLevel = 3;
-    iters = 30;
-    useInitialFlow = false;
-    isDeviceArch11_ = !DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
-}
-
 namespace
 {
     void calcPatchSize(cv::Size winSize, dim3& block, dim3& patch, bool isDeviceArch11)
@@ -153,11 +142,11 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next
     }
     else
     {
-        cvtColor(prevImg, buf_, COLOR_BGR2BGRA);
-        buf_.convertTo(prevPyr_[0], CV_32F);
+        cvtColor(prevImg, dx_calcBuf_, COLOR_BGR2BGRA);
+        dx_calcBuf_.convertTo(prevPyr_[0], CV_32F);
 
-        cvtColor(nextImg, buf_, COLOR_BGR2BGRA);
-        buf_.convertTo(nextPyr_[0], CV_32F);
+        cvtColor(nextImg, dx_calcBuf_, COLOR_BGR2BGRA);
+        dx_calcBuf_.convertTo(nextPyr_[0], CV_32F);
     }
 
     for (int level = 1; level <= maxLevel; ++level)
@@ -240,18 +229,4 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI
     vPyr_[idx].copyTo(v);
 }
 
-void cv::gpu::PyrLKOpticalFlow::releaseMemory()
-{
-    prevPyr_.clear();
-    nextPyr_.clear();
-
-    buf_.release();
-
-    uPyr_[0].release();
-    vPyr_[0].release();
-
-    uPyr_[1].release();
-    vPyr_[1].release();
-}
-
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpu/test/main.cpp b/modules/gpu/test/main.cpp
index 12f93d23a4..630a34c099 100644
--- a/modules/gpu/test/main.cpp
+++ b/modules/gpu/test/main.cpp
@@ -118,7 +118,7 @@ int main(int argc, char** argv)
 {
     try
     {
-        const std::string keys =
+         const char*  keys =
                 "{ h help ?            |      | Print help}"
                 "{ i info              |      | Print information about system and exit }"
                 "{ device              | -1   | Device on which tests will be executed (-1 means all devices) }"
@@ -127,16 +127,16 @@ int main(int argc, char** argv)
 
         CommandLineParser cmd(argc, (const char**)argv, keys);
 
-        if (cmd.has("help"))
+        if (cmd.get<bool>("help"))
         {
-            cmd.printMessage();
+            cmd.printParams();
             return 0;
         }
 
         printOsInfo();
         printCudaInfo();
 
-        if (cmd.has("info"))
+        if (cmd.get<bool>("info"))
         {
             return 0;
         }
diff --git a/modules/gpu/test/test_core.cpp b/modules/gpu/test/test_core.cpp
index 09c6be1ac3..66eff041f6 100644
--- a/modules/gpu/test/test_core.cpp
+++ b/modules/gpu/test/test_core.cpp
@@ -1543,117 +1543,6 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Compare_Array, testing::Combine(
     ALL_CMP_CODES,
     WHOLE_SUBMAT));
 
-////////////////////////////////////////////////////////////////////////////////
-// Compare_Scalar
-
-namespace
-{
-    template <template <typename> class Op, typename T>
-    void compareScalarImpl(const cv::Mat& src, cv::Scalar sc, cv::Mat& dst)
-    {
-        Op<T> op;
-
-        const int cn = src.channels();
-
-        dst.create(src.size(), CV_MAKE_TYPE(CV_8U, cn));
-
-        for (int y = 0; y < src.rows; ++y)
-        {
-            for (int x = 0; x < src.cols; ++x)
-            {
-                for (int c = 0; c < cn; ++c)
-                {
-                    T src_val = src.at<T>(y, x * cn + c);
-                    T sc_val = cv::saturate_cast<T>(sc.val[c]);
-                    dst.at<uchar>(y, x * cn + c) = static_cast<uchar>(static_cast<int>(op(src_val, sc_val)) * 255);
-                }
-            }
-        }
-    }
-
-    void compareScalarGold(const cv::Mat& src, cv::Scalar sc, cv::Mat& dst, int cmpop)
-    {
-        typedef void (*func_t)(const cv::Mat& src, cv::Scalar sc, cv::Mat& dst);
-        static const func_t funcs[7][6] =
-        {
-            {compareScalarImpl<std::equal_to, unsigned char> , compareScalarImpl<std::greater, unsigned char> , compareScalarImpl<std::greater_equal, unsigned char> , compareScalarImpl<std::less, unsigned char> , compareScalarImpl<std::less_equal, unsigned char> , compareScalarImpl<std::not_equal_to, unsigned char> },
-            {compareScalarImpl<std::equal_to, signed char>   , compareScalarImpl<std::greater, signed char>   , compareScalarImpl<std::greater_equal, signed char>   , compareScalarImpl<std::less, signed char>   , compareScalarImpl<std::less_equal, signed char>   , compareScalarImpl<std::not_equal_to, signed char>   },
-            {compareScalarImpl<std::equal_to, unsigned short>, compareScalarImpl<std::greater, unsigned short>, compareScalarImpl<std::greater_equal, unsigned short>, compareScalarImpl<std::less, unsigned short>, compareScalarImpl<std::less_equal, unsigned short>, compareScalarImpl<std::not_equal_to, unsigned short>},
-            {compareScalarImpl<std::equal_to, short>         , compareScalarImpl<std::greater, short>         , compareScalarImpl<std::greater_equal, short>         , compareScalarImpl<std::less, short>         , compareScalarImpl<std::less_equal, short>         , compareScalarImpl<std::not_equal_to, short>         },
-            {compareScalarImpl<std::equal_to, int>           , compareScalarImpl<std::greater, int>           , compareScalarImpl<std::greater_equal, int>           , compareScalarImpl<std::less, int>           , compareScalarImpl<std::less_equal, int>           , compareScalarImpl<std::not_equal_to, int>           },
-            {compareScalarImpl<std::equal_to, float>         , compareScalarImpl<std::greater, float>         , compareScalarImpl<std::greater_equal, float>         , compareScalarImpl<std::less, float>         , compareScalarImpl<std::less_equal, float>         , compareScalarImpl<std::not_equal_to, float>         },
-            {compareScalarImpl<std::equal_to, double>        , compareScalarImpl<std::greater, double>        , compareScalarImpl<std::greater_equal, double>        , compareScalarImpl<std::less, double>        , compareScalarImpl<std::less_equal, double>        , compareScalarImpl<std::not_equal_to, double>        }
-        };
-
-        funcs[src.depth()][cmpop](src, sc, dst);
-    }
-}
-
-PARAM_TEST_CASE(Compare_Scalar, cv::gpu::DeviceInfo, cv::Size, MatType, CmpCode, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    int cmp_code;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        cmp_code = GET_PARAM(3);
-        useRoi = GET_PARAM(4);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-TEST_P(Compare_Scalar, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-    cv::Scalar sc = randomScalar(0.0, 255.0);
-
-    if (src.depth() < CV_32F)
-    {
-        sc.val[0] = cvRound(sc.val[0]);
-        sc.val[1] = cvRound(sc.val[1]);
-        sc.val[2] = cvRound(sc.val[2]);
-        sc.val[3] = cvRound(sc.val[3]);
-    }
-
-    if (src.depth() == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
-    {
-        try
-        {
-            cv::gpu::GpuMat dst;
-            cv::gpu::compare(loadMat(src), sc, dst, cmp_code);
-        }
-        catch (const cv::Exception& e)
-        {
-            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
-        }
-    }
-    else
-    {
-        cv::gpu::GpuMat dst = createMat(size, CV_MAKE_TYPE(CV_8U, src.channels()), useRoi);
-
-        cv::gpu::compare(loadMat(src, useRoi), sc, dst, cmp_code);
-
-        cv::Mat dst_gold;
-        compareScalarGold(src, sc, dst_gold, cmp_code);
-
-        EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Core, Compare_Scalar, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    TYPES(CV_8U, CV_64F, 1, 4),
-    ALL_CMP_CODES,
-    WHOLE_SUBMAT));
-
 //////////////////////////////////////////////////////////////////////////////
 // Bitwise_Array
 
diff --git a/modules/gpu/test/test_features2d.cpp b/modules/gpu/test/test_features2d.cpp
index b461d4b441..bea6e81b17 100644
--- a/modules/gpu/test/test_features2d.cpp
+++ b/modules/gpu/test/test_features2d.cpp
@@ -570,7 +570,8 @@ PARAM_TEST_CASE(BruteForceMatcher, cv::gpu::DeviceInfo, NormCode, DescriptorSize
 
 TEST_P(BruteForceMatcher, Match_Single)
 {
-    cv::gpu::BFMatcher_GPU matcher(normCode);
+    cv::gpu::BruteForceMatcher_GPU_base matcher(
+                cv::gpu::BruteForceMatcher_GPU_base::DistType((normCode -2) / 2));
 
     cv::gpu::GpuMat mask;
     if (useMask)
@@ -597,7 +598,8 @@ TEST_P(BruteForceMatcher, Match_Single)
 
 TEST_P(BruteForceMatcher, Match_Collection)
 {
-    cv::gpu::BFMatcher_GPU matcher(normCode);
+    cv::gpu::BruteForceMatcher_GPU_base matcher(
+                        cv::gpu::BruteForceMatcher_GPU_base::DistType((normCode -2) / 2));
 
     cv::gpu::GpuMat d_train(train);
 
@@ -651,7 +653,8 @@ TEST_P(BruteForceMatcher, Match_Collection)
 
 TEST_P(BruteForceMatcher, KnnMatch_2_Single)
 {
-    cv::gpu::BFMatcher_GPU matcher(normCode);
+    cv::gpu::BruteForceMatcher_GPU_base matcher(
+                        cv::gpu::BruteForceMatcher_GPU_base::DistType((normCode -2) / 2));
 
     const int knn = 2;
 
@@ -690,7 +693,8 @@ TEST_P(BruteForceMatcher, KnnMatch_2_Single)
 
 TEST_P(BruteForceMatcher, KnnMatch_3_Single)
 {
-    cv::gpu::BFMatcher_GPU matcher(normCode);
+    cv::gpu::BruteForceMatcher_GPU_base matcher(
+                        cv::gpu::BruteForceMatcher_GPU_base::DistType((normCode -2) / 2));
 
     const int knn = 3;
 
@@ -729,7 +733,8 @@ TEST_P(BruteForceMatcher, KnnMatch_3_Single)
 
 TEST_P(BruteForceMatcher, KnnMatch_2_Collection)
 {
-    cv::gpu::BFMatcher_GPU matcher(normCode);
+    cv::gpu::BruteForceMatcher_GPU_base matcher(
+                        cv::gpu::BruteForceMatcher_GPU_base::DistType((normCode -2) / 2));
 
     const int knn = 2;
 
@@ -791,7 +796,8 @@ TEST_P(BruteForceMatcher, KnnMatch_2_Collection)
 
 TEST_P(BruteForceMatcher, KnnMatch_3_Collection)
 {
-    cv::gpu::BFMatcher_GPU matcher(normCode);
+    cv::gpu::BruteForceMatcher_GPU_base matcher(
+                        cv::gpu::BruteForceMatcher_GPU_base::DistType((normCode -2) / 2));
 
     const int knn = 3;
 
@@ -853,7 +859,8 @@ TEST_P(BruteForceMatcher, KnnMatch_3_Collection)
 
 TEST_P(BruteForceMatcher, RadiusMatch_Single)
 {
-    cv::gpu::BFMatcher_GPU matcher(normCode);
+    cv::gpu::BruteForceMatcher_GPU_base matcher(
+                        cv::gpu::BruteForceMatcher_GPU_base::DistType((normCode -2) / 2));
 
     const float radius = 1.f / countFactor;
 
@@ -902,7 +909,8 @@ TEST_P(BruteForceMatcher, RadiusMatch_Single)
 
 TEST_P(BruteForceMatcher, RadiusMatch_Collection)
 {
-    cv::gpu::BFMatcher_GPU matcher(normCode);
+    cv::gpu::BruteForceMatcher_GPU_base matcher(
+                        cv::gpu::BruteForceMatcher_GPU_base::DistType((normCode -2) / 2));
 
     const int n = 3;
     const float radius = 1.f / countFactor * n;
diff --git a/modules/stitching/src/matchers.cpp b/modules/stitching/src/matchers.cpp
index 6664db0d21..c8dc4ca30d 100644
--- a/modules/stitching/src/matchers.cpp
+++ b/modules/stitching/src/matchers.cpp
@@ -219,7 +219,7 @@ void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat
     descriptors1_.upload(features1.descriptors);
     descriptors2_.upload(features2.descriptors);
 
-    BFMatcher_GPU matcher(NORM_L2);
+    BruteForceMatcher_GPU_base matcher(BruteForceMatcher_GPU_base::L2Dist);
     MatchesSet matches;
 
     // Find 1->2 matches